fix(ci): use AUTO_SYNC_TOKEN for auto-sync main->staging (Class D)

Same shape as molecule-controlplane#29: the per-job GITHUB_TOKEN doesn't have the Gitea API permissions that the auto-sync flow needs to open PRs and push branches. AUTO_SYNC_TOKEN is the devops-engineer persona PAT (per saved memory feedback_per_agent_gitea_identity_default).

Companion prod ops (already done):
- devops-engineer added as collaborator on molecule-core (write)
- devops-engineer added to staging branch protection push_whitelist
- AUTO_SYNC_TOKEN registered as Actions secret on molecule-core
Merge pull request 'chore(ci): retrigger staging CI on new runner image' (#25) from chore/retrigger-staging-on-fixed-runner-image into staging
2026-05-07 07:01:46 -07:00 · 2026-05-07 13:50:16 +00:00 · 2026-05-07 06:48:13 -07:00 · 2026-05-07 12:14:36 +00:00 · 2026-05-07 05:12:06 -07:00 · 2026-05-07 11:46:29 +00:00
106 changed files with 2316 additions and 8225 deletions
@@ -37,7 +37,7 @@ CANONICAL_FILE = Path(".github/workflows/secret-scan.yml")
 CONSUMERS: list[tuple[str, str]] = [
    (
        "molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh",
-        "https://raw.githubusercontent.com/Molecule-AI/molecule-ai-workspace-runtime/main/molecule_runtime/scripts/pre-commit-checks.sh",
+        "https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-runtime/raw/branch/main/molecule_runtime/scripts/pre-commit-checks.sh",
    ),
 ]

@@ -154,71 +154,30 @@ jobs:
            exit 0
          fi

-          # Upstream is publish-workspace-server-image. Check E2E state
-          # for the same SHA via Gitea's commit-status API.
-          #
-          # GitHub-era this was `gh run list --workflow=X --commit=SHA
-          # --json status,conclusion` returning either `[]` (no run on
-          # this SHA) or `[{status, conclusion}]` (the run's state).
-          # Gitea has NO workflow-runs API at all — `/api/v1/repos/.../
-          # actions/runs` returns 404 (verified 2026-05-07, issue #75).
-          # However Gitea Actions DOES emit a commit status per workflow
-          # job, with `context = "<Workflow Name> / <Job Name> (<event>)"`,
-          # which is exactly what we need: each E2E run leg becomes one
-          # status row on the SHA, and the aggregate state encodes the
-          # run's outcome.
-          #
-          # Mapping:
-          #   0 matched contexts          → "none/none"      (E2E paths-
-          #                                                    filtered
-          #                                                    out — same
-          #                                                    semantic
-          #                                                    as before)
-          #   any context = pending       → "in_progress/none" (defer)
-          #   any context = error|failure → "completed/failure" (abort)
-          #   all contexts = success      → "completed/success" (proceed)
-          #
-          # The "completed/cancelled" and "completed/timed_out" buckets
-          # don't have direct Gitea analogs (Gitea statuses are
-          # success / failure / error / pending / warning). Per-SHA
-          # concurrency cancellation surfaces as `error` on Gitea, which
-          # we map to "completed/failure" rather than "completed/cancelled"
-          # — losing the soft-defer semantic of the cancelled bucket on
-          # this fleet. Tradeoff: the staleness alarm (auto-promote-stale-
-          # alarm.yml) still catches a stuck :latest within 4h, and a
-          # legitimate cancel is rare enough that aborting + manual
-          # re-dispatch is acceptable. If we measure cancel frequency
-          # > 1/week, revisit by reading the run-step-summary text via
-          # a follow-up script.
-          #
-          # Network or auth blips collapse to "none/none" via the curl
-          # `|| true` fallback, matching the pre-Gitea behaviour where
-          # an empty list also degenerated to none/none.
-          GITEA_API_URL="${GITHUB_SERVER_URL:-https://git.moleculesai.app}/api/v1"
-          STATUSES_JSON=$(curl --fail-with-body -sS \
-            -H "Authorization: token ${GH_TOKEN}" \
-            -H "Accept: application/json" \
-            "${GITEA_API_URL}/repos/${REPO}/commits/${SHA}/statuses?limit=100" \
-            2>/dev/null || echo "[]")
-          RESULT=$(printf '%s' "$STATUSES_JSON" | jq -r '
-            # Filter to E2E Staging SaaS (full lifecycle) statuses.
-            # Match by leading workflow-name prefix so the "<job>
-            # (<event>)" tail is irrelevant. Gitea emits the workflow
-            # name verbatim from the YAML `name:` field.
-            [.[] | select(.context | startswith("E2E Staging SaaS (full lifecycle) /"))] as $rows
-            | if ($rows | length) == 0 then
-                "none/none"
-              elif any($rows[]; .status == "pending") then
-                "in_progress/none"
-              elif any($rows[]; .status == "failure" or .status == "error") then
-                "completed/failure"
-              elif all($rows[]; .status == "success") then
-                "completed/success"
-              else
-                # Mixed / unknown — fall through to *) bucket below.
-                "completed/" + ($rows[0].status // "unknown")
-              end
-          ' 2>/dev/null || echo "none/none")
+          # Upstream is publish-workspace-server-image. Check E2E state.
+          # The jq filter must defend against TWO empty cases that gh
+          # CLI emits indistinguishably:
+          #   1. gh exits non-zero (network blip, auth issue) → handled
+          #      by the `|| echo "none/none"` fallback below.
+          #   2. gh exits zero but returns `[]` (no E2E run on this
+          #      main SHA — the common case for canvas-only / cmd-only
+          #      / sweep-only changes whose paths don't trigger E2E).
+          #      Without `(.[0] // {})`, jq sees `null` and emits
+          #      "null/none" — which the case statement below has no
+          #      branch for, so it falls into *) → exit 1.
+          # Surfaced 2026-04-30 the first time the App-token chain
+          # (#2389) actually fired auto-promote-on-e2e from a publish
+          # upstream — every prior run was E2E-upstream which
+          # short-circuits before this gate.
+          RESULT=$(gh run list \
+            --repo "$REPO" \
+            --workflow e2e-staging-saas.yml \
+            --branch main \
+            --commit "$SHA" \
+            --limit 1 \
+            --json status,conclusion \
+            --jq '(.[0] // {}) | "\(.status // "none")/\(.conclusion // "none")"' \
+            2>/dev/null || echo "none/none")

          echo "E2E Staging SaaS for ${SHA:0:7}: $RESULT"

@@ -240,13 +199,16 @@ jobs:
              exit 1
              ;;
            completed/cancelled)
-              # GitHub-era only: cancelled ≠ failure. Gitea statuses
-              # don't expose a "cancelled" state — a per-SHA concurrency
-              # cancellation surfaces as `failure` or `error` on Gitea
-              # and is now handled by the failure branch above. This
-              # arm is kept for backwards compatibility / dual-host
-              # operation (if we ever add a non-Gitea fallback) but
-              # under the post-#75 flow it's unreachable.
+              # cancelled ≠ failure. Per-SHA concurrency cancels older E2E
+              # runs when a newer push lands (memory:
+              # feedback_concurrency_group_per_sha) — the newer SHA will
+              # have its own E2E + promote chain. Treat the same as
+              # in_progress: defer without aborting, let the next E2E run
+              # promote when it lands.
+              #
+              # Caught 2026-05-05 02:03 on sha 31f9a5e — auto-promote
+              # blocked the whole chain because this case fell through to
+              # exit 1 instead of clean defer.
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ⏭ Auto-promote deferred — E2E Staging SaaS was cancelled"
@@ -2,148 +2,61 @@ name: Auto-promote staging → main

 # Fires after any of the staging-branch quality gates complete. When ALL
 # required gates are green on the same staging SHA, opens (or re-uses)
-# a PR `staging → main` and schedules Gitea auto-merge so the PR lands
-# automatically once approval + status checks are satisfied.
+# a PR `staging → main` and enables auto-merge so the merge queue lands
+# it. Closes the gap that historically let features sit on staging for
+# weeks waiting for a bulk promotion PR (see molecule-core#1496 for the
+# 1172-commit example).
 #
-# ============================================================
-# What this workflow does
-# ============================================================
+# 2026-04-28 rewrite (PR #142): the previous version did a direct
+# `git merge --ff-only origin staging && git push origin main`. That
+# breaks against main's branch-protection ruleset, which requires
+# status checks "set by the expected GitHub apps" — direct pushes
+# can't satisfy that condition (only PR merges through the queue can).
+# The workflow was failing every tick with:
+#   remote: error: GH006: Protected branch update failed for refs/heads/main.
+#   remote: - Required status checks ... were not set by the expected GitHub apps.
+# Fix: mirror the PR-based pattern from auto-sync-main-to-staging.yml
+# (the reverse-direction sync, fixed in #2234 for the same reason).
+# Both directions now use the same merge-queue path that humans use,
+# no special-case bypass.
 #
-# 1. On a workflow_run completion event for one of the staging gate
-#    workflows (CI, E2E Staging Canvas, E2E API Smoke, CodeQL),
-#    checks if the combined status on the staging head SHA is green.
-# 2. If green, opens (or re-uses) a PR `head: staging → base: main`
-#    via Gitea REST `POST /api/v1/repos/.../pulls`.
-# 3. Schedules auto-merge via `POST /api/v1/repos/.../pulls/{index}/merge`
-#    with `merge_when_checks_succeed: true`. Gitea waits for the
-#    approval requirement on `main` (`required_approvals: 1`) and
-#    the status-check gates, then merges.
-# 4. The merge commit lands on `main` and fires
-#    `publish-workspace-server-image.yml` naturally via its
-#    `on: push: branches: [main]` trigger — no explicit dispatch
-#    needed (see "Why no workflow_dispatch tail" below).
+# Safety model:
+# - Runs ONLY on workflow_run events for the staging branch.
+# - Requires EVERY named gate workflow to have the same head_sha and
+#   all be `conclusion == success`. If any of them is red, skipped,
+#   cancelled, or pending, we abort (stay on the current main).
+# - The PR base=main head=staging path lets GitHub itself enforce
+#   branch protection. If main has diverged from staging or required
+#   checks aren't satisfied, the merge queue declines the PR — no
+#   need for a manual ff-only ancestry check here.
+# - Loop safety: the auto-sync-main-to-staging workflow fires when
+#   main lands the auto-promote PR, but its merge into staging is by
+#   GITHUB_TOKEN which doesn't trigger downstream workflow_run events
+#   (GitHub Actions safety). So this workflow doesn't re-fire from
+#   its own promote landing.
 #
-# `auto-sync-main-to-staging.yml` is the reverse-direction
-# counterpart (main → staging, fast-forward push). Together they
-# keep the staging-superset-of-main invariant tight.
+# Toggle via repo variable AUTO_PROMOTE_ENABLED (true/unset). When
+# unset, the workflow logs what it would have done but doesn't open
+# the PR — useful for dry-running the gate logic without surfacing
+# a noisy PR while staging CI is still flaky.
 #
-# ============================================================
-# Why Gitea REST (and not `gh pr create`)
-# ============================================================
+# **One-time repo setting (load-bearing):** this workflow opens the
+# staging→main PR via `gh pr create` using the default GITHUB_TOKEN.
+# Since GitHub's 2022 default change, that token cannot create or
+# approve PRs unless the repo opts in. The toggle is at:
 #
-# Pre-2026-05-06 this workflow used `gh pr create`, `gh pr merge --auto`,
-# `gh run list`, and `gh workflow run` against GitHub. After the
-# GitHub→Gitea cutover those calls fail because:
+#   Settings → Actions → General → Workflow permissions
+#   → ✅ Allow GitHub Actions to create and approve pull requests
 #
-#   - `gh pr create / merge / view / list` route to GitHub GraphQL
-#     (`/api/graphql`). Gitea does not expose a GraphQL endpoint;
-#     every call returns `HTTP 405 Method Not Allowed` — same root
-#     cause as #65 (auto-sync) which PR #66 fixed by dropping `gh`
-#     entirely.
-#   - `gh run list --workflow=...` GitHub-shape; Gitea has the
-#     simpler `GET /repos/.../commits/{ref}/status` combined-status
-#     endpoint instead.
-#   - `gh workflow run X.yml` calls `POST /repos/.../actions/workflows/{id}/dispatches`,
-#     which does NOT exist on Gitea 1.22.6 (verified via swagger.v1.json).
+# Without it, every workflow_run fails with:
 #
-# So this workflow uses direct `curl` calls to Gitea REST. No `gh`
-# CLI dependency, no GraphQL, no missing-endpoint footgun.
+#   pull request create failed: GraphQL: GitHub Actions is not
+#   permitted to create or approve pull requests (createPullRequest)
 #
-# ============================================================
-# Why no workflow_dispatch tail (was load-bearing on GitHub, dead on Gitea)
-# ============================================================
-#
-# The GitHub-era version had a 60-line polling step that waited for
-# the promote PR to merge, then explicitly dispatched
-# `publish-workspace-server-image.yml` on `--ref main`. That step
-# existed because GitHub's GITHUB_TOKEN-initiated merges suppress
-# downstream `on: push` workflows (the documented "no recursion" rule
-# — https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow).
-# The explicit dispatch was the workaround.
-#
-# Gitea Actions does NOT have this no-recursion rule. PR #66's auto-
-# sync merge to main fired `auto-promote-staging` on the next push
-# trigger naturally. So the cascade fires on the natural push event;
-# the explicit dispatch is dead code. (And even if we wanted to
-# preserve it, Gitea has no `workflow_dispatch` REST endpoint.)
-#
-# Removed in this rewrite. If we ever observe the cascade misfire,
-# operator can push an empty commit to `main` to wake it.
-#
-# ============================================================
-# Why open a PR (and not direct push)
-# ============================================================
-#
-# `main` branch protection has `enable_push: false` with NO
-# `push_whitelist_usernames`. Direct push is impossible for any
-# persona, including admins. PR-mediated merge is the only path,
-# which is intentional: prod state mutations (and staging→main IS a
-# prod mutation, since the next deploy fans out to tenants) require
-# Hongming's approval per `feedback_prod_apply_needs_hongming_chat_go`.
-#
-# The auto-merge schedule preserves this gate: `merge_when_checks_succeed`
-# does NOT bypass `required_approvals: 1`. Gitea waits for BOTH
-# approval AND green checks before merging. Hongming reviews via the
-# canvas/chat-handle of the PR notification, approves, and Gitea
-# auto-merges within seconds.
-#
-# ============================================================
-# Identity + token (anti-bot-ring per saved-memory
-# `feedback_per_agent_gitea_identity_default`)
-# ============================================================
-#
-# This workflow uses `secrets.AUTO_SYNC_TOKEN` — a personal access
-# token issued to the `devops-engineer` Gitea persona. NOT the
-# founder PAT. The bot-ring fingerprint that triggered the GitHub
-# org suspension on 2026-05-06 was characterised by founder PAT
-# acting as CI at machine speed.
-#
-# Token scope: `push: true` (read+write) on this repo. The persona
-# can: open PRs, comment on PRs, schedule auto-merge. The persona
-# CANNOT bypass main's branch protection (`required_approvals: 1`
-# still applies — only Hongming's review unblocks merge).
-#
-# Authorship: the PR is opened by `devops-engineer`; the merge
-# commit credits Hongming-as-approver and `devops-engineer` as
-# the merger.
-#
-# ============================================================
-# Failure modes & operational notes
-# ============================================================
-#
-# A — staging gates not all green at trigger time:
-#     - The combined-status check returns `state: pending|failure`.
-#       Workflow exits 0 with a step-summary "not all green; staying
-#       on current main". Re-fires on the next gate completion.
-#
-# B — Gitea PR-create returns non-201 (e.g. 422 already-exists):
-#     - Idempotent: the workflow first GETs the existing open
-#       staging→main PR. If found, reuse it; if not, POST a new one.
-#       422 should never surface; if it does (race), step summary
-#       captures the body and the next workflow_run picks up.
-#
-# C — `merge_when_checks_succeed` schedule fails:
-#     - 422 with "Pull request is not mergeable" if there are
-#       conflicts or stale base. Step summary surfaces it; operator
-#       (or `auto-sync-main-to-staging`) needs to bring staging up
-#       to date with main first. Workflow exits 1 to surface red.
-#
-# D — `AUTO_SYNC_TOKEN` rotated / wrong scope:
-#     - 401/403 on first REST call. Step summary surfaces it.
-#       Re-issue the token from `~/.molecule-ai/personas/` on the
-#       operator host and update the repo Actions secret.
-#
-# ============================================================
-# Loop safety
-# ============================================================
-#
-# When the promote PR merges to main, `auto-sync-main-to-staging.yml`
-# fires (on:push:main) and pushes the merge commit back to staging.
-# That push to staging is by `devops-engineer`, NOT this workflow's
-# token, and triggers the staging gate workflows. When they all
-# complete, we end up back here — but the tree-diff guard catches
-# it: staging tree == main tree (the merge commit changes nothing),
-# so we skip and the cycle terminates.
+# Observed 2026-04-29 01:43 UTC blocking promotion of fcd87b9 (PRs
+# #2248 + #2249); manually bridged via PR #2252. Re-check this
+# setting if auto-promote starts failing with createPullRequest
+# errors after a repo or org admin change.

 on:
  workflow_run:
@@ -161,16 +74,26 @@ on:
        default: "false"

 permissions:
-  contents: read
+  contents: write
  pull-requests: write
+  # actions: write is needed by the post-merge dispatch tail step
+  # (#2358 / #2357) — `gh workflow run publish-workspace-server-image.yml`
+  # POSTs to /actions/workflows/.../dispatches which requires this scope.
+  # Without it the call 403s and the publish/canary/redeploy chain still
+  # doesn't run on staging→main promotions, undoing #2358.
+  actions: write

 # Serialize auto-promote runs. Multiple staging gate completions can land
 # in quick succession (CI + E2E + CodeQL all finish within seconds of
 # each other on a green PR) — without this, two parallel runs both:
-#   1. Would race the GET-or-POST PR step.
-#   2. Would both call merge-schedule (idempotent — fine on Gitea).
-# cancel-in-progress: false because the second run on a fresh staging
-# tip should NOT kill the first which has already opened the PR.
+#   1. Open / re-use the same promote PR.
+#   2. Both call `gh pr merge --auto` (idempotent — fine).
+#   3. Both poll for the same mergedAt and both `gh workflow run` publish
+#      → 2× redundant publish builds racing for the same `:staging-latest`
+#      retag, and 2× canary-verify chains.
+# cancel-in-progress: false because we don't want a brand-new run to kill
+# a polling-tail that's about to dispatch — the polling tail's 30 min cap
+# is the right backstop, not workflow-level cancel.
 concurrency:
  group: auto-promote-staging
  cancel-in-progress: false
@@ -188,112 +111,126 @@ jobs:
      all_green: ${{ steps.gates.outputs.all_green }}
      head_sha: ${{ steps.gates.outputs.head_sha }}
    steps:
-      # Skip empty-tree promotes (the perpetual auto-promote↔auto-sync
-      # cycle observed pre-cutover on GitHub). On Gitea the cycle shape
-      # is different (auto-sync uses fast-forward, no merge commit),
-      # but the tree-diff guard is cheap insurance and protects against
-      # any future merge-style regression.
+      # Skip empty-tree promotes (the perpetual auto-promote↔auto-sync cycle
+      # observed 2026-05-03). Sequence: auto-promote merges via the staging
+      # merge-queue's MERGE strategy, creating a merge commit on main that
+      # staging doesn't have. auto-sync then merges main back into staging
+      # via another merge commit (the queue's MERGE strategy applies on
+      # the staging side too, even when the workflow's local FF would
+      # have sufficed). Now staging has a new merge-commit SHA whose
+      # tree == main's tree — but auto-promote sees "staging ahead of
+      # main by 1" and opens YET another empty promote PR. Each round
+      # costs ~30-40 min wallclock, ~2 manual approvals, and burns a
+      # full CodeQL Go run (~15 min). Without this guard the cycle
+      # repeats indefinitely.
+      #
+      # Long-term fix is to switch the merge_queue ruleset's
+      # `merge_method` away from MERGE so FF-able PRs land cleanly,
+      # but that's a broader change affecting every staging PR's
+      # commit shape. This guard is the one-line surgical fix that
+      # breaks the cycle without touching merge-queue config.
+      #
+      # Fail-open: if `git diff` errors for any reason, fall through
+      # to the gate check (preserve existing behavior). Only skip
+      # when the diff is DEFINITIVELY empty.
      - name: Checkout for tree-diff check
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          ref: staging
-
-      - name: Skip if staging tree == main tree (cycle-break safety)
+      - name: Skip if staging tree == main tree (perpetual-cycle break)
        id: tree-diff
        env:
          HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
        run: |
          set -eu
          git fetch origin main --depth=50 || { echo "::warning::git fetch main failed — proceeding (fail-open)"; exit 0; }
+          # Compare staging tip's tree against main's tree. `git diff
+          # --quiet` exits 0 if no differences, 1 if there are.
          if git diff --quiet origin/main "$HEAD_SHA" -- 2>/dev/null; then
            {
-              echo "## Skipped — no code to promote"
+              echo "## ⏭ Skipped — no code to promote"
              echo
              echo "staging tip (\`${HEAD_SHA:0:8}\`) and \`main\` have identical trees."
-              echo "Skipping to avoid opening an empty promote PR."
+              echo "This is the auto-promote↔auto-sync merge-commit cycle: staging has a"
+              echo "new SHA (a sync-back merge commit) but the underlying file tree is"
+              echo "already on main, so there's no real code to ship."
+              echo
+              echo "Skipping to avoid opening an empty promote PR. Cycle terminates here."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "::notice::auto-promote: staging tree == main tree — no code to promote, skipping"
            echo "skip=true" >> "$GITHUB_OUTPUT"
          else
            echo "skip=false" >> "$GITHUB_OUTPUT"
          fi
-
-      - name: Check combined status on staging head
+      - name: Check all required gates on this SHA
        if: steps.tree-diff.outputs.skip != 'true'
        id: gates
        env:
-          GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
          REPO: ${{ github.repository }}
-          GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
        run: |
          set -euo pipefail

-          # Gitea-native combined-status endpoint aggregates every
-          # check context attached to a SHA. This is structurally
-          # cleaner than the GitHub-era per-workflow `gh run list`
-          # loop because:
+          # Required gate workflow files. Use file paths (relative to
+          # .github/workflows/) rather than display names because:
          #
-          #   1. There's no risk of "workflow name collision" (the
-          #      GitHub-era code had to switch from `--workflow=NAME`
-          #      to `--workflow=FILE.YML` to disambiguate "CodeQL"
-          #      between the explicit workflow and GitHub's UI-
-          #      configured default setup; Gitea has no such
-          #      duplicate-name surface).
-          #   2. Gitea's combined state already encodes the AND
-          #      across all contexts: success only if EVERY context
-          #      is success. Pending or failure on any context
-          #      produces non-success state.
+          #   1. `gh run list --workflow=<name>` is ambiguous when two
+          #      workflows have the same `name:` — observed 2026-04-28
+          #      with "CodeQL" matching both `codeql.yml` (explicit) and
+          #      GitHub's UI-configured Code-quality default setup
+          #      (internal "codeql"). gh CLI returns "could not resolve
+          #      to a unique workflow" → empty result → gate evaluated
+          #      as missing/none → auto-promote dead-locked despite all
+          #      checks actually passing.
          #
-          # See https://docs.gitea.com/api/1.22 for the schema —
-          # `state` is one of: success, pending, failure, error.
+          #   2. File paths are the unique identifier for workflows;
+          #      `name:` is just a display string and can collide.
+          #
+          # When adding/removing a gate, update this list AND the
+          # branch-protection required-checks list (which uses check-run
+          # display names, not workflow names; the two are decoupled and
+          # should be kept in sync manually).
+          GATES=(
+            "ci.yml"
+            "e2e-staging-canvas.yml"
+            "e2e-api.yml"
+            "codeql.yml"
+          )

          echo "head_sha=${HEAD_SHA}" >> "$GITHUB_OUTPUT"
-          echo "Checking combined status on SHA ${HEAD_SHA}"
+          echo "Checking gates on SHA ${HEAD_SHA}"

-          # `set +o pipefail` for the http-code capture pattern; restore
-          # immediately. Pattern hardened per `feedback_curl_status_capture_pollution`.
-          BODY_FILE=$(mktemp)
-          set +e
-          STATUS=$(curl -sS \
-            -H "Authorization: token ${GITEA_TOKEN}" \
-            -H "Accept: application/json" \
-            -o "${BODY_FILE}" \
-            -w "%{http_code}" \
-            "${GITEA_HOST}/api/v1/repos/${REPO}/commits/${HEAD_SHA}/status")
-          CURL_RC=$?
-          set -e
+          ALL_GREEN=true
+          for gate in "${GATES[@]}"; do
+            # Query the most recent run of this workflow on this SHA.
+            # event=push to avoid picking up PR runs. branch=staging to
+            # guard against someone dispatching the gate on a non-staging
+            # branch at the same SHA.
+            RESULT=$(gh run list \
+              --repo "$REPO" \
+              --workflow "$gate" \
+              --branch staging \
+              --event push \
+              --commit "$HEAD_SHA" \
+              --limit 1 \
+              --json status,conclusion \
+              --jq '.[0] | "\(.status)/\(.conclusion // "none")"' \
+              2>/dev/null || echo "missing/none")

-          if [ "${CURL_RC}" -ne 0 ] || [ "${STATUS}" != "200" ]; then
-            echo "::error::combined-status fetch failed: curl=${CURL_RC} http=${STATUS}"
-            cat "${BODY_FILE}" | head -c 500 || true
-            rm -f "${BODY_FILE}"
-            echo "all_green=false" >> "$GITHUB_OUTPUT"
-            exit 0
-          fi
+            echo "  $gate → $RESULT"

-          STATE=$(jq -r '.state // "missing"' < "${BODY_FILE}")
-          TOTAL=$(jq -r '.total_count // 0' < "${BODY_FILE}")
-          rm -f "${BODY_FILE}"
+            # Only completed/success counts. completed/failure or
+            # in_progress/anything or no record at all = abort.
+            if [ "$RESULT" != "completed/success" ]; then
+              ALL_GREEN=false
+            fi
+          done

-          echo "Combined status: state=${STATE} total_count=${TOTAL}"
-
-          if [ "${STATE}" = "success" ] && [ "${TOTAL}" -gt 0 ]; then
-            echo "all_green=true" >> "$GITHUB_OUTPUT"
-            echo "::notice::All gates green on ${HEAD_SHA} (${TOTAL} contexts)"
-          else
-            echo "all_green=false" >> "$GITHUB_OUTPUT"
-            {
-              echo "## Not promoting — combined status not green"
-              echo
-              echo "- SHA: \`${HEAD_SHA:0:8}\`"
-              echo "- Combined state: \`${STATE}\`"
-              echo "- Context count: ${TOTAL}"
-              echo
-              echo "Will re-fire on the next gate completion. Investigate any red gate via the Actions UI."
-            } >> "$GITHUB_STEP_SUMMARY"
-            echo "::notice::auto-promote: combined status is ${STATE} on ${HEAD_SHA} — staying on current main"
+          echo "all_green=${ALL_GREEN}" >> "$GITHUB_OUTPUT"
+          if [ "$ALL_GREEN" != "true" ]; then
+            echo "::notice::auto-promote: not all gates are green on ${HEAD_SHA} — staying on current main"
          fi

  promote:
@@ -310,183 +247,188 @@ jobs:
          # Repo variable AUTO_PROMOTE_ENABLED=true flips this on. While
          # it's unset, the workflow dry-runs (logs what it would have
          # done) but doesn't open the promote PR. Set the variable in
-          # Settings → Actions → Variables.
+          # Settings → Secrets and variables → Actions → Variables.
          if [ "${AUTO_PROMOTE_ENABLED:-}" != "true" ] && [ "${FORCE_INPUT:-false}" != "true" ]; then
            {
-              echo "## Auto-promote disabled"
+              echo "## ⏸ Auto-promote disabled"
              echo
              echo "Repo variable \`AUTO_PROMOTE_ENABLED\` is not set to \`true\`."
              echo "All gates are green on staging; would have opened a promote PR to \`main\`."
              echo
-              echo "To enable: Settings → Actions → Variables → \`AUTO_PROMOTE_ENABLED=true\`."
+              echo "To enable: Settings → Secrets and variables → Actions → Variables → \`AUTO_PROMOTE_ENABLED=true\`."
              echo "To test once manually: workflow_dispatch with \`force=true\`."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "::notice::auto-promote disabled — dry run only"
            exit 0
          fi

-      - name: Open or reuse promote PR + schedule auto-merge
+      # Mint the App token BEFORE the promote-PR step so the auto-merge
+      # call can use it. GITHUB_TOKEN-initiated merges suppress the
+      # downstream `push` event on main, breaking the
+      # publish-workspace-server-image → canary-verify → redeploy-tenants
+      # chain (issue #2357). Using the App token here means the
+      # merge-queue-landed merge IS able to fire the cascade naturally;
+      # the polling tail below stays as defense-in-depth.
+      - name: Mint App token for promote-PR + downstream dispatch
+        if: ${{ vars.AUTO_PROMOTE_ENABLED == 'true' || github.event.inputs.force == 'true' }}
+        id: app-token
+        uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v3.1.1
+        with:
+          app-id: ${{ secrets.MOLECULE_AI_APP_ID }}
+          private-key: ${{ secrets.MOLECULE_AI_APP_PRIVATE_KEY }}
+
+      - name: Open (or reuse) staging → main promote PR + enable auto-merge
        if: ${{ vars.AUTO_PROMOTE_ENABLED == 'true' || github.event.inputs.force == 'true' }}
        env:
-          GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
+          GH_TOKEN: ${{ steps.app-token.outputs.token }}
          REPO: ${{ github.repository }}
          TARGET_SHA: ${{ needs.check-all-gates-green.outputs.head_sha }}
-          GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
        run: |
          set -euo pipefail

-          API="${GITEA_HOST}/api/v1/repos/${REPO}"
-          AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json")
+          # Look for an existing open promote PR (idempotent on re-run
+          # of the workflow). The PR's head IS the staging branch — the
+          # whole point is "advance main to staging's tip", so we don't
+          # need a per-SHA branch like auto-sync-main-to-staging uses.
+          PR_NUM=$(gh pr list --repo "$REPO" \
+            --base main --head staging --state open \
+            --json number --jq '.[0].number // ""')

-          # http_get BODY_FILE URL
-          # Writes the response body to BODY_FILE and echoes the HTTP
-          # status code on stdout; returns 99 if curl itself fails. Curl
-          # status capture pattern per `feedback_curl_status_capture_pollution`:
-          # http_code is captured via -w, body goes to its own tempfile,
-          # set +e/-e bracket protects pipeline state.
-          http_get() {
-            local body_file="$1"; shift
-            local url="$1"; shift
-            set +e
-            local code
-            code=$(curl -sS "${AUTH[@]}" -o "${body_file}" -w "%{http_code}" "${url}")
-            local rc=$?
-            set -e
-            if [ "${rc}" -ne 0 ]; then
-              echo "::error::curl GET failed (rc=${rc}) on ${url}"
-              return 99
-            fi
-            echo "${code}"
-          }
-          http_post_json() {
-            local body_file="$1"; shift
-            local data="$1"; shift
-            local url="$1"; shift
-            set +e
-            local code
-            code=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
-              -X POST -d "${data}" -o "${body_file}" -w "%{http_code}" "${url}")
-            local rc=$?
-            set -e
-            if [ "${rc}" -ne 0 ]; then
-              echo "::error::curl POST failed (rc=${rc}) on ${url}"
-              return 99
-            fi
-            echo "${code}"
-          }
-
-          # Step 1: look for an existing open staging→main promote PR
-          # (idempotent on workflow re-run). Gitea doesn't have a
-          # head/base filter on the list endpoint that's as ergonomic
-          # as gh's, but the dedicated `/pulls/{base}/{head}` lookup
-          # works.
-          BODY=$(mktemp)
-          STATUS=$(http_get "${BODY}" "${API}/pulls/main/staging") || true
-
-          PR_NUM=""
-          if [ "${STATUS}" = "200" ]; then
-            STATE=$(jq -r '.state // "missing"' < "${BODY}")
-            if [ "${STATE}" = "open" ]; then
-              PR_NUM=$(jq -r '.number // ""' < "${BODY}")
-              echo "::notice::Re-using existing open promote PR #${PR_NUM}"
-            fi
-          fi
-          rm -f "${BODY}"
-
-          # Step 2: if no open PR, create one.
-          if [ -z "${PR_NUM}" ]; then
+          if [ -z "$PR_NUM" ]; then
            TITLE="staging → main: auto-promote ${TARGET_SHA:0:7}"
-            BODY_TEXT=$(cat <<EOFBODY
-          Automated promotion of \`staging\` (\`${TARGET_SHA:0:8}\`) to \`main\`. All required staging gates are green at this SHA (combined status reported success).
+            BODY_FILE=$(mktemp)
+            cat > "$BODY_FILE" <<EOFBODY
+          Automated promotion of \`staging\` (\`${TARGET_SHA:0:8}\`) to \`main\`. All required staging gates green at this SHA: CI, E2E Staging Canvas, E2E API Smoke, CodeQL.

-          This PR is auto-generated by \`.github/workflows/auto-promote-staging.yml\` whenever every required gate completes green on the same staging SHA.
+          This PR is auto-generated by \`.github/workflows/auto-promote-staging.yml\` whenever every required gate completes green on the same staging SHA. It exists because main's branch protection requires status checks "set by the expected GitHub apps" — direct \`git push\` from a workflow can't satisfy that, only PR merges through the queue can.

-          **Approval gate:** \`main\` branch protection requires 1 approval before this can land. Once approved, Gitea will auto-merge (the workflow scheduled \`merge_when_checks_succeed: true\` immediately after open).
-
-          The reverse-direction sync (the merge commit on \`main\` → \`staging\`) is handled automatically by \`auto-sync-main-to-staging.yml\` after this PR lands.
-
-          ---
-          - Source: staging at \`${TARGET_SHA}\`
-          - Opened by: \`devops-engineer\` persona (anti-bot-ring; never founder PAT)
-          - Refs: #65, #73, #195
+          Merge queue lands this; no human action needed unless gates fail. Reverse-direction sync (the merge commit on main → staging) is handled by \`auto-sync-main-to-staging.yml\`.
          EOFBODY
-          )
-            REQ=$(jq -n \
-              --arg title "${TITLE}" \
-              --arg body "${BODY_TEXT}" \
-              --arg base "main" \
-              --arg head "staging" \
-              '{title:$title, body:$body, base:$base, head:$head}')
-
-            BODY=$(mktemp)
-            STATUS=$(http_post_json "${BODY}" "${REQ}" "${API}/pulls")
-
-            if [ "${STATUS}" = "201" ]; then
-              PR_NUM=$(jq -r '.number // ""' < "${BODY}")
-              echo "::notice::Opened promote PR #${PR_NUM}"
-            else
-              echo "::error::Failed to create promote PR: HTTP ${STATUS}"
-              jq -r '.message // .' < "${BODY}" | head -c 500
-              rm -f "${BODY}"
-              exit 1
-            fi
-            rm -f "${BODY}"
+            PR_URL=$(gh pr create --repo "$REPO" \
+              --base main --head staging \
+              --title "$TITLE" \
+              --body-file "$BODY_FILE")
+            PR_NUM=$(echo "$PR_URL" | grep -oE '[0-9]+$' | tail -1)
+            rm -f "$BODY_FILE"
+            echo "::notice::Opened PR #${PR_NUM}"
+          else
+            echo "::notice::Re-using existing promote PR #${PR_NUM}"
          fi

-          # Step 3: schedule auto-merge. merge_when_checks_succeed
-          # tells Gitea to wait for both:
-          #   - all required status checks to pass
-          #   - the required-approvals gate (1 approval on main)
-          # before merging. On approval+green, Gitea merges within
-          # seconds. On any check failing or approval being denied,
-          # the schedule stays armed but doesn't fire.
-          #
-          # Idempotent: re-arming on an already-armed PR is a no-op.
-          REQ=$(jq -n '{Do:"merge", merge_when_checks_succeed:true}')
-          BODY=$(mktemp)
-          STATUS=$(http_post_json "${BODY}" "${REQ}" "${API}/pulls/${PR_NUM}/merge")
-
-          # Gitea returns:
-          #   - 200/204 on successful immediate merge (gates already green AND approved)
-          #   - 405 "Please try again later" when scheduled successfully but waiting
-          #   - 422 on "Pull request is not mergeable" (conflict, stale base, etc.)
-          #
-          # 405 here is benign — Gitea's way of saying "scheduled, not merging now".
-          # We treat 200/204/405 as success, anything else as failure.
-          case "${STATUS}" in
-            200|204)
-              MERGE_OUTCOME="merged-immediately"
-              echo "::notice::Promote PR #${PR_NUM} merged immediately (gates+approval already green)"
-              ;;
-            405)
-              MERGE_OUTCOME="auto-merge-scheduled"
-              echo "::notice::Promote PR #${PR_NUM}: auto-merge scheduled (Gitea will land on approval+green)"
-              ;;
-            422)
-              MERGE_OUTCOME="not-mergeable"
-              echo "::warning::Promote PR #${PR_NUM}: not mergeable (conflict, stale base, or already merging)."
-              jq -r '.message // .' < "${BODY}" | head -c 500
-              ;;
-            *)
-              echo "::error::Unexpected status ${STATUS} on merge schedule"
-              jq -r '.message // .' < "${BODY}" | head -c 500
-              rm -f "${BODY}"
-              exit 1
-              ;;
-          esac
-          rm -f "${BODY}"
+          # Enable auto-merge — the merge queue picks it up once
+          # required gates are green on the merge_group ref.
+          if ! gh pr merge "$PR_NUM" --repo "$REPO" --auto --merge 2>&1; then
+            echo "::warning::Failed to enable auto-merge on PR #${PR_NUM} — operator may need to merge manually."
+          fi

          {
-            echo "## Auto-promote PR opened"
+            echo "## ✅ Auto-promote PR opened"
            echo
            echo "- Source: staging at \`${TARGET_SHA:0:8}\`"
            echo "- PR: #${PR_NUM}"
-            echo "- Outcome: \`${MERGE_OUTCOME}\`"
            echo
-            if [ "${MERGE_OUTCOME}" = "auto-merge-scheduled" ]; then
-              echo "Gitea will auto-merge once Hongming approves and all checks are green. No human action needed beyond approval."
-            elif [ "${MERGE_OUTCOME}" = "merged-immediately" ]; then
-              echo "Merged immediately. \`publish-workspace-server-image.yml\` will fire naturally on the resulting \`main\` push."
-            else
-              echo "PR is not auto-merging. Operator may need to bring staging up to date with main, then re-trigger this workflow via workflow_dispatch."
-            fi
+            echo "Merge queue lands the PR once required gates are green; no human action needed unless gates fail."
          } >> "$GITHUB_STEP_SUMMARY"
+
+          # Hand the PR number to the next step so we can dispatch the
+          # tenant-redeploy chain after the merge queue lands the merge.
+          echo "promote_pr_num=${PR_NUM}" >> "$GITHUB_OUTPUT"
+        id: promote_pr
+
+      # The App token minted above (before the promote-PR step) is
+      # also used by the polling tail below. Defense-in-depth: with
+      # the merge-queue-landed merge now using the App token, the
+      # main-branch push event SHOULD fire the publish/canary/redeploy
+      # cascade naturally — but if for any reason it doesn't (e.g. an
+      # unrelated event-suppression edge case), the explicit dispatches
+      # below still wake the chain.
+      - name: Wait for promote merge, then dispatch publish + redeploy (#2357)
+        # Defense-in-depth dispatch. With the auto-merge call above
+        # now using the App token (this commit), the merge-queue-landed
+        # merge SHOULD fire publish-workspace-server-image naturally
+        # via on:push:[main] — App-token-initiated pushes DO trigger
+        # workflow_run cascades, unlike GITHUB_TOKEN-initiated ones
+        # (the documented "no recursion" rule —
+        # https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow).
+        #
+        # This explicit dispatch stays as belt-and-suspenders for any
+        # edge case where the natural cascade misfires. If the natural
+        # cascade has already fired after this token swap (i.e. the
+        # publish workflow has already started by the time we get here),
+        # this second dispatch is a harmless no-op (publish-workspace-server-image
+        # has its own concurrency group that dedupes).
+        #
+        # See PR for #2357: pre-fix the merge action was via
+        # GITHUB_TOKEN, suppressing the cascade and forcing this tail
+        # to be the SOLE chain trigger. With the auto-merge token swap
+        # the tail becomes redundant in the happy path; keep until
+        # we've observed >=10 successful natural cascades, then drop.
+        if: steps.promote_pr.outputs.promote_pr_num != ''
+        env:
+          GH_TOKEN: ${{ steps.app-token.outputs.token }}
+          REPO: ${{ github.repository }}
+          PR_NUM: ${{ steps.promote_pr.outputs.promote_pr_num }}
+        run: |
+          # Poll for merge — max 30 min (60 × 30s). The merge queue
+          # typically lands within 5-10 min when gates are green. Break
+          # early if the PR is closed without merging (operator action,
+          # gates flipped red post-approval, branch-protection rejection)
+          # so we don't tie up a runner for the full 30 min on a dead PR.
+          MERGED=""
+          STATE=""
+          for _ in $(seq 1 60); do
+            VIEW=$(gh pr view "$PR_NUM" --repo "$REPO" --json mergedAt,state)
+            MERGED=$(echo "$VIEW" | jq -r '.mergedAt // ""')
+            STATE=$(echo "$VIEW" | jq -r '.state // ""')
+            if [ -n "$MERGED" ] && [ "$MERGED" != "null" ]; then
+              echo "::notice::Promote PR #${PR_NUM} merged at ${MERGED}"
+              break
+            fi
+            if [ "$STATE" = "CLOSED" ]; then
+              echo "::warning::Promote PR #${PR_NUM} was closed without merging — skipping deploy dispatch."
+              exit 0
+            fi
+            sleep 30
+          done
+
+          if [ -z "$MERGED" ] || [ "$MERGED" = "null" ]; then
+            echo "::warning::Promote PR #${PR_NUM} didn't merge within 30min — skipping deploy dispatch (manually run \`gh workflow run publish-workspace-server-image.yml --ref main\` once it lands)."
+            exit 0
+          fi
+
+          # Dispatch publish on main using the App token. App-initiated
+          # workflow_dispatch DOES propagate the workflow_run cascade,
+          # unlike GITHUB_TOKEN-initiated dispatch.
+          # publish completes → canary-verify chains via workflow_run →
+          # redeploy-tenants-on-main chains via workflow_run + branches:[main].
+          if gh workflow run publish-workspace-server-image.yml \
+              --repo "$REPO" --ref main 2>&1; then
+            echo "::notice::Dispatched publish-workspace-server-image on ref=main as molecule-ai App — canary-verify and redeploy-tenants-on-main will chain via workflow_run."
+            {
+              echo "## 🚀 Tenant redeploy chain dispatched"
+              echo
+              echo "- publish-workspace-server-image (workflow_dispatch on \`main\`, actor: \`molecule-ai[bot]\`)"
+              echo "- canary-verify will chain on completion"
+              echo "- redeploy-tenants-on-main will chain on canary green"
+            } >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "::error::Failed to dispatch publish-workspace-server-image. Run manually: gh workflow run publish-workspace-server-image.yml --ref main"
+          fi
+
+          # ALSO dispatch auto-sync-main-to-staging.yml. Same root cause as
+          # publish above (issue #2357): a merge-queue-initiated push to
+          # main made with GITHUB_TOKEN fires no `on: push` triggers downstream.
+          # Without this dispatch, every staging→main promote leaves staging
+          # one merge commit BEHIND main, which silently dead-locks the NEXT
+          # promote PR as `mergeStateStatus: BEHIND` because main's
+          # branch-protection has `strict: true`. Verified empirically on
+          # 2026-05-02 against PR #2442 (Phase 2 promote): only the explicit
+          # publish-workspace-server-image dispatch fired on the previous
+          # promote SHA 76c604fb, while auto-sync silently no-op'd, leaving
+          # staging behind for ~24h until manually bridged.
+          if gh workflow run auto-sync-main-to-staging.yml \
+              --repo "$REPO" --ref main 2>&1; then
+            echo "::notice::Dispatched auto-sync-main-to-staging on ref=main as molecule-ai App — staging will absorb the new main merge commit via PR + merge queue."
+          else
+            echo "::error::Failed to dispatch auto-sync-main-to-staging. Run manually: gh workflow run auto-sync-main-to-staging.yml --ref main"
+          fi
@@ -1,404 +0,0 @@
-name: Auto-sync canary — AUTO_SYNC_TOKEN rotation drift
-
-# Synthetic health check for the AUTO_SYNC_TOKEN secret consumed by
-# auto-sync-main-to-staging.yml (PR #66) and publish-workspace-server-image.yml.
-#
-# ============================================================
-# Why this workflow exists
-# ============================================================
-#
-# PR #66 fixed auto-sync (replaced GitHub-era `gh pr create` — which
-# 405s on Gitea's GraphQL endpoint — with a direct git push from the
-# `devops-engineer` persona's `AUTO_SYNC_TOKEN`). Hostile self-review
-# weakest spot #3 of that PR:
-#
-#   "Token rotation silently breaks auto-sync. If AUTO_SYNC_TOKEN is
-#    rotated without updating the repo secret, every push to main
-#    fails red on the auto-sync push step. The workflow surfaces the
-#    failure mode in the step summary (failure mode B in the header),
-#    but there's no proactive monitoring."
-#
-# Detection latency under the status quo: rotation is only caught on
-# the next push to `main`. During quiet periods (no main push for
-# many hours) the staging-superset-of-main invariant silently breaks.
-#
-# This workflow closes the gap: every 6 hours, it fires the auth
-# surface that auto-sync depends on and emits a red workflow status
-# if AUTO_SYNC_TOKEN has drifted out of validity.
-#
-# ============================================================
-# What this checks (Option B — read-only verify)
-# ============================================================
-#
-# 1. `GET /api/v1/user` against Gitea with the token → validates the
-#    token authenticates AND resolves to `devops-engineer` (catches
-#    the case where the token was regenerated under a different
-#    persona by mistake).
-# 2. `GET /api/v1/repos/molecule-ai/molecule-core` with the token →
-#    validates the token has `read:repository` scope on this repo
-#    (the v2 scope contract — see saved memory
-#    `reference_persona_token_v2_scope`).
-# 3. `git push --dry-run` of the current staging SHA back to
-#    `refs/heads/staging` via `https://oauth2:<token>@<gitea>/...`
-#    → validates the EXACT HTTPS basic-auth path that
-#    `actions/checkout` + `git push origin staging` use inside
-#    auto-sync-main-to-staging.yml. NOP by construction (push the
-#    current tip to itself = "Everything up-to-date"); auth is
-#    checked at the smart-protocol handshake BEFORE the empty-diff
-#    computation, so bad token → exit 128 with "Authentication
-#    failed". `git ls-remote` is NOT used here because Gitea
-#    falls back to anonymous read on public repos and would
-#    silently green-light a rotated token.
-#
-# Each step exits non-zero with an actionable error message if it
-# fails. The workflow status itself is the operator-facing surface.
-#
-# ============================================================
-# What this does NOT check (intentional)
-# ============================================================
-#
-# - **Branch-protection authz** (failure mode C in auto-sync header):
-#   would require an actual write to staging. Already monitored by
-#   `branch-protection-drift.yml` daily. Don't duplicate.
-# - **Conflict resolution** (failure mode A): a real conflict is data-
-#   driven, not auth-driven; can't synthesise it without polluting
-#   staging. Already surfaces immediately on the next main push.
-# - **Concurrency** (failure mode D): handled by workflow concurrency
-#   group on auto-sync, not a credential issue.
-#
-# ============================================================
-# Why Option B (read-only) and not the alternatives
-# ============================================================
-#
-# Considered + rejected (see issue #72 for full write-up):
-#
-# - **Option A — full auto-sync on schedule**: every run creates a
-#   no-op merge commit on staging when main hasn't advanced. 4 noise
-#   commits/day. And races the real `push:` trigger when main has
-#   advanced. Rejected.
-#
-# - **Option C — push to dedicated `auto-sync-canary` branch**: would
-#   exercise authz too, but adds branch noise on Gitea AND requires
-#   maintaining a second branch protection (or expanding staging's
-#   whitelist to a junk branch). Authz already covered by
-#   `branch-protection-drift.yml`. Rejected.
-#
-# Prior art for the chosen Option B shape:
-#   - Cloudflare's `/user/tokens/verify` endpoint (read-only auth
-#     probe explicitly designed for credential canaries).
-#   - AWS Secrets Manager rotation Lambda's `testSecret` step (auth
-#     probe before promoting AWSPENDING → AWSCURRENT).
-#   - HashiCorp Vault's `vault token lookup` for renewal canaries.
-#
-# ============================================================
-# Operator runbook — what to do when this workflow goes RED
-# ============================================================
-#
-# 1. **Identify which step failed**:
-#    - Step "Verify token authenticates as devops-engineer" red →
-#      token is invalid OR resolves to wrong persona.
-#    - Step "Verify token has repo read scope" red → token valid but
-#      stripped of `read:repository` scope (or repo perms changed).
-#    - Step "Verify git HTTPS auth path via no-op dry-run push to
-#      staging" red → token rotated/revoked OR Gitea git-HTTPS
-#      surface is broken (rare). Auth check happens on the
-#      smart-protocol handshake, separate from the API path.
-#
-# 2. **Re-issue the token** on the operator host:
-#    ```
-#    ssh root@5.78.80.188 'docker exec --user git molecule-gitea-1 \
-#      gitea admin user generate-access-token \
-#      --username devops-engineer \
-#      --token-name persona-devops-engineer-vN \
-#      --scopes "read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc"'
-#    ```
-#    Update `/etc/molecule-bootstrap/agent-secrets.env` in place
-#    (per `feedback_unified_credentials_file`). The previous token
-#    file lands at `.bak.<date>`.
-#
-# 3. **Update the repo Actions secret** at:
-#    Settings → Secrets and variables → Actions → AUTO_SYNC_TOKEN
-#    Paste the new token. (Don't echo it in chat — but per
-#    `feedback_passwords_in_chat_are_burned`, a paste in a 1:1
-#    Claude session is within trust boundary.)
-#
-# 4. **Re-run this canary** via workflow_dispatch. Confirm GREEN.
-#
-# 5. **Backfill any missed main → staging syncs** by re-running
-#    `auto-sync-main-to-staging.yml` from its workflow_dispatch
-#    surface, OR by pushing an empty commit to main (if you'd
-#    rather force a real trigger).
-#
-# ============================================================
-# Security notes
-# ============================================================
-#
-# - Token usage: read-only (`GET /api/v1/user`, `GET /api/v1/repos/...`,
-#   `git ls-remote`, and a no-op `git push --dry-run`). No write paths.
-#   Same blast-radius profile as `actions/checkout` on a public repo.
-# - The token NEVER appears in logs: every `curl` uses a header
-#   variable, never inline; the `git ls-remote` URL builds the
-#   `oauth2:$TOKEN@host` form into a single env var that's not
-#   echoed. GitHub Actions secret-masking covers anything that does
-#   slip through.
-# - No new token introduced — same `AUTO_SYNC_TOKEN` the workflow
-#   under monitor uses. Per least-privilege we deliberately do NOT
-#   broaden scope for the canary.
-
-on:
-  schedule:
-    # Every 6 hours at :17 (offsets the cron herd at :00). Justification
-    # from issue #72: cheap to run (~5s wall-clock, no quota), 3h average
-    # detection latency, 6h max. 1h would be 6× the runs for marginal
-    # benefit; daily would be 4× longer latency and worse than status
-    # quo on a quiet-main day.
-    - cron: '17 */6 * * *'
-  workflow_dispatch:
-
-# No concurrency group needed — the canary is read-only and idempotent.
-# Two parallel runs (e.g. operator dispatch during a scheduled tick) are
-# harmless: same result, doubled HTTPS calls, no shared state.
-
-permissions:
-  contents: read
-
-jobs:
-  verify-token:
-    name: Verify AUTO_SYNC_TOKEN validity
-    runs-on: ubuntu-latest
-    # 2 min surfaces hangs (Gitea API stall, DNS issue) within one
-    # cron interval. Realistic worst case is ~10s: 2 curls + 1 git
-    # ls-remote + 1 dry-run push, each capped by the explicit timeouts below.
-    timeout-minutes: 2
-
-    env:
-      # Pinned in env so individual steps can read it without
-      # repeating the secret reference. GitHub masks the value in
-      # logs automatically.
-      AUTO_SYNC_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
-      # MUST stay in sync with auto-sync-main-to-staging.yml's
-      # `git config user.name "devops-engineer"` line. Renaming the
-      # devops-engineer persona requires updating both files (and
-      # the staging branch protection's `push_whitelist_usernames`).
-      EXPECTED_PERSONA: devops-engineer
-      GITEA_HOST: git.moleculesai.app
-      REPO_PATH: molecule-ai/molecule-core
-
-    steps:
-      - name: Verify AUTO_SYNC_TOKEN secret is configured
-        # Schedule-vs-dispatch behaviour split, per
-        # `feedback_schedule_vs_dispatch_secrets_hardening`:
-        #
-        #   - schedule: hard-fail when the secret is missing. The
-        #     whole point of the canary is to surface drift; soft-
-        #     skipping on missing-secret would make the canary
-        #     itself drift-invisible (sweep-cf-orphans #2088 lesson).
-        #   - workflow_dispatch: hard-fail too — there's no scenario
-        #     where an operator wants this canary to silently no-op.
-        #     The workflow has no other ad-hoc utility; if you ran
-        #     it, you wanted the answer.
-        run: |
-          if [ -z "${AUTO_SYNC_TOKEN}" ]; then
-            echo "::error::AUTO_SYNC_TOKEN secret is not set on this repo." >&2
-            echo "::error::Set it at Settings → Secrets and variables → Actions." >&2
-            echo "::error::Without it, auto-sync-main-to-staging.yml will fail every push to main." >&2
-            exit 1
-          fi
-          echo "AUTO_SYNC_TOKEN is configured (value masked)."
-
-      - name: Verify token authenticates as ${{ env.EXPECTED_PERSONA }}
-        # Calls Gitea's `/api/v1/user` — the canonical
-        # auth-probe-with-no-side-effects endpoint (mirrors
-        # Cloudflare's /user/tokens/verify).
-        #
-        # Failure surfaces:
-        #   - HTTP 401: token invalid (rotated, revoked, or never
-        #     correctly registered).
-        #   - HTTP 200 but username != devops-engineer: token was
-        #     regenerated under the wrong persona — this would let
-        #     auth pass but commit attribution would be wrong, and
-        #     branch-protection authz would fail because only
-        #     `devops-engineer` is whitelisted.
-        run: |
-          set -euo pipefail
-          response_file="$(mktemp)"
-          code_file="$(mktemp)"
-          # `--max-time 30`: full call ceiling. `--connect-timeout 10`:
-          # DNS + TCP. `-w "%{http_code}"` routed to a tempfile so curl's
-          # exit code can't pollute the captured status — see
-          # feedback_curl_status_capture_pollution + the
-          # `lint-curl-status-capture.yml` gate that rejects the unsafe
-          # `$(curl ... || echo "000")` shape.
-          set +e
-          curl -sS -o "$response_file" \
-            --max-time 30 --connect-timeout 10 \
-            -w "%{http_code}" \
-            -H "Authorization: token ${AUTO_SYNC_TOKEN}" \
-            -H "Accept: application/json" \
-            "https://${GITEA_HOST}/api/v1/user" >"$code_file" 2>/dev/null
-          set -e
-          status=$(cat "$code_file" 2>/dev/null || true)
-          [ -z "$status" ] && status="000"
-
-          if [ "$status" != "200" ]; then
-            echo "::error::Token rotation suspected: GET /api/v1/user returned HTTP $status (expected 200)." >&2
-            echo "::error::Likely cause: AUTO_SYNC_TOKEN has been rotated/revoked on Gitea but the repo Actions secret was not updated." >&2
-            echo "::error::Runbook: see header comment of this workflow file." >&2
-            # Print response body but redact anything that looks like a token.
-            sed -E 's/[A-Fa-f0-9]{32,}/<redacted>/g' "$response_file" >&2 || true
-            exit 1
-          fi
-
-          username=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('login',''))" "$response_file")
-          if [ "$username" != "${EXPECTED_PERSONA}" ]; then
-            echo "::error::Token resolves to user '$username', expected '${EXPECTED_PERSONA}'." >&2
-            echo "::error::AUTO_SYNC_TOKEN must be the devops-engineer persona PAT (not founder PAT, not another persona)." >&2
-            echo "::error::Auto-sync push will fail because only 'devops-engineer' is whitelisted on staging branch protection." >&2
-            exit 1
-          fi
-          echo "Token authenticates as: $username ✓"
-
-      - name: Verify token has repo read scope
-        # `GET /api/v1/repos/<owner>/<repo>` requires `read:repository`
-        # on the persona's v2 scope contract. If the scope was
-        # narrowed/dropped on rotation we catch it here, before the
-        # next main push reveals it via a checkout failure.
-        run: |
-          set -euo pipefail
-          response_file="$(mktemp)"
-          code_file="$(mktemp)"
-          # See first probe step for the rationale on the tempfile-routed
-          # `-w "%{http_code}"` pattern — the unsafe `|| echo "000"` shape
-          # is rejected by lint-curl-status-capture.yml.
-          set +e
-          curl -sS -o "$response_file" \
-            --max-time 30 --connect-timeout 10 \
-            -w "%{http_code}" \
-            -H "Authorization: token ${AUTO_SYNC_TOKEN}" \
-            -H "Accept: application/json" \
-            "https://${GITEA_HOST}/api/v1/repos/${REPO_PATH}" >"$code_file" 2>/dev/null
-          set -e
-          status=$(cat "$code_file" 2>/dev/null || true)
-          [ -z "$status" ] && status="000"
-
-          if [ "$status" != "200" ]; then
-            echo "::error::Token lacks read:repository scope on ${REPO_PATH}: HTTP $status." >&2
-            echo "::error::Auto-sync's actions/checkout step will fail with this token." >&2
-            echo "::error::Re-issue with v2 scope contract: read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc" >&2
-            sed -E 's/[A-Fa-f0-9]{32,}/<redacted>/g' "$response_file" >&2 || true
-            exit 1
-          fi
-          echo "Token has read:repository on ${REPO_PATH} ✓"
-
-      - name: Verify git HTTPS auth path via no-op dry-run push to staging
-        # Final probe: exercise the EXACT auth path that
-        # `actions/checkout` + `git push origin staging` use in
-        # auto-sync-main-to-staging.yml. Gitea's API and git-HTTPS
-        # surfaces share the token-lookup code path internally but
-        # the wire-level error shapes differ — historically (#173)
-        # the API path was healthy while git-HTTPS rejected, so
-        # checking only the API would have given false-green.
-        #
-        # IMPORTANT: `git ls-remote` on a public repo (which
-        # molecule-core is) succeeds even with a junk token because
-        # Gitea falls back to anonymous-read. `ls-remote` therefore
-        # CANNOT validate auth on this surface. We use
-        # `git push --dry-run` instead — push is auth-gated even on
-        # public repos.
-        #
-        # NOP shape: read the current staging SHA via authenticated
-        # ls-remote (the SHA itself is public; auth is incidental
-        # here, used only to colocate the discovery in one step), then
-        # `git push --dry-run <SHA>:refs/heads/staging`. Pushing the
-        # current tip back to itself is "Everything up-to-date" with
-        # exit 0 when auth succeeds. With a bad token Gitea returns
-        # HTTP 401 in the smart-protocol handshake and git exits 128
-        # with "Authentication failed".
-        #
-        # The dry-run never reaches Gitea's pre-receive hook (which
-        # is where branch-protection authz runs), so this probe does
-        # not validate failure mode C. That's intentional —
-        # branch-protection-drift.yml owns authz monitoring; this
-        # canary owns auth.
-        env:
-          # Don't hang waiting for password prompt if auth fails on a
-          # terminal-attached run. (In Actions there's no terminal,
-          # but the env-var hardens against an interactive runner
-          # config.)
-          GIT_TERMINAL_PROMPT: "0"
-        run: |
-          set -euo pipefail
-          # Token is in $AUTO_SYNC_TOKEN (job-level env). Compose the
-          # URL as a local var that's never echoed.
-          url="https://oauth2:${AUTO_SYNC_TOKEN}@${GITEA_HOST}/${REPO_PATH}"
-
-          # Step a: read current staging SHA. ~1KB; auth-gated only
-          # on private repos but always works on public — used here
-          # only to discover the SHA, not to validate auth.
-          staging_ref=$(timeout 30s git ls-remote --refs "$url" refs/heads/staging 2>&1) || {
-            redacted=$(echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g")
-            echo "::error::ls-remote against staging failed (network/DNS issue):" >&2
-            echo "$redacted" >&2
-            exit 1
-          }
-          if ! echo "$staging_ref" | grep -qE '^[0-9a-f]{40}[[:space:]]+refs/heads/staging$'; then
-            echo "::error::ls-remote returned unexpected shape:" >&2
-            echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g" >&2
-            exit 1
-          fi
-          staging_sha=$(echo "$staging_ref" | awk '{print $1}')
-
-          # Step b: spin up an ephemeral local repo. `git push` always
-          # requires a local repo even when pushing a remote SHA that
-          # isn't in the local object DB (the protocol negotiates and
-          # discovers we don't need to send any objects). We don't use
-          # `actions/checkout` for this — it would clone the whole
-          # repo (~hundreds of MB) for what's essentially `git init`.
-          tmp_repo="$(mktemp -d)"
-          trap 'rm -rf "$tmp_repo"' EXIT
-          git -C "$tmp_repo" init -q
-          # Author config required for any git operation; values are
-          # arbitrary because nothing gets committed here.
-          git -C "$tmp_repo" config user.email canary@auto-sync.local
-          git -C "$tmp_repo" config user.name auto-sync-canary
-
-          # Step c: dry-run push the current staging SHA back to
-          # staging. NOP by construction — the remote tip equals the
-          # SHA we're pushing, so "Everything up-to-date" is the
-          # success path.
-          #
-          # Authentication is checked at the smart-protocol handshake,
-          # BEFORE the dry-run can compute an empty diff. Bad token
-          # → "Authentication failed", exit 128. Good token → exit 0.
-          set +e
-          push_out=$(timeout 30s git -C "$tmp_repo" push --dry-run "$url" "${staging_sha}:refs/heads/staging" 2>&1)
-          push_rc=$?
-          set -e
-
-          if [ "$push_rc" -ne 0 ]; then
-            redacted=$(echo "$push_out" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g")
-            echo "::error::Token rotation suspected: git push --dry-run against staging failed via the AUTO_SYNC_TOKEN HTTPS auth path (exit $push_rc)." >&2
-            echo "::error::This is the EXACT auth path that actions/checkout + git push use in auto-sync-main-to-staging.yml." >&2
-            echo "::error::Likely cause: AUTO_SYNC_TOKEN was rotated/revoked on Gitea but the repo Actions secret was not updated. Runbook: see header." >&2
-            echo "$redacted" >&2
-            exit 1
-          fi
-
-          echo "git HTTPS auth path: NOP push --dry-run to staging → ${staging_sha:0:8} ✓"
-
-      - name: Summarise canary result
-        # Everything passed — surface a green summary. (Failures
-        # already wrote ::error:: lines and exited above; if we got
-        # here, all three probes passed.)
-        run: |
-          {
-            echo "## Auto-sync canary: GREEN"
-            echo ""
-            echo "AUTO_SYNC_TOKEN is healthy:"
-            echo "- Authenticates as \`${EXPECTED_PERSONA}\` ✓"
-            echo "- Has \`read:repository\` scope on \`${REPO_PATH}\` ✓"
-            echo "- Git HTTPS auth path: no-op dry-run push to \`refs/heads/staging\` succeeds ✓"
-            echo ""
-            echo "Auto-sync main → staging will succeed on the next push to main."
-            echo "If this canary ever goes RED, see the runbook in this workflow's header."
-          } >> "$GITHUB_STEP_SUMMARY"
@@ -3,138 +3,85 @@ name: Auto-sync main → staging
 # Reflects every push to `main` back onto `staging` so the
 # staging-as-superset-of-main invariant holds.
 #
-# ============================================================
-# What this workflow does
-# ============================================================
+# Background:
 #
-# On every push to `main`:
-#   1. Checks if staging already contains main → no-op.
-#   2. Fetches both branches, merges main into staging in the
-#      runner workspace (fast-forward if possible, else
-#      `--no-ff` merge commit).
-#   3. Pushes staging directly to origin via the
-#      `devops-engineer` persona's `AUTO_SYNC_TOKEN`.
+# `auto-promote-staging.yml` advances main via `git merge --ff-only`
+# + `git push origin main` — that's a clean fast-forward, no merge
+# commit. But manual merges of `staging → main` PRs through the
+# GitHub UI / API create a merge commit on main that staging
+# doesn't have. The next `staging → main` PR then evaluates as
+# "BEHIND" because staging is missing that merge commit, requiring
+# a manual `gh pr update-branch` round-trip.
 #
-# Authoritative path: a single `git push origin staging` from
-# inside this workflow is the SSOT for advancing staging after
-# a main push. No PR, no merge queue, no human approval —
-# staging is mechanically maintained as a superset of main.
+# This happened twice on 2026-04-28 (PRs #2202, #2205, both manual
+# bridges). Each time the bridge needed update-branch + a re-CI
+# round before merging. Operationally annoying and avoidable.
 #
-# `auto-promote-staging.yml` is the reverse-direction
-# counterpart (staging → main, gated on green CI). Together
-# they keep the staging-superset-of-main invariant tight.
+# Architecture:
 #
-# ============================================================
-# Why direct push (and not "open a PR")
-# ============================================================
+# This repo's `staging` branch is protected by a `merge_queue`
+# ruleset (id 15500102) that blocks ALL direct pushes — no bypass
+# even for org admins or the GitHub Actions integration. Direct
+# `git push origin staging` returns GH013. So instead of pushing
+# directly, this workflow:
 #
-# Pre-2026-05-06 the canonical SCM was GitHub.com, where:
-#   - The `staging` branch had a `merge_queue` ruleset that
-#     blocked ALL direct pushes (no bypass even for org
-#     admins or the GitHub Actions integration).
-#   - Therefore this workflow opened a PR via `gh pr create`
-#     and let auto-merge land it through the queue.
+#   1. Checks if main is already in staging's ancestry → no-op.
+#   2. Creates an `auto-sync/main-<sha>` branch from staging.
+#   3. Tries `git merge --ff-only origin/main` → if staging hasn't
+#      diverged this is a clean ff.
+#   4. Otherwise `git merge --no-ff origin/main` to absorb main's
+#      tip while keeping staging's history.
+#   5. Pushes the auto-sync branch.
+#   6. Opens a PR (base=staging, head=auto-sync/main-<sha>) and
+#      enables auto-merge so the merge queue lands it.
 #
-# Post-2026-05-06 the canonical SCM is Gitea
-# (`git.moleculesai.app/molecule-ai/molecule-core`). Gitea:
-#   - Has no `merge_queue` concept.
-#   - Allows direct push to protected branches via per-user
-#     `push_whitelist_usernames` on the branch protection.
-#   - Does not expose a GraphQL endpoint, so `gh pr create`
-#     returns `HTTP 405 Method Not Allowed
-#     (https://git.moleculesai.app/api/graphql)` — the
-#     pre-suspension architecture cannot work on Gitea.
+# This mirrors the path human PRs take through staging — same
+# rules, same gates, no special-case bypass.
 #
-# The molecule-ai/molecule-core staging branch protection
-# (verified via `GET /api/v1/repos/.../branch_protections`)
-# whitelists `devops-engineer` for direct push. So the
-# correct Gitea-shape architecture is: authenticate as
-# `devops-engineer`, merge locally, push staging directly.
+# Loop safety:
 #
-# This is structurally simpler than the GitHub-era PR dance
-# and removes the dependence on `gh` CLI / GraphQL entirely.
+# `GITHUB_TOKEN`-authored merges (including the merge queue's land
+# of the auto-sync PR) do NOT trigger downstream workflow runs
+# (GitHub Actions safety). So when the auto-sync PR lands on
+# staging, `auto-promote-staging.yml` is NOT triggered by that
+# push. The next developer push to staging triggers auto-promote
+# normally. No loop possible.
 #
-# ============================================================
-# Identity + token (anti-bot-ring per saved-memory
-# `feedback_per_agent_gitea_identity_default`)
-# ============================================================
+# Concurrency:
 #
-# This workflow uses `secrets.AUTO_SYNC_TOKEN`, which is a
-# personal access token issued to the `devops-engineer`
-# persona on Gitea — NOT the founder PAT. The bot-ring
-# fingerprint that triggered the GitHub org suspension on
-# 2026-05-06 was characterised by founder PAT acting as CI
-# at machine speed; per-persona identities split the
-# attribution honestly.
-#
-# Token scope on Gitea: repo write. Push target restricted
-# to `staging` (this workflow is the only writer; main is
-# untouched). Compromise blast radius: bounded to staging
-# branch + this repo's read surface.
-#
-# Commits are authored by the persona email
-# `devops-engineer@agents.moleculesai.app` so commit history
-# reflects which automation produced the merge.
-#
-# ============================================================
-# Failure modes & operational notes
-# ============================================================
-#
-# A — staging has commits main doesn't, and the merge
-#     conflicts:
-#     - The `--no-ff` merge step exits non-zero. Workflow
-#       fails red. Operator (devops-engineer or human)
-#       resolves manually:
-#         git fetch origin
-#         git checkout staging
-#         git merge --no-ff origin/main
-#         # resolve conflicts
-#         git push origin staging
-#     - Step summary surfaces the conflict so the failed run
-#       is self-explanatory.
-#
-# B — `AUTO_SYNC_TOKEN` rotated / wrong scope:
-#     - `git push` step exits non-zero with `HTTP 401` /
-#       `403`. Step summary surfaces the failed push.
-#     - Re-issue the token from `~/.molecule-ai/personas/`
-#       on the operator host and update the repo Actions
-#       secret. Re-run the workflow.
-#
-# C — staging branch protection no longer whitelists
-#     `devops-engineer`:
-#     - `git push` exits non-zero with a Gitea protected-
-#       branch rejection. Step summary surfaces it.
-#     - Re-add `devops-engineer` to
-#       `push_whitelist_usernames` on the staging
-#       protection (Settings → Branches → staging).
-#
-# D — concurrent push to main while a sync is in flight:
-#     - The `concurrency` group below serialises runs.
-#       The second waits for the first; if main advances
-#       again while we're syncing, the second run picks
-#       up the new tip on its own fetch.
-#
-# ============================================================
-# Loop safety
-# ============================================================
-#
-# The push to staging from this workflow does NOT itself
-# fire a `push: branches: [main]` event (different branch),
-# so there's no risk of self-recursion. `auto-promote-staging.yml`
-# fires on `workflow_run` of CI etc. — it sees the new
-# staging tip on its next gate-completion event, NOT on this
-# push directly. No loop.
+# Two pushes to main in quick succession (e.g., manual UI merge
+# immediately followed by auto-promote-staging's ff-merge) could
+# otherwise open two overlapping auto-sync PRs. The concurrency
+# group serializes runs; the second waits for the first to exit.
+# (The first run exits after opening + auto-merge-queueing the PR,
+# not after the merge actually completes — so multiple PRs can be
+# open simultaneously, but the merge queue handles them serially.)

 on:
  push:
    branches: [main]
-  # workflow_dispatch lets operators manually backfill a
-  # missed sync (e.g. if AUTO_SYNC_TOKEN was rotated and a
-  # main push slipped through while the secret was stale).
+  # workflow_dispatch lets:
+  #   1. Operators manually backfill a missed sync (e.g. after a manual
+  #      UI merge that the runner missed).
+  #   2. auto-promote-staging.yml's polling tail explicitly invoke us
+  #      after the promote PR lands. This is load-bearing: when the
+  #      merge queue lands a promote-PR merge, the resulting push to
+  #      `main` is "by GITHUB_TOKEN", and per GitHub's no-recursion
+  #      rule (https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow)
+  #      that push event does NOT fire any downstream workflows. The
+  #      `on: push` trigger above is silently dead for the very pattern
+  #      we exist to handle. Verified empirically 2026-05-02 against
+  #      SHA 76c604fb (PR #2437 staging→main): only ONE workflow fired
+  #      (publish-workspace-server-image, dispatched explicitly by
+  #      auto-promote's polling tail with an App token). Every other
+  #      `on: push: branches: [main]` workflow — including this one —
+  #      was suppressed. Until the underlying merge call moves to an
+  #      App token, an explicit dispatch is the only reliable path.
  workflow_dispatch:

 permissions:
  contents: write
+  pull-requests: write

 concurrency:
  group: auto-sync-main-to-staging
@@ -142,25 +89,26 @@ concurrency:

 jobs:
  sync-staging:
+    # ubuntu-latest matches every other workflow in this repo. The
+    # earlier `[self-hosted, macos, arm64]` was a copy-paste artefact
+    # from the molecule-controlplane repo (which IS private and uses a
+    # Mac runner) — molecule-core has no Mac runner registered, so the
+    # job sat unassigned whenever the trigger fired. Verified 2026-05-02:
+    # this was the ONLY workflow in molecule-core/.github/workflows/ with
+    # a non-ubuntu runs-on.
    runs-on: ubuntu-latest
    steps:
-      - name: Checkout staging (with devops-engineer push token)
+      - name: Checkout staging
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          ref: staging
-          # AUTO_SYNC_TOKEN authenticates as the
-          # `devops-engineer` Gitea persona — the only
-          # identity whitelisted for direct push to
-          # staging. See header comment for context.
          token: ${{ secrets.AUTO_SYNC_TOKEN }}

      - name: Configure git author
        run: |
-          # Per-persona identity, NOT founder PAT.
-          # `feedback_per_agent_gitea_identity_default`.
-          git config user.name "devops-engineer"
-          git config user.email "devops-engineer@agents.moleculesai.app"
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"

      - name: Check if staging already contains main
        id: check
@@ -170,7 +118,7 @@ jobs:
          if git merge-base --is-ancestor origin/main HEAD; then
            echo "needs_sync=false" >> "$GITHUB_OUTPUT"
            {
-              echo "## No-op"
+              echo "## ✅ No-op"
              echo
              echo "staging already contains \`origin/main\` ($(git rev-parse --short=8 origin/main))."
            } >> "$GITHUB_STEP_SUMMARY"
@@ -178,78 +126,112 @@ jobs:
            echo "needs_sync=true" >> "$GITHUB_OUTPUT"
            MAIN_SHORT=$(git rev-parse --short=8 origin/main)
            echo "main_short=${MAIN_SHORT}" >> "$GITHUB_OUTPUT"
-            echo "::notice::staging is missing main's tip (${MAIN_SHORT}) — merging in-runner and pushing"
+            echo "branch=auto-sync/main-${MAIN_SHORT}" >> "$GITHUB_OUTPUT"
+            echo "::notice::staging is missing main's tip (${MAIN_SHORT}) — opening sync PR"
          fi

-      - name: Merge main into staging (in-runner)
+      - name: Create auto-sync branch + merge main
        if: steps.check.outputs.needs_sync == 'true'
-        id: merge
+        id: prep
        run: |
          set -euo pipefail
-          # Already on staging from checkout. Try fast-forward
-          # first (cleanest history); fall back to merge commit
-          # if staging has commits main doesn't.
+          BRANCH="${{ steps.check.outputs.branch }}"
+
+          # If a previous auto-sync run already pushed a branch for the
+          # same main sha, we reuse its name (idempotent on workflow
+          # restart) but not its contents: `checkout -B` (re)creates the
+          # branch at the current HEAD, so it absorbs any staging-side
+          # commits that landed since.
+          git checkout -B "$BRANCH"
+
          if git merge --ff-only origin/main; then
            echo "did_ff=true" >> "$GITHUB_OUTPUT"
-            echo "::notice::Fast-forwarded staging to origin/main"
+            echo "::notice::Fast-forwarded ${BRANCH} to origin/main"
          else
            echo "did_ff=false" >> "$GITHUB_OUTPUT"
-            if ! git merge --no-ff origin/main \
-                -m "chore: sync main → staging (auto, ${{ steps.check.outputs.main_short }})"; then
+            if ! git merge --no-ff origin/main -m "chore: sync main → staging (auto)"; then
              # Hygiene: leave the work tree clean before failing.
              git merge --abort || true
              {
-                echo "## Conflict"
+                echo "## ❌ Conflict"
                echo
                echo "Auto-merge \`main → staging\` failed with conflicts."
-                echo "A human (or devops-engineer persona) needs to resolve manually:"
-                echo
-                echo '```'
-                echo "git fetch origin"
-                echo "git checkout staging"
-                echo "git merge --no-ff origin/main"
-                echo "# resolve conflicts"
-                echo "git push origin staging"
-                echo '```'
+                echo "A human needs to resolve manually."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
            fi
          fi

-      - name: Push staging to origin
+      - name: Push auto-sync branch
        if: steps.check.outputs.needs_sync == 'true'
        run: |
          set -euo pipefail
-          # Direct push to staging. devops-engineer persona is
-          # whitelisted for direct push on the staging branch
-          # protection (Settings → Branches → staging).
-          #
-          # No --force / --force-with-lease: a fast-forward or
-          # legitimate merge commit on top of current staging
-          # is the only thing we'd ever push. If origin/staging
-          # advanced under us (concurrent merge), the push
-          # legitimately rejects and the next run picks up the
-          # new state.
-          if ! git push origin staging; then
-            {
-              echo "## Push rejected"
-              echo
-              echo "Direct push to \`staging\` failed. Likely causes:"
-              echo "- \`AUTO_SYNC_TOKEN\` rotated / wrong scope (HTTP 401/403)"
-              echo "- \`devops-engineer\` no longer in"
-              echo "  \`push_whitelist_usernames\` on the staging"
-              echo "  branch protection (HTTP 422)"
-              echo "- staging advanced concurrently — re-running this"
-              echo "  workflow on the new main tip will pick it up"
-            } >> "$GITHUB_STEP_SUMMARY"
-            exit 1
+          # Force-with-lease so this run can't silently clobber an
+          # in-flight branch that a concurrent auto-sync run updated
+          # after we fetched. If a different writer touched the branch,
+          # the push is rejected and the next run picks up the latest
+          # state.
+          git push --force-with-lease origin "${{ steps.check.outputs.branch }}"
+
+      - name: Open auto-sync PR + enable auto-merge
+        if: steps.check.outputs.needs_sync == 'true'
+        env:
+          GH_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
+          BRANCH: ${{ steps.check.outputs.branch }}
+          MAIN_SHORT: ${{ steps.check.outputs.main_short }}
+          DID_FF: ${{ steps.prep.outputs.did_ff }}
+        run: |
+          set -euo pipefail
+
+          # Find existing PR for this branch (idempotent on workflow
+          # restart) before creating a new one.
+          PR_NUM=$(gh pr list --head "$BRANCH" --base staging --state open --json number --jq '.[0].number // ""')
+
+          if [ -z "$PR_NUM" ]; then
+            # Body lives in a temp file to keep the multi-line content
+            # out of the YAML block scalar (un-indented newlines inside
+            # an inline shell string break YAML parsing).
+            BODY_FILE=$(mktemp)
+            if [ "$DID_FF" = "true" ]; then
+              TITLE="chore: sync main → staging (auto, ff to ${MAIN_SHORT})"
+              cat > "$BODY_FILE" <<EOFBODY
+          Automated fast-forward of \`staging\` to \`origin/main\` (\`${MAIN_SHORT}\`). Staging has no in-flight commits that diverge from main. Merge queue lands this; no human action needed.
+
+          This PR is auto-generated by \`.github/workflows/auto-sync-main-to-staging.yml\` on every push to \`main\`. It exists because this repo's \`staging\` branch has a \`merge_queue\` ruleset that blocks direct pushes — even from the GitHub Actions integration.
+          EOFBODY
+            else
+              TITLE="chore: sync main → staging (auto, merge ${MAIN_SHORT})"
+              cat > "$BODY_FILE" <<EOFBODY
+          Automated merge of \`origin/main\` (\`${MAIN_SHORT}\`) into \`staging\`. Staging has commits main doesn't, so this is a non-ff merge that absorbs main's tip. Merge queue lands this.
+
+          This PR is auto-generated by \`.github/workflows/auto-sync-main-to-staging.yml\` on every push to \`main\`.
+          EOFBODY
+            fi
+
+            # gh pr create prints the URL on stdout; extract the PR number.
+            PR_URL=$(gh pr create \
+              --base staging \
+              --head "$BRANCH" \
+              --title "$TITLE" \
+              --body-file "$BODY_FILE")
+            PR_NUM=$(echo "$PR_URL" | grep -oE '[0-9]+$' | tail -1)
+            rm -f "$BODY_FILE"
+            echo "::notice::Opened PR #${PR_NUM}"
+          else
+            echo "::notice::Re-using existing PR #${PR_NUM} for ${BRANCH}"
+          fi
+
+          # Enable auto-merge — the merge queue picks it up once
+          # required gates are green. Use --merge for merge commits
+          # (matches the rest of this repo's PR convention).
+          if ! gh pr merge "$PR_NUM" --auto --merge 2>&1; then
+            echo "::warning::Failed to enable auto-merge on PR #${PR_NUM} — operator may need to merge manually."
          fi

          {
-            echo "## Auto-sync succeeded"
+            echo "## ✅ Auto-sync PR opened"
            echo
-            echo "- staging advanced to: \`$(git rev-parse --short=8 HEAD)\`"
-            echo "- main tip: \`${{ steps.check.outputs.main_short }}\`"
-            echo "- Strategy: $([ "${{ steps.merge.outputs.did_ff }}" = "true" ] && echo "fast-forward" || echo "merge commit")"
-            echo "- Pushed by: \`devops-engineer\` (per-agent persona, anti-bot-ring)"
+            echo "- Branch: \`$BRANCH\`"
+            echo "- PR: #$PR_NUM"
+            echo "- Strategy: $([ "$DID_FF" = "true" ] && echo "ff" || echo "merge commit")"
+            echo
+            echo "Merge queue lands the PR once required gates are green; no human action needed unless gates fail."
          } >> "$GITHUB_STEP_SUMMARY"
@@ -57,42 +57,17 @@ jobs:
        id: bump
        if: steps.skip.outputs.skip != 'true'
        env:
-          # Gitea-shape token (act_runner forwards GITHUB_TOKEN as a
-          # short-lived per-run secret with read access to this repo).
-          # We hit `/api/v1/repos/.../pulls?state=closed` directly
-          # because `gh pr list` calls Gitea's GraphQL endpoint, which
-          # returns HTTP 405 (issue #75 / post-#66 sweep).
-          GITEA_TOKEN: ${{ github.token }}
-          REPO: ${{ github.repository }}
-          GITEA_API_URL: ${{ github.server_url }}/api/v1
-          PUSH_SHA: ${{ github.sha }}
+          GH_TOKEN: ${{ github.token }}
        run: |
-          # Find the merged PR whose merge_commit_sha matches this push.
-          # Gitea's `/repos/{owner}/{repo}/pulls?state=closed` returns
-          # PRs sorted newest-first; we paginate up to 50 and jq-filter
-          # on `merge_commit_sha == PUSH_SHA`. Bounded — auto-tag fires
-          # per push to main, so the matching PR is always among the
-          # most recent closures. 50 is comfortably more than the
-          # ~10-20 staging→main promotes that close in any reasonable
-          # window.
-          set -euo pipefail
-          PRS_JSON=$(curl --fail-with-body -sS \
-            -H "Authorization: token ${GITEA_TOKEN}" \
-            -H "Accept: application/json" \
-            "${GITEA_API_URL}/repos/${REPO}/pulls?state=closed&sort=newest&limit=50" \
-            2>/dev/null || echo "[]")
-          PR=$(printf '%s' "$PRS_JSON" \
-            | jq -c --arg sha "$PUSH_SHA" \
-                '[.[] | select(.merged_at != null and .merge_commit_sha == $sha)] | .[0] // empty')
+          # The merged PR for this push commit. `gh pr list --state merged
+          # --search <sha>` finds merged PRs matching the commit; we take the first.
+          PR=$(gh pr list --state merged --search "${{ github.sha }}" --json number,labels --jq '.[0]' 2>/dev/null || echo "")
          if [ -z "$PR" ] || [ "$PR" = "null" ]; then
-            echo "No merged PR found for ${PUSH_SHA} — defaulting to patch bump."
+            echo "No merged PR found for ${{ github.sha }} — defaulting to patch bump."
            echo "kind=patch" >> "$GITHUB_OUTPUT"
            exit 0
          fi
-          # Gitea returns labels under `.labels[].name`, same shape as
-          # GitHub's REST. The previous `gh pr list --json number,labels`
-          # output was identical; jq filter unchanged.
-          LABELS=$(printf '%s' "$PR" | jq -r '.labels[]?.name // empty')
+          LABELS=$(echo "$PR" | jq -r '.labels[].name')
          if echo "$LABELS" | grep -qx 'release:major'; then
            echo "kind=major" >> "$GITHUB_OUTPUT"
          elif echo "$LABELS" | grep -qx 'release:minor'; then
@@ -1,7 +1,7 @@
 name: Block internal-flavored paths

 # Hard CI gate. Internal content (positioning, competitive briefs, sales
-# playbooks, PMM/press drip, draft campaigns) lives in molecule-ai/internal —
+# playbooks, PMM/press drip, draft campaigns) lives in Molecule-AI/internal —
 # this public monorepo must never re-acquire those paths. CEO directive
 # 2026-04-23 after a fleet-wide audit found 79 internal files leaked here.
 #
@@ -135,7 +135,7 @@ jobs:
            echo "::error::Forbidden internal-flavored paths detected:"
            printf "$OFFENDING"
            echo ""
-            echo "These paths belong in molecule-ai/internal, not this public repo."
+            echo "These paths belong in Molecule-AI/internal, not this public repo."
            echo "See docs/internal-content-policy.md for canonical locations."
            echo ""
            echo "If your file is genuinely public-facing (e.g. a blog post"
@@ -19,7 +19,6 @@ on:
    branches: [staging, main]
    paths:
      - 'tools/branch-protection/**'
-      - '.github/workflows/**'
      - '.github/workflows/branch-protection-drift.yml'

 permissions:
@@ -80,32 +79,3 @@ jobs:
          # Repo-admin scope, needed for /branches/:b/protection.
          GH_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADMIN_API }}
        run: bash tools/branch-protection/drift_check.sh
-
-      # Self-test the parity script before running it on the real
-      # workflows — pins the script's classification logic against
-      # synthetic safe/unsafe/missing/unsafe-mix/matrix fixtures so a
-      # regression in the script can't false-pass on the production
-      # workflow audit. Cheap (~0.5s); always runs.
-      - name: Self-test check-name parity script
-        run: bash tools/branch-protection/test_check_name_parity.sh
-
-      # Check-name parity gate (#144 / saved memory
-      # feedback_branch_protection_check_name_parity).
-      #
-      # drift_check.sh asserts the live branch protection matches what
-      # apply.sh would set; check_name_parity.sh closes the orthogonal
-      # gap: it asserts every required check name in apply.sh maps to a
-      # workflow job whose "always emits this status" shape is intact.
-      #
-      # The two checks fail in different scenarios:
-      #
-      #   - drift_check fails → live state was rewritten out-of-band
-      #     (UI click, manual PATCH).
-      #   - check_name_parity fails → an apply.sh required name has no
-      #     emitter, OR the emitting workflow has a top-level paths:
-      #     filter without per-step if-gates (the silent-block shape).
-      #
-      # Cheap (~1s); runs without the admin token because it only reads
-      # apply.sh + .github/workflows/ from the checkout.
-      - name: Run check-name parity gate
-        run: bash tools/branch-protection/check_name_parity.sh
@@ -108,7 +108,7 @@ jobs:
              echo
              echo "One or more canary secrets are unset (\`CANARY_TENANT_URLS\`, \`CANARY_ADMIN_TOKENS\`, \`CANARY_CP_SHARED_SECRET\`)."
              echo "Phase 2 canary fleet has not been stood up yet —"
-              echo "see [canary-tenants.md](https://github.com/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)."
+              echo "see [canary-tenants.md](https://github.com/Molecule-AI/molecule-controlplane/blob/main/docs/canary-tenants.md)."
              echo
              echo "**Skipped — promote-to-latest will NOT auto-fire.** Dispatch \`promote-latest.yml\` manually when ready."
            } >> "$GITHUB_STEP_SUMMARY"
@@ -87,7 +87,7 @@ jobs:
        run: go mod download
      - if: needs.changes.outputs.platform == 'true'
        run: go build ./cmd/server
-      # CLI (molecli) moved to standalone repo: github.com/molecule-ai/molecule-cli
+      # CLI (molecli) moved to standalone repo: github.com/Molecule-AI/molecule-cli
      - if: needs.changes.outputs.platform == 'true'
        run: go vet ./... || true
      - if: needs.changes.outputs.platform == 'true'
@@ -165,7 +165,7 @@ jobs:
              # Strip the package-import prefix so we can match .coverage-allowlist.txt
              # entries written as paths relative to workspace-server/.
              # Handle both module paths: platform/workspace-server/... and platform/...
-              rel=$(echo "$file" | sed 's|^github.com/molecule-ai/molecule-monorepo/platform/workspace-server/||; s|^github.com/molecule-ai/molecule-monorepo/platform/||')
+              rel=$(echo "$file" | sed 's|^github.com/Molecule-AI/molecule-monorepo/platform/workspace-server/||; s|^github.com/Molecule-AI/molecule-monorepo/platform/||')

              if echo "$ALLOWLIST" | grep -qxF "$rel"; then
                echo "::warning file=workspace-server/$rel::Critical file at ${pct}% coverage (allowlisted, #1823) — fix before expiry."
@@ -235,13 +235,7 @@ jobs:
        run: npx vitest run --coverage
      - name: Upload coverage summary as artifact
        if: needs.changes.outputs.canvas == 'true' && always()
-        # Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
-        # the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
-        # implement, surfacing as `GHESNotSupportedError: @actions/artifact
-        # v2.0.0+, upload-artifact@v4+ and download-artifact@v4+ are not
-        # currently supported on GHES`. Drop this pin when Gitea ships
-        # the v4 protocol (tracked: post-Gitea-1.23 followup).
-        uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
+        uses: actions/upload-artifact@v3 # pinned to v3 for Gitea act_runner v0.6 compatibility (internal#46)
        with:
          name: canvas-coverage-${{ github.run_id }}
          path: canvas/coverage/
@@ -249,8 +243,8 @@ jobs:
          if-no-files-found: warn

  # MCP Server + SDK removed from CI — now in standalone repos:
-  # - github.com/molecule-ai/molecule-mcp-server (npm CI)
-  # - github.com/molecule-ai/molecule-sdk-python (PyPI CI)
+  # - github.com/Molecule-AI/molecule-mcp-server (npm CI)
+  # - github.com/Molecule-AI/molecule-sdk-python (PyPI CI)

  # e2e-api job moved to .github/workflows/e2e-api.yml (issue #458).
  # It now has workflow-level concurrency (cancel-in-progress: false) so
@@ -440,5 +434,5 @@ jobs:
          fi

      # SDK + plugin validation moved to standalone repo:
-      # github.com/molecule-ai/molecule-sdk-python
+      # github.com/Molecule-AI/molecule-sdk-python

@@ -1,92 +1,36 @@
 name: CodeQL

-# Stub workflow — CodeQL Action is structurally incompatible with Gitea
-# Actions (post-2026-05-06 SCM migration off GitHub).
+# Controls CodeQL scan triggers for this repo.
 #
-# Why this is a stub, not a real CodeQL run:
+# GitHub's "Code quality" default setup (the UI-configured one) is
+# hardcoded to only scan the default branch — on this repo that's
+# `staging`, so PRs promoting staging→main would otherwise never be
+# scanned. This workflow fills that gap by explicitly scanning both
+# branches on push and PR.
 #
-# 1. github/codeql-action/init@v4 hits api.github.com endpoints
-#    (CodeQL CLI bundle download + query-pack registry + telemetry)
-#    that Gitea 1.22.x does NOT proxy. The act_runner has
-#    GITHUB_SERVER_URL=https://git.moleculesai.app correctly set
-#    (per saved memory feedback_act_runner_github_server_url and
-#    /config.yaml on the operator host), but the Gitea API surface
-#    simply does not implement the codeql-action bundle endpoints.
-#    Observed in run 1d/3101 (2026-05-07): "::error::404 page not
-#    found" inside the Initialize CodeQL step, before any analysis.
-#
-# 2. PR #35 attempted to mark `continue-on-error: true` at the JOB
-#    level (correct YAML structure). Gitea 1.22.6 does NOT propagate
-#    job-level continue-on-error to the commit-status API — every
-#    matrix leg still posts `failure` to the status surface, which
-#    keeps OVERALL=failure on every push to main + staging and
-#    blocks visual auto-promote signals (#156).
-#
-# 3. Hongming policy decision (2026-05-07, task #156): CodeQL is
-#    ADVISORY, not blocking, on Gitea Actions. We do not block PR
-#    merge or staging→main promotion on CodeQL findings until we
-#    have a Gitea-compatible static-analysis pipeline.
-#
-# What this stub preserves:
-#
-# - Workflow name `CodeQL` (referenced by auto-promote-staging.yml
-#   line 67 as a workflow_run gate — must stay stable).
-# - Job name template `Analyze (${{ matrix.language }})` and the
-#   3-leg matrix (go, javascript-typescript, python). Branch
-#   protection / required-check parity (#144) keys on these
-#   exact context names.
-# - merge_group + push + pull_request + schedule triggers, so the
-#   merge-queue check name still resolves (per saved memory
-#   feedback_branch_protection_check_name_parity).
-#
-# Re-enabling real analysis (future work):
-#
-# - Option A: self-hosted Semgrep / OpenGrep via a custom action
-#   that doesn't hit api.github.com. Tracked behind #156 follow-up.
-# - Option B: Sonatype Nexus IQ or similar, called from a step
-#   that uses the Gitea-issued token only.
-# - Option C: re-host this workflow on a small GitHub mirror used
-#   ONLY for SAST (push-mirrored from Gitea). Acceptable trade-off
-#   if/when payment is restored on a non-suspended GitHub org —
-#   but per saved memory feedback_no_single_source_of_truth, we
-#   should design for multi-vendor backup, not GitHub-only SAST.
-#
-# Until one of those lands, this stub keeps commit-status green so
-# the auto-promote chain isn't permanently red on a tool we cannot
-# actually run.
-#
-# Security policy: ADVISORY. We accept the residual risk of un-scanned
-# pushes during this window. Compensating controls in place:
-#   - secret-scan.yml runs on every push (active, blocks on hits)
-#   - block-internal-paths.yml blocks forbidden file paths
-#   - lint-curl-status-capture.yml catches one specific class of bug
-#   - branch-protection-drift.yml + the merge_group required-checks
-#     parity keep the gate surface stable
-# These are not equivalent to CodeQL coverage. Status of the
-# replacement plan is tracked in #156.
+# Runs on ubuntu-latest (GHA-hosted — public repo, free). GHAS is NOT
+# enabled on this repo, so results are not uploaded to the Security
+# tab — the scan fails the PR check on findings, and the SARIF is
+# kept as a workflow artifact for triage.

 on:
  push:
    branches: [main, staging]
  pull_request:
    branches: [main, staging]
-  # Required so the matrix legs emit a real result on the queued
-  # commit instead of a false-green when merge queue is enabled.
-  # Per saved memory feedback_branch_protection_check_name_parity:
-  # path-filtered / matrix workflows MUST emit the protected name
-  # via a job that always runs.
+  # GitHub merge queue fires `merge_group` for the queue's pre-merge CI run.
+  # Required so CodeQL Analyze checks get a real result on the queued
+  # commit instead of a false-green. Event only fires once merge queue is
+  # enabled on the target branch — safe to add unconditionally.
  merge_group:
    types: [checks_requested]
  schedule:
-    # Weekly heartbeat. Cheap on a stub (the no-op job is ~5s) but
-    # keeps the workflow visible in Gitea's Actions UI so the next
-    # operator notices it's a stub instead of a missing surface.
+    # Weekly run picks up findings in code that hasn't been touched.
    - cron: '30 1 * * 0'

-# Workflow-level concurrency: only one stub run per branch/PR at a
-# time. cancel-in-progress: false because a quick follow-up push
-# shouldn't kill an in-flight run — even though the stub is fast,
-# the contract should match a real CodeQL run for when we re-enable.
+# Workflow-level concurrency: only one CodeQL run per branch/PR at a time.
+# `cancel-in-progress: false` queues new runs so a quick follow-up push
+# doesn't nuke a 45-min analysis mid-flight.
 concurrency:
  group: codeql-${{ github.ref }}
  cancel-in-progress: false
@@ -94,17 +38,13 @@ concurrency:
 permissions:
  actions: read
  contents: read
-  # No security-events: write — we don't call the upload API anyway,
-  # GHAS isn't on Gitea.
+  # No security-events: write — we don't call the upload API.

 jobs:
  analyze:
-    # Job NAME shape is load-bearing — auto-promote-staging.yml +
-    # branch protection both key on `Analyze (${{ matrix.language }})`.
-    # Do NOT rename without coordinating both surfaces.
    name: Analyze (${{ matrix.language }})
    runs-on: ubuntu-latest
-    timeout-minutes: 5
+    timeout-minutes: 45

    strategy:
      fail-fast: false
@@ -112,25 +52,68 @@ jobs:
        language: [go, javascript-typescript, python]

    steps:
-      # Single-step stub: log the policy decision + emit success.
-      # Exit 0 explicitly so the commit-status API records `success`
-      # for each of the three matrix legs.
-      - name: CodeQL stub (advisory, non-blocking on Gitea)
+      - name: Checkout
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      # github-app-auth sibling-checkout removed 2026-05-07 (#157):
+      # plugin was dropped + the Dockerfile no longer needs it.
+      # jq is pre-installed on ubuntu-latest — no setup step needed.
+
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2
+        with:
+          languages: ${{ matrix.language }}
+          # security-extended widens past the default to include the
+          # full security-query set for a public SaaS surface.
+          queries: security-extended
+
+      - name: Autobuild
+        uses: github/codeql-action/autobuild@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2
+
+      - name: Perform CodeQL Analysis
+        id: analyze
+        uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2
+        with:
+          category: "/language:${{ matrix.language }}"
+          # upload: never — GHAS isn't enabled on this repo, so the
+          # upload API 403s. Write SARIF locally instead.
+          upload: never
+          output: sarif-results/${{ matrix.language }}
+
+      - name: Parse SARIF + fail on findings
+        # The analyze step writes <database>.sarif into the output
+        # directory — database name is the short CodeQL lang id, not
+        # the matrix value (e.g. "javascript-typescript" →
+        # javascript.sarif), so glob rather than hardcode.
+        # Filter to error/warning severity: security-extended emits
+        # "note" rows for informational findings we don't want to fail
+        # the build over.
        shell: bash
        run: |
          set -euo pipefail
-          cat <<EOF
-          CodeQL is currently ADVISORY on Gitea Actions (post-2026-05-06).
-          Language matrix leg: ${{ matrix.language }}
-          Reason: github/codeql-action/init@v4 calls api.github.com
-                  bundle endpoints that Gitea 1.22.x does not implement.
-                  Observed: "::error::404 page not found" in the Init
-                  CodeQL step on every prior run.
-          Policy: per Hongming decision 2026-05-07 (#156), CodeQL is
-                  non-blocking until a Gitea-compatible SAST pipeline
-                  lands. See workflow file header for replacement
-                  options + compensating controls.
-          Status: emitting success so auto-promote isn't permanently
-                  red on a tool we cannot actually run today.
-          EOF
-          echo "::notice::CodeQL ${{ matrix.language }} — advisory stub, success."
+          dir="sarif-results/${{ matrix.language }}"
+          sarif=$(ls "$dir"/*.sarif 2>/dev/null | head -1 || true)
+          if [ -z "$sarif" ] || [ ! -f "$sarif" ]; then
+            echo "::error::No SARIF file found under $dir"
+            ls -la "$dir" 2>/dev/null || true
+            exit 1
+          fi
+          echo "Parsing $sarif"
+          count=$(jq '[.runs[].results[] | select(.level == "error" or .level == "warning")] | length' "$sarif")
+          echo "CodeQL findings (error+warning) for ${{ matrix.language }}: $count"
+          if [ "$count" -gt 0 ]; then
+            echo "::error::CodeQL found $count issues. Details below; full SARIF in the artifact."
+            jq -r '.runs[].results[] | select(.level == "error" or .level == "warning") | "  - [\(.level)] \(.ruleId // "?"): \(.message.text // "(no message)") @ \(.locations[0].physicalLocation.artifactLocation.uri // "?"):\(.locations[0].physicalLocation.region.startLine // "?")"' "$sarif"
+            exit 1
+          fi
+
+      - name: Upload SARIF artifact
+        # Keep SARIF around on success + failure so triagers can diff.
+        # 14-day retention — longer than default 3, short enough not
+        # to bloat quota.
+        if: always()
+        uses: actions/upload-artifact@v3 # pinned to v3 for Gitea act_runner v0.6 compatibility (internal#46)
+        with:
+          name: codeql-sarif-${{ matrix.language }}
+          path: sarif-results/${{ matrix.language }}/
+          retention-days: 14
@@ -12,59 +12,6 @@ name: E2E API Smoke Test
 # spending CI cycles. See the in-job comment on the `e2e-api` job for
 # why this is one job (not two-jobs-sharing-name) and the 2026-04-29
 # PR #2264 incident that drove the consolidation.
-#
-# Parallel-safety (Class B Hongming-owned CICD red sweep, 2026-05-08)
-# -------------------------------------------------------------------
-# Same substrate hazard as PR #98 (handlers-postgres-integration). Our
-# Gitea act_runner runs with `container.network: host` (operator host
-# `/opt/molecule/runners/config.yaml`), which means:
-#
-#   * Two concurrent runs both try to bind their `-p 15432:5432` /
-#     `-p 16379:6379` host ports — the second postgres/redis FATALs
-#     with `Address in use` and `docker run` returns exit 125 with
-#     `Conflict. The container name "/molecule-ci-postgres" is already
-#     in use by container ...`. Verified in run a7/2727 on 2026-05-07.
-#   * The fixed container names `molecule-ci-postgres` / `-redis` (the
-#     pre-fix shape) collide on name AS WELL AS port. The cleanup-with-
-#     `docker rm -f` at the start of the second job KILLS the first
-#     job's still-running postgres/redis.
-#
-# Fix shape (mirrors PR #98's bridge-net pattern, adapted because
-# platform-server is a Go binary on the host, not a containerised
-# step):
-#
-#   1. Unique container names per run:
-#         pg-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
-#         redis-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
-#      `${RUN_ID}-${RUN_ATTEMPT}` is unique even across reruns of the
-#      same run_id.
-#   2. Ephemeral host port per run (`-p 0:5432`), then read the actual
-#      bound port via `docker port` and export DATABASE_URL/REDIS_URL
-#      pointing at it. No fixed host-port → no port collision.
-#   3. `127.0.0.1` (NOT `localhost`) in URLs — IPv6 first-resolve was
-#      the original flake fixed in #92 and the script's still IPv6-
-#      enabled.
-#   4. `if: always()` cleanup so containers don't leak when test steps
-#      fail.
-#
-# Issue #94 items #2 + #3 (also fixed here):
-#   * Pre-pull `alpine:latest` so the platform-server's provisioner
-#     (`internal/handlers/container_files.go`) can stand up its
-#     ephemeral token-write helper without a daemon.io round-trip.
-#   * Create `molecule-monorepo-net` bridge network if missing so the
-#     provisioner's container.HostConfig {NetworkMode: ...} attach
-#     succeeds.
-# Item #1 (timeouts) — evidence on recent runs (77/3191, ae/4270, 0e/
-# 2318) shows Postgres ready in 3s, Redis in 1s, Platform in 1s when
-# they DO come up. Timeouts are not the bottleneck; not bumped.
-#
-# Item explicitly NOT fixed here: failing test `Status back online`
-# fails because the platform's langgraph workspace template image
-# (ghcr.io/molecule-ai/workspace-template-langgraph:latest) returns
-# 403 Forbidden post-2026-05-06 GitHub org suspension. That is a
-# template-registry resolution issue (ADR-002 / local-build mode) and
-# belongs in a separate change that touches workspace-server, not
-# this workflow file.

 on:
  push:
@@ -131,14 +78,11 @@ jobs:
    runs-on: ubuntu-latest
    timeout-minutes: 15
    env:
-      # Unique per-run container names so concurrent runs on the host-
-      # network act_runner don't collide on name OR port.
-      # `${RUN_ID}-${RUN_ATTEMPT}` stays unique across reruns of the
-      # same run_id. PORT is set later (after docker port lookup) since
-      # we let Docker assign an ephemeral host port.
-      PG_CONTAINER: pg-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
-      REDIS_CONTAINER: redis-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
+      DATABASE_URL: postgres://dev:dev@localhost:15432/molecule?sslmode=disable
+      REDIS_URL: redis://localhost:16379
      PORT: "8080"
+      PG_CONTAINER: molecule-ci-postgres
+      REDIS_CONTAINER: molecule-ci-redis
    steps:
      - name: No-op pass (paths filter excluded this commit)
        if: needs.detect-changes.outputs.api != 'true'
@@ -153,53 +97,11 @@ jobs:
          go-version: 'stable'
          cache: true
          cache-dependency-path: workspace-server/go.sum
-      - name: Pre-pull alpine + ensure provisioner network (Issue #94 items #2 + #3)
-        if: needs.detect-changes.outputs.api == 'true'
-        run: |
-          # Provisioner uses alpine:latest for ephemeral token-write
-          # containers (workspace-server/internal/handlers/container_files.go).
-          # Pre-pull so the first provision in test_api.sh doesn't race
-          # the daemon's pull cache. Idempotent — `docker pull` is a no-op
-          # when the image is already present.
-          docker pull alpine:latest >/dev/null
-          # Provisioner attaches workspace containers to
-          # molecule-monorepo-net (workspace-server/internal/provisioner/
-          # provisioner.go::DefaultNetwork). The bridge already exists on
-          # the operator host's docker daemon — `network create` is
-          # idempotent via `|| true`.
-          docker network create molecule-monorepo-net >/dev/null 2>&1 || true
-          echo "alpine:latest pre-pulled; molecule-monorepo-net ensured."
      - name: Start Postgres (docker)
        if: needs.detect-changes.outputs.api == 'true'
        run: |
-          # Defensive cleanup — only matches THIS run's container name,
-          # so it cannot kill a sibling run's postgres. (Pre-fix the
-          # name was static and this rm hit other runs' containers.)
          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
-          # `-p 0:5432` requests an ephemeral host port; we read it back
-          # below and export DATABASE_URL.
-          docker run -d --name "$PG_CONTAINER" \
-            -e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule \
-            -p 0:5432 postgres:16 >/dev/null
-          # Resolve the host-side port assignment. `docker port` prints
-          # `0.0.0.0:NNNN` (and on host-net runners may also print an
-          # IPv6 line — take the first IPv4 line).
-          PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
-          if [ -z "$PG_PORT" ]; then
-            # Fallback: any first line. Some Docker versions print only
-            # one line.
-            PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | head -1 | awk -F: '{print $NF}')
-          fi
-          if [ -z "$PG_PORT" ]; then
-            echo "::error::Could not resolve host port for $PG_CONTAINER"
-            docker port "$PG_CONTAINER" 5432/tcp || true
-            docker logs "$PG_CONTAINER" || true
-            exit 1
-          fi
-          # 127.0.0.1 (NOT localhost) — IPv6 first-resolve flake (#92).
-          echo "PG_PORT=${PG_PORT}" >> "$GITHUB_ENV"
-          echo "DATABASE_URL=postgres://dev:dev@127.0.0.1:${PG_PORT}/molecule?sslmode=disable" >> "$GITHUB_ENV"
-          echo "Postgres host port: ${PG_PORT}"
+          docker run -d --name "$PG_CONTAINER" -e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule -p 15432:5432 postgres:16
          for i in $(seq 1 30); do
            if docker exec "$PG_CONTAINER" pg_isready -U dev >/dev/null 2>&1; then
              echo "Postgres ready after ${i}s"
@@ -214,20 +116,7 @@ jobs:
        if: needs.detect-changes.outputs.api == 'true'
        run: |
          docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
-          docker run -d --name "$REDIS_CONTAINER" -p 0:6379 redis:7 >/dev/null
-          REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
-          if [ -z "$REDIS_PORT" ]; then
-            REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | head -1 | awk -F: '{print $NF}')
-          fi
-          if [ -z "$REDIS_PORT" ]; then
-            echo "::error::Could not resolve host port for $REDIS_CONTAINER"
-            docker port "$REDIS_CONTAINER" 6379/tcp || true
-            docker logs "$REDIS_CONTAINER" || true
-            exit 1
-          fi
-          echo "REDIS_PORT=${REDIS_PORT}" >> "$GITHUB_ENV"
-          echo "REDIS_URL=redis://127.0.0.1:${REDIS_PORT}" >> "$GITHUB_ENV"
-          echo "Redis host port: ${REDIS_PORT}"
+          docker run -d --name "$REDIS_CONTAINER" -p 16379:6379 redis:7
          for i in $(seq 1 15); do
            if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then
              echo "Redis ready after ${i}s"
@@ -246,15 +135,13 @@ jobs:
        if: needs.detect-changes.outputs.api == 'true'
        working-directory: workspace-server
        run: |
-          # DATABASE_URL + REDIS_URL exported by the start-postgres /
-          # start-redis steps point at this run's per-run host ports.
          ./platform-server > platform.log 2>&1 &
          echo $! > platform.pid
      - name: Wait for /health
        if: needs.detect-changes.outputs.api == 'true'
        run: |
          for i in $(seq 1 30); do
-            if curl -sf http://127.0.0.1:8080/health > /dev/null; then
+            if curl -sf http://localhost:8080/health > /dev/null; then
              echo "Platform up after ${i}s"
              exit 0
            fi
@@ -298,9 +185,6 @@ jobs:
            kill "$(cat workspace-server/platform.pid)" 2>/dev/null || true
          fi
      - name: Stop service containers
-        # always() so containers don't leak when test steps fail. The
-        # cleanup is best-effort: if the container is already gone
-        # (e.g. concurrent rerun race), don't fail the job.
        if: always() && needs.detect-changes.outputs.api == 'true'
        run: |
          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
@@ -139,11 +139,7 @@ jobs:

      - name: Upload Playwright report on failure
        if: failure() && needs.detect-changes.outputs.canvas == 'true'
-        # Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
-        # the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
-        # implement (see ci.yml upload step for the canonical error
-        # cite). Drop this pin when Gitea ships the v4 protocol.
-        uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
+        uses: actions/upload-artifact@v3 # pinned to v3 for Gitea act_runner v0.6 compatibility (internal#46)
        with:
          name: playwright-report-staging
          path: canvas/playwright-report-staging/
@@ -151,8 +147,7 @@ jobs:

      - name: Upload screenshots on failure
        if: failure() && needs.detect-changes.outputs.canvas == 'true'
-        # Pinned to v3 for Gitea act_runner v0.6 compatibility (see above).
-        uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
+        uses: actions/upload-artifact@v3 # pinned to v3 for Gitea act_runner v0.6 compatibility (internal#46)
        with:
          name: playwright-screenshots
          path: canvas/test-results/
@@ -14,42 +14,12 @@ name: Handlers Postgres Integration
 # self-review caught it took 2 minutes to set up and would have caught
 # the bug at PR-time.
 #
-# Why this workflow does NOT use `services: postgres:` (Class B fix)
-# ------------------------------------------------------------------
-# Our act_runner config has `container.network: host` (operator host
-# /opt/molecule/runners/config.yaml), which act_runner applies to BOTH
-# the job container AND every service container. With host-net, two
-# concurrent runs of this workflow both try to bind 0.0.0.0:5432 — the
-# second postgres FATALs with `could not create any TCP/IP sockets:
-# Address in use`, and Docker auto-removes it (act_runner sets
-# AutoRemove:true on service containers). By the time the migrations
-# step runs `psql`, the postgres container is gone, hence
-# `Connection refused` then `failed to remove container: No such
-# container` at cleanup time.
+# This job spins a Postgres service container, applies the migration,
+# and runs `go test -tags=integration` against a live DB. Required
+# check on staging branch protection — backend handler PRs cannot
+# merge without a real-DB regression gate.
 #
-# Per-job `container.network` override is silently ignored by
-# act_runner — `--network and --net in the options will be ignored.`
-# appears in the runner log. Documented constraint.
-#
-# So we sidestep `services:` entirely. The job container still uses
-# host-net (inherited from runner config; required for cache server
-# discovery on the bridge IP 172.18.0.17:42631). We launch a sibling
-# postgres on the existing `molecule-monorepo-net` bridge with a
-# UNIQUE name per run — `pg-handlers-${RUN_ID}-${RUN_ATTEMPT}` — and
-# read its bridge IP via `docker inspect`. A host-net job container
-# can reach a bridge-net container directly via the bridge IP (verified
-# manually on operator host 2026-05-08).
-#
-# Trade-offs vs. the original `services:` shape:
-#   + No host-port collision; N parallel runs share the bridge cleanly
-#   + `if: always()` cleanup runs even on test-step failure
-#   - One more step in the workflow (+~3 lines)
-#   - Requires `molecule-monorepo-net` to exist on the operator host
-#     (it does; declared in docker-compose.yml + docker-compose.infra.yml)
-#
-# Class B Hongming-owned CICD red sweep, 2026-05-08.
-#
-# Cost: ~30s job (postgres pull from cache + go build + 4 tests).
+# Cost: ~30s job (postgres pull from GH cache + go build + 4 tests).

 on:
  push:
@@ -89,14 +59,20 @@ jobs:
    name: Handlers Postgres Integration
    needs: detect-changes
    runs-on: ubuntu-latest
-    env:
-      # Unique name per run so concurrent jobs don't collide on the
-      # bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
-      # workflow_dispatch reruns of the same run_id.
-      PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
-      # Bridge network already exists on the operator host (declared
-      # in docker-compose.yml + docker-compose.infra.yml).
-      PG_NETWORK: molecule-monorepo-net
+    services:
+      postgres:
+        image: postgres:15-alpine
+        env:
+          POSTGRES_PASSWORD: test
+          POSTGRES_DB: molecule
+        ports:
+          - 5432:5432
+        # Health check is configured explicitly via the options below (pg_isready).
+        options: >-
+          --health-cmd pg_isready
+          --health-interval 5s
+          --health-timeout 5s
+          --health-retries 10
    defaults:
      run:
        working-directory: workspace-server
@@ -113,57 +89,16 @@ jobs:
        with:
          go-version: 'stable'

-      - if: needs.detect-changes.outputs.handlers == 'true'
-        name: Start sibling Postgres on bridge network
-        working-directory: .
-        run: |
-          # Sanity: the bridge network must exist on the operator host.
-          # Hard-fail loud if it doesn't — easier to spot than a silent
-          # auto-create that diverges from the rest of the stack.
-          if ! docker network inspect "${PG_NETWORK}" >/dev/null 2>&1; then
-            echo "::error::Bridge network '${PG_NETWORK}' missing on operator host. Re-run docker-compose.infra.yml or check ops handbook."
-            exit 1
-          fi
-
-          # If a stale container with the same name exists (rerun on
-          # the same run_id), wipe it first.
-          docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
-
-          docker run -d \
-            --name "${PG_NAME}" \
-            --network "${PG_NETWORK}" \
-            --health-cmd "pg_isready -U postgres" \
-            --health-interval 5s \
-            --health-timeout 5s \
-            --health-retries 10 \
-            -e POSTGRES_PASSWORD=test \
-            -e POSTGRES_DB=molecule \
-            postgres:15-alpine >/dev/null
-
-          # Read back the bridge IP. Always present immediately after
-          # `docker run -d` for bridge networks.
-          PG_HOST=$(docker inspect "${PG_NAME}" \
-            --format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}")
-          if [ -z "${PG_HOST}" ]; then
-            echo "::error::Could not resolve PG_HOST for ${PG_NAME} on ${PG_NETWORK}"
-            docker logs "${PG_NAME}" || true
-            exit 1
-          fi
-          echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV"
-          echo "INTEGRATION_DB_URL=postgres://postgres:test@${PG_HOST}:5432/molecule?sslmode=disable" >> "$GITHUB_ENV"
-          echo "Started ${PG_NAME} at ${PG_HOST}:5432"
-
      - if: needs.detect-changes.outputs.handlers == 'true'
        name: Apply migrations to Postgres service
        env:
          PGPASSWORD: test
        run: |
-          # Wait for postgres to actually accept connections. Docker's
-          # health-cmd handles container-side readiness, but the wire
-          # to the bridge IP is best-tested with pg_isready directly.
+          # Wait for postgres to actually accept connections (the
+          # GHA --health-cmd is best-effort, so psql can still race).
          for i in {1..15}; do
-            if pg_isready -h "${PG_HOST}" -p 5432 -U postgres -q; then break; fi
-            echo "waiting for postgres at ${PG_HOST}:5432..."; sleep 2
+            if pg_isready -h localhost -p 5432 -U postgres -q; then break; fi
+            echo "waiting for postgres..."; sleep 2
          done

          # Apply every .up.sql in lexicographic order with
@@ -196,7 +131,7 @@ jobs:
          # not fine once a cross-table atomicity test came in.
          set +e
          for migration in $(ls migrations/*.sql 2>/dev/null | grep -v '\.down\.sql$' | sort); do
-            if psql -h "${PG_HOST}" -U postgres -d molecule -v ON_ERROR_STOP=1 \
+            if psql -h localhost -U postgres -d molecule -v ON_ERROR_STOP=1 \
                  -f "$migration" >/dev/null 2>&1; then
              echo "✓ $(basename "$migration")"
            else
@@ -210,7 +145,7 @@ jobs:
          # fail if any didn't land — that would be a real regression we
          # want loud.
          for tbl in delegations workspaces activity_logs pending_uploads; do
-            if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
+            if ! psql -h localhost -U postgres -d molecule -tA \
                -c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
                | grep -q 1; then
              echo "::error::$tbl table missing after migration replay — handler integration tests would be meaningless"
@@ -221,32 +156,16 @@ jobs:

      - if: needs.detect-changes.outputs.handlers == 'true'
        name: Run integration tests
+        env:
+          INTEGRATION_DB_URL: postgres://postgres:test@localhost:5432/molecule?sslmode=disable
        run: |
-          # INTEGRATION_DB_URL is exported by the start-postgres step;
-          # points at the per-run bridge IP, not 127.0.0.1, so concurrent
-          # workflow runs don't fight over a host-net 5432 port.
          go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_"

-      - if: failure() && needs.detect-changes.outputs.handlers == 'true'
+      - if: needs.detect-changes.outputs.handlers == 'true' && failure()
        name: Diagnostic dump on failure
        env:
          PGPASSWORD: test
        run: |
-          echo "::group::postgres container status"
-          docker ps -a --filter "name=${PG_NAME}" --format '{{.Status}} {{.Names}}' || true
-          docker logs "${PG_NAME}" 2>&1 | tail -50 || true
-          echo "::endgroup::"
          echo "::group::delegations table state"
-          psql -h "${PG_HOST}" -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
+          psql -h localhost -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
          echo "::endgroup::"
-
-      - if: always() && needs.detect-changes.outputs.handlers == 'true'
-        name: Stop sibling Postgres
-        working-directory: .
-        run: |
-          # always() so containers don't leak when migrations or tests
-          # fail. The cleanup is best-effort: if the container is
-          # already gone (e.g. concurrent rerun race), don't fail the job.
-          docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
-          echo "Cleaned up ${PG_NAME}"
-
@@ -98,55 +98,6 @@ jobs:
      # github-app-auth sibling-checkout removed 2026-05-07 (#157):
      # the plugin was dropped + Dockerfile.tenant no longer COPYs it.

-      # Pre-clone manifest deps before docker compose builds the tenant
-      # image (Task #173 followup — same pattern as
-      # publish-workspace-server-image.yml's "Pre-clone manifest deps"
-      # step).
-      #
-      # Why pre-clone here too: tests/harness/compose.yml builds tenant-alpha
-      # and tenant-beta from workspace-server/Dockerfile.tenant with
-      # context=../.. (repo root). That Dockerfile expects
-      # .tenant-bundle-deps/{workspace-configs-templates,org-templates,plugins}
-      # to be present at build context root (post-#173 it COPYs from there
-      # instead of running an in-image clone — the in-image clone failed
-      # with "could not read Username for https://git.moleculesai.app"
-      # because there's no auth path inside the build sandbox).
-      #
-      # Without this step harness-replays fails before any replay runs,
-      # with `failed to calculate checksum of ref ...
-      # "/.tenant-bundle-deps/plugins": not found`. Caught by run #892
-      # (main, 2026-05-07T20:28:53Z) and run #964 (staging — same
-      # symptom, different root cause: staging still has the in-image
-      # clone path, hits the auth error directly).
-      #
-      # Token shape matches publish-workspace-server-image.yml: AUTO_SYNC_TOKEN
-      # is the devops-engineer persona PAT, NOT the founder PAT (per
-      # `feedback_per_agent_gitea_identity_default`). clone-manifest.sh
-      # embeds it as basic-auth for the duration of the clones and strips
-      # .git directories — the token never enters the resulting image.
-      - name: Pre-clone manifest deps
-        if: needs.detect-changes.outputs.run == 'true'
-        env:
-          MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
-        run: |
-          set -euo pipefail
-          if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
-            echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
-            exit 1
-          fi
-          mkdir -p .tenant-bundle-deps
-          bash scripts/clone-manifest.sh \
-            manifest.json \
-            .tenant-bundle-deps/workspace-configs-templates \
-            .tenant-bundle-deps/org-templates \
-            .tenant-bundle-deps/plugins
-          # Sanity-check counts so a silent partial clone fails fast
-          # instead of producing a half-empty image.
-          ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
-          org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
-          plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
-          echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
-
      - name: Install Python deps for replays
        # peer-discovery-404 (and future replays) eval Python against the
        # running tenant — importing workspace/a2a_client.py pulls in
@@ -19,4 +19,4 @@ permissions:

 jobs:
  disable-auto-merge-on-push:
-    uses: molecule-ai/molecule-ci/.github/workflows/disable-auto-merge-on-push.yml@main
+    uses: Molecule-AI/molecule-ci/.github/workflows/disable-auto-merge-on-push.yml@main
@@ -25,7 +25,7 @@ name: publish-runtime
 #   3. Publishes to PyPI via the PyPA Trusted Publisher action (OIDC).
 #      No static API token is stored — PyPI verifies the workflow's
 #      OIDC claim against the trusted-publisher config registered for
-#      molecule-ai-workspace-runtime (molecule-ai/molecule-core,
+#      molecule-ai-workspace-runtime (Molecule-AI/molecule-core,
 #      publish-runtime.yml, environment pypi-publish).
 #
 # After publish: the 8 template repos pick up the new version on their
@@ -166,7 +166,7 @@ jobs:

      - name: Publish to PyPI (Trusted Publisher / OIDC)
        # PyPI side is configured: project molecule-ai-workspace-runtime →
-        # publisher molecule-ai/molecule-core, workflow publish-runtime.yml,
+        # publisher Molecule-AI/molecule-core, workflow publish-runtime.yml,
        # environment pypi-publish. The action mints a short-lived OIDC
        # token and exchanges it for a PyPI upload credential — no static
        # API token in this repo's secrets.
@@ -282,42 +282,33 @@ jobs:
          echo "::error::Refusing to fan out cascade against stale or corrupt PyPI surfaces."
          exit 1

-      - name: Fan out repository_dispatch
+      - name: Fan out via push to .runtime-version
        env:
-          # Fine-grained PAT with `actions:write` on the 8 template repos.
-          # GITHUB_TOKEN can't fire dispatches across repos — needs an explicit
-          # token. Stored as a repo secret; rotate per the standard schedule.
-          DISPATCH_TOKEN: ${{ secrets.TEMPLATE_DISPATCH_TOKEN }}
-          # Single source of truth: the publish job's output, which handles
-          # tag/manual-input/auto-bump uniformly. The previous fallback
-          # (`steps.version.outputs.version` from inside the cascade job)
-          # was a dead reference — different job, no shared step scope.
+          # Gitea PAT with write:repository scope on the 8 cascade-active
+          # template repos. Used here for `git push` (NOT for an API
+          # dispatch — Gitea 1.22.6 has no repository_dispatch endpoint;
+          # empirically verified across 6 candidate paths in molecule-
+          # core#20 issuecomment-913). The push trips each template's
+          # existing `on: push: branches: [main]` trigger on
+          # publish-image.yml, which then reads the updated
+          # .runtime-version via its resolve-version job.
+          DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
          RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
        run: |
          set +e   # don't abort on a single repo failure — collect them all
-          # Schedule-vs-dispatch behaviour split (hardened 2026-04-28
-          # after the sweep-cf-orphans soft-skip incident — same class
-          # of bug):
-          #
-          # The earlier "skipping cascade. templates will pick up the
-          # new version on their own next rebuild" message was wrong —
-          # templates only build on this dispatch trigger; without it
-          # they stay pinned to whatever runtime version they last saw.
-          # A silent skip here means "PyPI is current, templates are
-          # not" and the gap is invisible until someone notices a
-          # template still on the old version weeks later.
-          #
-          #   - push                → exit 1 (red CI surfaces the gap)
-          #   - workflow_dispatch   → exit 0 with a warning (operator
-          #                           ran this ad-hoc; let them rerun
-          #                           after fixing the secret)
+
+          # Soft-skip on workflow_dispatch when the token is missing
+          # (operator ad-hoc test); hard-fail on push so unattended
+          # publishes can't silently skip the cascade. Same shape as
+          # the original v1, intentional split per the schedule-vs-
+          # dispatch hardening 2026-04-28.
          if [ -z "$DISPATCH_TOKEN" ]; then
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
-              echo "::warning::TEMPLATE_DISPATCH_TOKEN secret not set — skipping cascade."
+              echo "::warning::DISPATCH_TOKEN secret not set — skipping cascade."
              echo "::warning::set it at Settings → Secrets and Variables → Actions, then rerun. Templates will stay on the prior runtime version until either this token is set or each template is rebuilt manually."
              exit 0
            fi
-            echo "::error::TEMPLATE_DISPATCH_TOKEN secret missing — cascade cannot fan out."
+            echo "::error::DISPATCH_TOKEN secret missing — cascade cannot fan out."
            echo "::error::PyPI was published, but the 8 template repos will NOT pick up the new version until this token is restored and a republish dispatches the cascade."
            echo "::error::set it at Settings → Secrets and Variables → Actions; then re-trigger publish-runtime via workflow_dispatch."
            exit 1
@@ -327,37 +318,119 @@ jobs:
            echo "::error::publish job did not expose a version output — cascade cannot fan out"
            exit 1
          fi
-          # All 9 active workspace template repos. The PR #2536 pruning
-          # ("deprecated, no shipping images") was empirically wrong:
-          # continuous-synth-e2e.yml defaults to langgraph as its primary
-          # canary (line 44), and every excluded template had successful
-          # publish-image runs as of 2026-05-03 — none were dormant.
-          # Symptom of the prune: today's a2a-sdk strict-mode fix
-          # (#2566 / commit e1628c4) cascaded to 4 templates but never
-          # reached langgraph, so the synth-E2E correctly canary'd a fix
-          # that had landed but not deployed. Re-added the 5 templates.
-          # Long-term: derive this list from manifest.json so cascade
-          # scope can't drift from E2E scope — tracked in RFC #388 as a
-          # Phase-1 invariant.
+
+          # All 9 workspace templates declared in manifest.json. The list
+          # MUST stay aligned with manifest.json's workspace_templates —
+          # cascade-list-drift-gate.yml enforces this in CI per the
+          # codex-stuck-on-stale-runtime invariant from PR #2556.
+          # Long-term goal: derive this list from manifest.json so it
+          # can't drift even on a manifest edit (RFC #388 Phase-1).
+          #
+          # Per-template publish-image.yml presence is checked at
+          # cascade-time below: codex doesn't ship one today, so the
+          # cascade soft-skips it with an informational message rather
+          # than dropping it from this list (which would re-introduce
+          # the drift the gate exists to catch).
+          GITEA_URL="${GITEA_URL:-https://git.moleculesai.app}"
          TEMPLATES="claude-code hermes openclaw codex langgraph crewai autogen deepagents gemini-cli"
          FAILED=""
+          SKIPPED=""
+
+          # Configure git identity once. Commits are authored as the
+          # generic "publish-runtime cascade" identity set below; the
+          # persona owning DISPATCH_TOKEN only authenticates the clone
+          # and push. The matching Co-Authored-By trailer in the commit
+          # message keeps the audit trail honest about the workflow origin.
+          git config --global user.name  "publish-runtime cascade"
+          git config --global user.email "publish-runtime@moleculesai.app"
+
+          WORKDIR="$(mktemp -d)"
          for tpl in $TEMPLATES; do
            REPO="molecule-ai/molecule-ai-workspace-template-$tpl"
-            STATUS=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" \
-              -X POST "https://api.github.com/repos/$REPO/dispatches" \
-              -H "Authorization: Bearer $DISPATCH_TOKEN" \
-              -H "Accept: application/vnd.github+json" \
-              -H "X-GitHub-Api-Version: 2022-11-28" \
-              -d "{\"event_type\":\"runtime-published\",\"client_payload\":{\"runtime_version\":\"$VERSION\"}}")
-            if [ "$STATUS" = "204" ]; then
-              echo "✓ dispatched $tpl ($VERSION)"
-            else
-              echo "::warning::✗ failed to dispatch $tpl: HTTP $STATUS — $(cat /tmp/dispatch.out)"
+            CLONE="$WORKDIR/$tpl"
+
+            # Pre-check: skip templates without a publish-image.yml.
+            # The cascade's job is to trip the template's on-push
+            # rebuild — if there's no rebuild workflow, pushing a
+            # .runtime-version commit is just noise on the target
+            # repo. Use the Gitea contents API (no clone required for
+            # the probe). 200 = present; 404 = absent.
+            HTTP=$(curl -sS -o /dev/null -w "%{http_code}" \
+              -H "Authorization: token $DISPATCH_TOKEN" \
+              "$GITEA_URL/api/v1/repos/$REPO/contents/.github/workflows/publish-image.yml")
+            if [ "$HTTP" = "404" ]; then
+              echo "↷ $tpl has no publish-image.yml — soft-skip (informational; manifest still tracks it)"
+              SKIPPED="$SKIPPED $tpl"
+              continue
+            fi
+            if [ "$HTTP" != "200" ]; then
+              echo "::warning::$tpl publish-image.yml probe returned HTTP $HTTP — proceeding anyway, push will surface the real failure if any"
+            fi
+
+            # Use a per-template attempt loop so a transient race (e.g.
+            # human pushing to the same template at the same instant)
+            # doesn't lose the cascade. Bounded retries (3) — beyond
+            # that we surface the failure and let the operator retry.
+            attempt=0
+            success=false
+            while [ $attempt -lt 3 ]; do
+              attempt=$((attempt + 1))
+              rm -rf "$CLONE"
+              if ! git clone --depth=1 \
+                  "https://x-access-token:${DISPATCH_TOKEN}@${GITEA_URL#https://}/$REPO.git" \
+                  "$CLONE" >/tmp/clone.log 2>&1; then
+                echo "::warning::clone $tpl attempt $attempt failed: $(tail -n3 /tmp/clone.log)"
+                sleep 2
+                continue
+              fi
+
+              cd "$CLONE"
+              echo "$VERSION" > .runtime-version
+
+              # Idempotency guard: stage first, then compare index to HEAD —
+              # plain `git diff` ignores an untracked (brand-new) file and
+              # would misreport it as already pinned. No staged change means
+              # this version was already cascaded; skip the no-op push.
+              git add .runtime-version
+              if git diff --cached --quiet -- .runtime-version; then
+                echo "✓ $tpl already at $VERSION — no commit needed (idempotent)"
+                success=true
+                cd - >/dev/null
+                break
+              fi
+
+              git commit -m "chore: pin runtime to $VERSION (publish-runtime cascade)" \
+                -m "Co-Authored-By: publish-runtime cascade <publish-runtime@moleculesai.app>" \
+                >/dev/null
+
+              if git push origin HEAD:main >/tmp/push.log 2>&1; then
+                echo "✓ $tpl pushed $VERSION on attempt $attempt"
+                success=true
+                cd - >/dev/null
+                break
+              fi
+
+              # Likely a non-fast-forward — pull-rebase and retry.
+              # Don't force-push: that would silently overwrite a racing
+              # human/cascade commit.
+              echo "::warning::push $tpl attempt $attempt failed, pull-rebasing: $(tail -n3 /tmp/push.log)"
+              git pull --rebase origin main >/tmp/rebase.log 2>&1 || true
+              cd - >/dev/null
+            done
+
+            if [ "$success" != "true" ]; then
              FAILED="$FAILED $tpl"
            fi
          done
+          rm -rf "$WORKDIR"
+
          if [ -n "$FAILED" ]; then
-            echo "::warning::Cascade incomplete. Failed templates:$FAILED"
-            # Don't fail the whole job — PyPI publish already succeeded;
-            # operators can retry the failed templates manually.
+            echo "::error::Cascade incomplete after 3 retries each. Failed templates:$FAILED"
+            echo "::error::PyPI publish succeeded; failed templates lag the new version. Re-run this workflow_dispatch with the same version to retry only the laggers (idempotent — already-cascaded templates skip)."
+            exit 1
+          fi
+          if [ -n "$SKIPPED" ]; then
+            echo "Cascade complete: pinned $VERSION on cascade-active templates. Soft-skipped (no publish-image.yml):$SKIPPED"
+          else
+            echo "Cascade complete: $VERSION pinned across all manifest workspace_templates."
          fi
@@ -37,7 +37,6 @@ on:
      - 'workspace-server/**'
      - 'canvas/**'
      - 'manifest.json'
-      - 'scripts/**'
      - '.github/workflows/publish-workspace-server-image.yml'
  workflow_dispatch:

@@ -75,87 +74,33 @@ jobs:
      # plugin was dropped + workspace-server/Dockerfile no longer
      # COPYs it.

-      # ECR auth + buildx setup are now inline in each build step
-      # below (Task #173, 2026-05-07).
-      #
-      # Why moved inline: aws-actions/configure-aws-credentials@v4 +
-      # aws-actions/amazon-ecr-login@v2 + docker/setup-buildx-action
-      # all left auth state in places that the actual `docker push`
-      # couldn't see on Gitea Actions:
-      #   - The actions wrote to a step-scoped DOCKER_CONFIG path
-      #     that didn't survive into subsequent shell steps.
-      #   - Buildx couldn't bridge the runner container ↔
-      #     operator-host docker daemon auth gap (401 on the
-      #     docker-container driver, "no basic auth credentials"
-      #     with the action-driven login).
-      #
-      # Doing AWS+ECR auth inline (`aws ecr get-login-password |
-      # docker login`) in the same shell step as `docker build` +
-      # `docker push` is the operator-host manual approach, mapped
-      # 1:1 into CI. Auth state is guaranteed to live in the env that
-      # `docker push` actually runs from.
-      #
-      # Post-suspension target is the operator's ECR org
-      # (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/*),
-      # which already hosts platform-tenant + workspace-template-* +
-      # runner-base images. AWS creds come from the
-      # AWS_ACCESS_KEY_ID/SECRET secrets bound to the molecule-cp
-      # IAM user. Closes #161.
+      - name: Configure AWS credentials for ECR
+        # GHCR was the pre-suspension target; the molecule-ai org on
+        # GitHub got swept 2026-05-06 and ghcr.io/molecule-ai/* is no
+        # longer reachable. Post-suspension target is the operator's
+        # ECR org (153263036946.dkr.ecr.us-east-2.amazonaws.com/
+        # molecule-ai/*), which already hosts platform-tenant +
+        # workspace-template-* + runner-base images. AWS creds come
+        # from the AWS_ACCESS_KEY_ID/SECRET secrets bound to the
+        # molecule-cp IAM user. Closes #161.
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-2
+
+      - name: Log in to ECR
+        id: ecr-login
+        uses: aws-actions/amazon-ecr-login@v2
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0

      - name: Compute tags
        id: tags
        run: |
          echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"

-      # Pre-clone manifest deps before docker build (Task #173 fix).
-      #
-      # Why pre-clone: post-2026-05-06, every workspace-template-* repo on
-      # Gitea (codex, crewai, deepagents, gemini-cli, langgraph) plus all
-      # 7 org-template-* repos are private. The pre-fix Dockerfile.tenant
-      # ran `git clone` inside an in-image stage, which had no auth path
-      # — every CI build failed with "fatal: could not read Username for
-      # https://git.moleculesai.app". For weeks, every workspace-server
-      # rebuild required a manual operator-host push. Now we clone in the
-      # trusted CI context (where AUTO_SYNC_TOKEN is naturally available)
-      # and Dockerfile.tenant just COPYs from .tenant-bundle-deps/.
-      #
-      # Token shape: AUTO_SYNC_TOKEN is the devops-engineer persona PAT
-      # (see /etc/molecule-bootstrap/agent-secrets.env). Per saved memory
-      # `feedback_per_agent_gitea_identity_default`, every CI surface uses
-      # a per-persona token, never the founder PAT. clone-manifest.sh
-      # embeds it as basic-auth (oauth2:<token>) for the duration of the
-      # clones, then strips .git directories — the token never enters
-      # the resulting image.
-      #
-      # Idempotent: if a re-run finds populated dirs, clone-manifest.sh
-      # skips them; safe to retrigger via path-filter or workflow_dispatch.
-      - name: Pre-clone manifest deps
-        env:
-          MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
-        run: |
-          set -euo pipefail
-          if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
-            echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
-            exit 1
-          fi
-          mkdir -p .tenant-bundle-deps
-          bash scripts/clone-manifest.sh \
-            manifest.json \
-            .tenant-bundle-deps/workspace-configs-templates \
-            .tenant-bundle-deps/org-templates \
-            .tenant-bundle-deps/plugins
-          # Sanity-check counts so a silent partial clone fails fast
-          # instead of producing a half-empty image.
-          ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
-          org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
-          plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
-          echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
-          # Counts are derived from manifest.json (9 ws / 7 org / 21
-          # plugins as of 2026-05-07). If manifest.json grows but the
-          # clone step regresses silently, the find above caps at the
-          # actual disk state — but clone-manifest.sh's own EXPECTED vs
-          # CLONED check (line ~95) is the authoritative fail-fast.
-
      # Canary-gated release flow:
      #   - This step always publishes :staging-<sha> + :staging-latest.
      #   - On staging push, staging-CP picks up :staging-latest immediately
@@ -181,82 +126,58 @@ jobs:
      # were running pre-RFC code. Adding the staging trigger above closes
      # that gap. Earlier 2026-04-24 incident: a static :staging-<sha> pin
      # drifted 10 days behind staging — same class of bug, different
-      # mechanism. ECR repo molecule-ai/platform created 2026-05-07.
-      # Build + push platform image with plain `docker` (no buildx).
-      # GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
-      # returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
-      # The OCI revision label below carries the same value for registry
-      # tooling; the duplication is intentional.
-      - name: Build & push platform image to ECR (staging-<sha> + staging-latest)
-        env:
-          IMAGE_NAME: ${{ env.IMAGE_NAME }}
-          TAG_SHA: staging-${{ steps.tags.outputs.sha }}
-          TAG_LATEST: staging-latest
-          GIT_SHA: ${{ github.sha }}
-          REPO: ${{ github.repository }}
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          AWS_DEFAULT_REGION: us-east-2
-        run: |
-          set -euo pipefail
-          # ECR auth in-step so config.json is populated in the same
-          # shell env that runs `docker push`. ECR get-login-password
-          # tokens last 12h, plenty for a single-step build+push.
-          ECR_REGISTRY="${IMAGE_NAME%%/*}"
-          aws ecr get-login-password --region us-east-2 | \
-            docker login --username AWS --password-stdin "${ECR_REGISTRY}"
-          docker build \
-            --file ./workspace-server/Dockerfile \
-            --build-arg GIT_SHA="${GIT_SHA}" \
-            --label "org.opencontainers.image.source=https://github.com/${REPO}" \
-            --label "org.opencontainers.image.revision=${GIT_SHA}" \
-            --label "org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify" \
-            --tag "${IMAGE_NAME}:${TAG_SHA}" \
-            --tag "${IMAGE_NAME}:${TAG_LATEST}" \
-            .
-          docker push "${IMAGE_NAME}:${TAG_SHA}"
-          docker push "${IMAGE_NAME}:${TAG_LATEST}"
-
-      # Canvas uses same-origin fetches. The tenant Go platform
-      # reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL
-      # env; the tenant's /canvas/viewport, /approvals/pending,
-      # /org/templates etc. live on the tenant platform itself.
-      # Both legs share one origin (the tenant subdomain) so
-      # PLATFORM_URL="" forces canvas to fetch paths as relative,
-      # which land same-origin.
-      #
-      # Self-hosted / private-label deployments override this at
-      # build time with a specific backend (e.g. local dev:
-      # NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
-      - name: Build & push tenant image to ECR (staging-<sha> + staging-latest)
-        env:
-          TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
-          TAG_SHA: staging-${{ steps.tags.outputs.sha }}
-          TAG_LATEST: staging-latest
-          GIT_SHA: ${{ github.sha }}
-          REPO: ${{ github.repository }}
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          AWS_DEFAULT_REGION: us-east-2
-        run: |
-          set -euo pipefail
-          # Re-login: the platform-image step's docker login wrote to
-          # the same config.json, so this is technically redundant — but
-          # making each push step self-contained keeps the workflow
-          # robust to step reordering / future extraction.
-          ECR_REGISTRY="${TENANT_IMAGE_NAME%%/*}"
-          aws ecr get-login-password --region us-east-2 | \
-            docker login --username AWS --password-stdin "${ECR_REGISTRY}"
-          docker build \
-            --file ./workspace-server/Dockerfile.tenant \
-            --build-arg NEXT_PUBLIC_PLATFORM_URL= \
-            --build-arg GIT_SHA="${GIT_SHA}" \
-            --label "org.opencontainers.image.source=https://github.com/${REPO}" \
-            --label "org.opencontainers.image.revision=${GIT_SHA}" \
-            --label "org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify" \
-            --tag "${TENANT_IMAGE_NAME}:${TAG_SHA}" \
-            --tag "${TENANT_IMAGE_NAME}:${TAG_LATEST}" \
-            .
-          docker push "${TENANT_IMAGE_NAME}:${TAG_SHA}"
-          docker push "${TENANT_IMAGE_NAME}:${TAG_LATEST}"
+      # mechanism.
+      # Registry note: this workflow authenticates against ECR in the
+      # "Log in to ECR" step; the step name below reflects that target.
+      - name: Build & push platform image to ECR (staging-<sha> + staging-latest)
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
+        with:
+          context: .
+          file: ./workspace-server/Dockerfile
+          platforms: linux/amd64
+          push: true
+          tags: |
+            ${{ env.IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }}
+            ${{ env.IMAGE_NAME }}:staging-latest
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
+          # returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
+          # This is the same value as the OCI revision label below; passing
+          # it twice is intentional, the OCI label is for registry tooling
+          # while /buildinfo is for the redeploy verification step.
+          build-args: |
+            GIT_SHA=${{ github.sha }}
+          labels: |
+            org.opencontainers.image.source=https://github.com/${{ github.repository }}
+            org.opencontainers.image.revision=${{ github.sha }}
+            org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify

+      # Registry note: this workflow authenticates against ECR in the
+      # "Log in to ECR" step; the step name below reflects that target.
+      - name: Build & push tenant image to ECR (staging-<sha> + staging-latest)
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
+        with:
+          context: .
+          file: ./workspace-server/Dockerfile.tenant
+          platforms: linux/amd64
+          push: true
+          tags: |
+            ${{ env.TENANT_IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }}
+            ${{ env.TENANT_IMAGE_NAME }}:staging-latest
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # Canvas uses same-origin fetches. The tenant Go platform
+          # reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL
+          # env; the tenant's /canvas/viewport, /approvals/pending,
+          # /org/templates etc. live on the tenant platform itself.
+          # Both legs share one origin (the tenant subdomain) so
+          # PLATFORM_URL="" forces canvas to fetch paths as relative,
+          # which land same-origin.
+          #
+          # Self-hosted / private-label deployments override this at
+          # build time with a specific backend (e.g. local dev:
+          # NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
+          build-args: |
+            NEXT_PUBLIC_PLATFORM_URL=
+            GIT_SHA=${{ github.sha }}
+          labels: |
+            org.opencontainers.image.source=https://github.com/${{ github.repository }}
+            org.opencontainers.image.revision=${{ github.sha }}
+            org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify
@@ -9,7 +9,7 @@ name: redeploy-tenants-on-main
 #
 # This workflow closes the gap by calling the control-plane admin
 # endpoint that performs a canary-first, batched, health-gated rolling
-# redeploy across every live tenant. Implemented in molecule-ai/
+# redeploy across every live tenant. Implemented in Molecule-AI/
 # molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
 # (feat/tenant-auto-redeploy, landing alongside this workflow).
 #
@@ -146,7 +146,7 @@ jobs:

      - name: Call CP redeploy-fleet
        # CP_ADMIN_API_TOKEN must be set as a repo/org secret on
-        # molecule-ai/molecule-core, matching the staging/prod CP's
+        # Molecule-AI/molecule-core, matching the staging/prod CP's
        # CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this
        # repo's secrets for CI.
        env:
@@ -97,7 +97,7 @@ jobs:

      - name: Call staging-CP redeploy-fleet
        # CP_STAGING_ADMIN_API_TOKEN must be set as a repo/org secret
-        # on molecule-ai/molecule-core, matching staging-CP's
+        # on Molecule-AI/molecule-core, matching staging-CP's
        # CP_ADMIN_API_TOKEN env var (visible in Railway controlplane
        # / staging environment). Stored separately from the prod
        # CP_ADMIN_API_TOKEN so a leak of one doesn't auth the other.
@@ -1,99 +1,16 @@
 name: Retarget main PRs to staging

-# Mechanical enforcement of SHARED_RULES rule 8 ("Staging-first
-# workflow, no exceptions"). When a bot opens a PR against `main`,
-# retarget it to `staging` automatically and leave an explanatory
-# comment. Human / CEO-authored PRs (the staging→main promotion
-# PRs, etc.) are left alone — they're the authorised exception
-# to the rule.
+# Mechanical enforcement of SHARED_RULES rule 8 ("Staging-first workflow, no
+# exceptions"). When a bot opens a PR against main, retarget it to staging
+# automatically and leave an explanatory comment. Human CEO-authored PRs (the
+# staging→main promotion PR, etc.) are left alone — they're the authorised
+# exception to the rule.
 #
-# ============================================================
-# What this workflow does
-# ============================================================
-#
-# On `pull_request_target` opened/reopened against `main`:
-#   1. If the PR head is `staging`, skip (the auto-promote PRs
-#      MUST stay base=main).
-#   2. If the PR author is a bot, retarget the PR base to
-#      `staging` via Gitea REST `PATCH /pulls/{N}` body
-#      `{"base":"staging"}`.
-#   3. If the retarget returns 422 "pull request already exists
-#      for base branch 'staging'" (issue #1884 case: another PR
-#      on the same head already targets staging), close the
-#      now-redundant main-PR via Gitea REST instead of failing
-#      red.
-#   4. Post an explainer comment on the retargeted PR via
-#      Gitea REST `POST /issues/{N}/comments`.
-#
-# ============================================================
-# Why Gitea REST (and not `gh api / gh pr close / gh pr comment`)
-# ============================================================
-#
-# Pre-2026-05-06 this workflow used `gh api -X PATCH "repos/{owner}/{repo}/pulls/{N}" -f base=staging`
-# plus `gh pr close` and `gh pr comment`. After the GitHub→Gitea
-# cutover those calls fail because:
-#
-#   - `gh` CLI defaults to `api.github.com`. Even with `GH_HOST`
-#     pointing at Gitea, `gh pr close / comment` route through
-#     GraphQL (`/api/graphql`) which Gitea does not expose.
-#     Empirical: every `gh pr *` call returns
-#     `HTTP 405 Method Not Allowed (https://git.moleculesai.app/api/graphql)`
-#     — same root cause as #65 (auto-sync, fixed in PR #66) and
-#     #73/#195 (auto-promote, fixed in PR #78).
-#   - `gh api -X PATCH /pulls/{N}` happens to use a REST path
-#     that Gitea also has, but the `gh` host-resolution layer
-#     and pagination/retry logic don't always hit Gitea cleanly,
-#     and the cost of switching to direct `curl` is one extra
-#     line of code.
-#
-# So this workflow uses direct `curl` calls to Gitea REST. No
-# `gh` CLI dependency, no GraphQL, no flaky host-resolution.
-#
-# ============================================================
-# Identity + token (anti-bot-ring per saved-memory
-# `feedback_per_agent_gitea_identity_default`)
-# ============================================================
-#
-# Pre-fix this workflow used the per-job ephemeral
-# `secrets.GITHUB_TOKEN`. On Gitea Actions that token has
-# narrow scope and unpredictable cross-PR write capability.
-#
-# Post-fix: `secrets.AUTO_SYNC_TOKEN` (the `devops-engineer`
-# Gitea persona). Same persona used by `auto-sync-main-to-staging.yml`
-# (PR #66) and `auto-promote-staging.yml` (PR #78). Token scope:
-# `push: true` repo write, sufficient for PR-edit + close + comment.
-#
-# Why this token does NOT need branch-protection bypass:
-# patching a PR's base ref is a PR-level operation that does not
-# require push perms on either branch (the PR's own commits stay
-# put; only the metadata changes).
-#
-# ============================================================
-# Failure modes & operational notes
-# ============================================================
-#
-# A — PATCH base→staging returns 422 "pull request already exists"
-#     (issue #1884 case):
-#     - Detected by string-match on response body. Workflow
-#       falls through to closing the now-redundant main-PR
-#       (Gitea REST `PATCH /pulls/{N}` with `state: closed`)
-#       and posts an explanation comment. Step summary surfaces.
-#
-# B — `AUTO_SYNC_TOKEN` rotated / wrong scope:
-#     - First REST call returns 401/403. Step summary surfaces.
-#       Re-issue token from `~/.molecule-ai/personas/` on the
-#       operator host and update repo Actions secret.
-#
-# C — PR was deleted between trigger and run:
-#     - REST call returns 404. Workflow exits 0 with a notice
-#       (the rule was already enforced or the PR is gone).
-#
-# D — author is not actually a bot but the filter mis-fires:
-#     - Filter is conservative: only triggers on
-#       `user.type == 'Bot'`, `login` ends with `[bot]`, or
-#       known bot logins (`molecule-ai[bot]`, `app/molecule-ai`).
-#       Human PRs slip through unaffected. If a NEW bot login
-#       starts shipping main-PRs, add it to the filter.
+# Why an Action instead of only a prompt rule: prompt rules depend on every
+# role's system-prompt.md staying in sync. Today 5 of 8 engineer roles
+# (core-be, core-fe, app-fe, app-qa, devops-engineer) don't have the
+# staging-first section — the bot keeps opening PRs to main. An Action
+# enforces the invariant regardless of prompt drift.

 on:
  pull_request_target:
@@ -107,16 +24,16 @@ jobs:
  retarget:
    name: Retarget to staging
    runs-on: ubuntu-latest
-    # Only fire for bot-authored PRs. Human CEO PRs (staging→main
-    # promotion) are intentional and pass through.
+    # Only fire for bot-authored PRs. Human CEO PRs (staging→main promotion)
+    # are intentional and pass through.
    #
-    # Head-ref guard: never retarget a PR whose head IS `staging`
-    # — those are the auto-promote staging→main PRs (opened by
-    # `devops-engineer` since PR #78 / #195 fix). Retargeting
-    # head=staging onto base=staging fails with HTTP 422 "no new
-    # commits between base 'staging' and head 'staging'", which
-    # would surface as a noisy red workflow run on every
-    # auto-promote (caught 2026-05-03 on the GitHub-era PR #2588).
+    # Head-ref guard: never retarget a PR whose head IS `staging` — those
+    # are the auto-promote staging→main PRs (opened by molecule-ai[bot]
+    # since #2586 switched to an App token, which now passes the bot
+    # filter below). Retargeting head=staging onto base=staging fails
+    # with HTTP 422 "no new commits between base 'staging' and head
+    # 'staging'", which used to surface as a noisy red workflow run on
+    # every auto-promote (caught 2026-05-03 on PR #2588).
    if: >-
      github.event.pull_request.head.ref != 'staging'
      && (
@@ -124,153 +41,65 @@ jobs:
        || endsWith(github.event.pull_request.user.login, '[bot]')
        || github.event.pull_request.user.login == 'app/molecule-ai'
        || github.event.pull_request.user.login == 'molecule-ai[bot]'
-        || github.event.pull_request.user.login == 'devops-engineer'
      )
    steps:
-      - name: Retarget PR base to staging via Gitea REST
+      - name: Retarget PR base to staging
        id: retarget
        env:
-          GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
-          GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
-          REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
-        # Issue #1884 case: when the bot opens a PR against main
-        # and there's already another PR on the same head branch
-        # targeting staging, Gitea's PATCH returns 422 with a
-        # body mentioning "pull request already exists for base
-        # branch 'staging'" (the Gitea message wording is
-        # slightly different from GitHub's; the substring match
-        # below covers both for forward/back compat).
-        # The retarget can't proceed — but the right response is
-        # to close the now-redundant main-PR, not to fail the
-        # workflow noisily. Detect that specific 422 and close
-        # instead.
+        # Issue #1884: when the bot opens a PR against main and there's
+        # already another PR on the same head branch targeting staging,
+        # GitHub's PATCH /pulls returns 422 with
+        # "A pull request already exists for base branch 'staging' …".
+        # The retarget can't proceed — but the right response is to
+        # close the now-redundant main-PR, not to fail the workflow
+        # noisily. Detect that specific 422 and close instead.
        run: |
-          set -euo pipefail
-
-          API="${GITEA_HOST}/api/v1/repos/${REPO}"
-          AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json")
-
-          echo "Retargeting PR #${PR_NUMBER} (author: ${PR_AUTHOR}) from main → staging"
-
-          # Curl-status-capture pattern per `feedback_curl_status_capture_pollution`:
-          # http_code via -w to its own scalar, body to a tempfile, set +e/-e
-          # bracket so curl's non-zero-on-4xx doesn't pollute the script's exit chain.
-          BODY_FILE=$(mktemp)
-          REQ='{"base":"staging"}'
-
          set +e
-          STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
-            -X PATCH -d "${REQ}" \
-            -o "${BODY_FILE}" -w "%{http_code}" \
-            "${API}/pulls/${PR_NUMBER}")
-          CURL_RC=$?
+          echo "Retargeting PR #${PR_NUMBER} (author: ${PR_AUTHOR}) from main → staging"
+          PATCH_OUTPUT=$(gh api -X PATCH \
+            "repos/${{ github.repository }}/pulls/${PR_NUMBER}" \
+            -f base=staging \
+            --jq '.base.ref' 2>&1)
+          PATCH_EXIT=$?
          set -e
-
-          if [ "${CURL_RC}" -ne 0 ]; then
-            echo "::error::curl PATCH failed (rc=${CURL_RC})"
-            rm -f "${BODY_FILE}"
-            exit 1
+          if [ "$PATCH_EXIT" -eq 0 ]; then
+            echo "::notice::Retargeted PR #${PR_NUMBER} → staging"
+            echo "outcome=retargeted" >> "$GITHUB_OUTPUT"
+            exit 0
          fi
-
-          if [ "${STATUS}" = "201" ] || [ "${STATUS}" = "200" ]; then
-            NEW_BASE=$(jq -r '.base.ref // "?"' < "${BODY_FILE}")
-            rm -f "${BODY_FILE}"
-            if [ "${NEW_BASE}" = "staging" ]; then
-              echo "::notice::Retargeted PR #${PR_NUMBER} → staging"
-              echo "outcome=retargeted" >> "$GITHUB_OUTPUT"
-              exit 0
-            fi
-            echo "::error::PATCH returned ${STATUS} but base.ref is '${NEW_BASE}', not 'staging'"
-            exit 1
-          fi
-
          # Specifically match the 422 duplicate-base/head error so
          # any OTHER PATCH failure (auth, deleted PR, etc.) still
          # surfaces as a real workflow failure.
-          BODY=$(cat "${BODY_FILE}" || true)
-          rm -f "${BODY_FILE}"
-
-          if [ "${STATUS}" = "422" ] && echo "${BODY}" | grep -qE "(pull request already exists for base branch 'staging'|already exists.*base.*staging)"; then
+          if echo "$PATCH_OUTPUT" | grep -q "pull request already exists for base branch 'staging'"; then
            echo "::notice::PR #${PR_NUMBER}: duplicate target-staging PR exists on same head — closing this main-PR as redundant."
-
-            # Close the now-redundant main-PR via Gitea REST
-            # (PATCH state=closed). Post comment explaining
-            # rationale BEFORE close so the comment lands on the
-            # PR (commenting on a closed PR works on Gitea, but
-            # historically caused notification ordering surprises).
-
-            CLOSE_BODY_FILE=$(mktemp)
-            CMT_REQ=$(jq -n '{body:"[retarget-bot] Closing — another PR on the same head branch already targets `staging`. This PR is redundant. See issue #1884 for the rationale."}')
-            set +e
-            CMT_STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
-              -X POST -d "${CMT_REQ}" \
-              -o "${CLOSE_BODY_FILE}" -w "%{http_code}" \
-              "${API}/issues/${PR_NUMBER}/comments")
-            set -e
-            if [ "${CMT_STATUS}" != "201" ]; then
-              echo "::warning::dup-close comment POST returned ${CMT_STATUS}; continuing to close anyway"
-              cat "${CLOSE_BODY_FILE}" | head -c 300 || true
-            fi
-            rm -f "${CLOSE_BODY_FILE}"
-
-            CLOSE_REQ='{"state":"closed"}'
-            CLOSE_RESP=$(mktemp)
-            set +e
-            CL_STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
-              -X PATCH -d "${CLOSE_REQ}" \
-              -o "${CLOSE_RESP}" -w "%{http_code}" \
-              "${API}/pulls/${PR_NUMBER}")
-            set -e
-            if [ "${CL_STATUS}" = "201" ] || [ "${CL_STATUS}" = "200" ]; then
-              echo "::notice::Closed PR #${PR_NUMBER} as redundant"
-              echo "outcome=closed-as-duplicate" >> "$GITHUB_OUTPUT"
-              rm -f "${CLOSE_RESP}"
-              exit 0
-            fi
-            echo "::error::Failed to close redundant PR: HTTP ${CL_STATUS}"
-            cat "${CLOSE_RESP}" | head -c 300 || true
-            rm -f "${CLOSE_RESP}"
-            exit 1
+            gh pr close "$PR_NUMBER" \
+              --repo "${{ github.repository }}" \
+              --comment "[retarget-bot] Closing — another PR on the same head branch already targets \`staging\`. This PR is redundant. See issue #1884 for the rationale."
+            echo "outcome=closed-as-duplicate" >> "$GITHUB_OUTPUT"
+            exit 0
          fi
-
-          echo "::error::Retarget PATCH failed and was NOT a duplicate-base error: HTTP ${STATUS}"
-          echo "${BODY}" | head -c 500 >&2
+          echo "::error::Retarget PATCH failed and was NOT a duplicate-base error:"
+          echo "$PATCH_OUTPUT" >&2
          exit 1

      - name: Post explainer comment
        if: steps.retarget.outputs.outcome == 'retargeted'
        env:
-          GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
-          GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
-          REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
-          set -euo pipefail
+          gh pr comment "$PR_NUMBER" \
+            --repo "${{ github.repository }}" \
+            --body "$(cat <<'BODY'
+          [retarget-bot] This PR was opened against `main` and has been retargeted to `staging` automatically.

-          API="${GITEA_HOST}/api/v1/repos/${REPO}"
-          AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json")
+          **Why:** per [SHARED_RULES rule 8](https://github.com/Molecule-AI/molecule-ai-org-template-molecule-dev/blob/main/SHARED_RULES.md), all feature work targets `staging` first; the CEO promotes `staging → main` separately.

-          # PR comments live on the issue endpoint in Gitea
-          # (PRs ARE issues — same endpoint, different sub-resources
-          # for diffs/files/etc.). The body uses jq to safely
-          # encode the multi-line markdown without shell-quote
-          # nightmares.
-          REQ=$(jq -n '{body:"[retarget-bot] This PR was opened against `main` and has been retargeted to `staging` automatically.\n\n**Why:** per [SHARED_RULES rule 8](https://git.moleculesai.app/molecule-ai/molecule-ai-org-template-molecule-dev/src/branch/main/SHARED_RULES.md), all feature work targets `staging` first; the CEO promotes `staging → main` separately.\n\n**What changed:** just the base branch — no code change. CI will re-run against `staging`. If you get merge conflicts, rebase on `staging`.\n\n**If this PR is the CEO`s staging→main promotion:** the Action skipped you (only bot-authored PRs are retargeted, head=staging is also exempted). If you see this comment on your CEO PR, that`s a bug — please tag @hongmingwang."}')
+          **What changed:** just the base branch — no code change. CI will re-run against `staging`. If you get merge conflicts, rebase on `staging`.

-          BODY_FILE=$(mktemp)
-          set +e
-          STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
-            -X POST -d "${REQ}" \
-            -o "${BODY_FILE}" -w "%{http_code}" \
-            "${API}/issues/${PR_NUMBER}/comments")
-          set -e
-
-          if [ "${STATUS}" = "201" ]; then
-            echo "::notice::Posted explainer comment on PR #${PR_NUMBER}"
-          else
-            echo "::warning::Failed to post explainer (HTTP ${STATUS}) — retarget itself succeeded"
-            cat "${BODY_FILE}" | head -c 300 || true
-          fi
-          rm -f "${BODY_FILE}"
+          **If this PR is the CEO's staging→main promotion:** the Action skipped you (only bot-authored PRs are retargeted). If you see this comment on your CEO PR, that's a bug — please tag @HongmingWang-Rabbit.
+          BODY
+          )"
@@ -12,7 +12,7 @@ name: Secret scan
 #
 #   jobs:
 #     secret-scan:
-#       uses: molecule-ai/molecule-core/.github/workflows/secret-scan.yml@staging
+#       uses: Molecule-AI/molecule-core/.github/workflows/secret-scan.yml@staging
 #
 # Pin to @staging not @main — staging is the active default branch,
 # main lags via the staging-promotion workflow. Updates ride along
@@ -131,13 +131,6 @@ backups/
 # Cloned by publish-workspace-server-image.yml so the Dockerfile's
 # replace-directive path resolves. Lives in its own repo.
 /molecule-ai-plugin-github-app-auth/
-# Tenant-image build context — populated by the workflow's
-# "Pre-clone manifest deps" step. Mirrors the public manifest, holds the
-# same content as the three /<>/ dirs above but namespaced under one
-# parent so the Docker build context is a single COPY-friendly tree.
-# Each entry is a transient working-dir, never source-of-truth, never
-# committed.
-/.tenant-bundle-deps/

 # Internal-flavored content lives in Molecule-AI/internal — NEVER in this
 # public monorepo. Migrated 2026-04-23 (CEO directive). The CI workflow
@@ -22,7 +22,7 @@ development workflow, conventions, and how to get your changes merged.

 ```bash
 # Clone the repo
-git clone https://github.com/Molecule-AI/molecule-core.git
+git clone https://git.moleculesai.app/molecule-ai/molecule-core.git
 cd molecule-core

 # Install git hooks
@@ -57,7 +57,7 @@ See `CLAUDE.md` for a full list of environment variables and their purposes.

 This repo is scoped to **code** (canvas, workspace, workspace-server, related
 infra). Public content (blog posts, marketing copy, OG images, SEO briefs,
-DevRel demos) lives in [`Molecule-AI/docs`](https://github.com/Molecule-AI/docs).
+DevRel demos) lives in [`Molecule-AI/docs`](https://git.moleculesai.app/molecule-ai/docs).
 The `Block forbidden paths` CI gate fails any PR that writes to `marketing/`
 or other removed paths — open against `Molecule-AI/docs` instead.

@@ -110,7 +110,7 @@ causing a render loop when any node position changed.

 1. **Repo-wide:** "Automatically delete head branches" is on. Once a PR merges, the branch is deleted server-side. Any subsequent `git push` to that branch fails with `remote rejected — no such branch`.

-2. **CI:** the `pr-guards` workflow (calling [molecule-ci `disable-auto-merge-on-push`](https://github.com/Molecule-AI/molecule-ci/blob/main/.github/workflows/disable-auto-merge-on-push.yml)) fires on every push to an open PR. If auto-merge was already enabled, it's disabled and a comment is posted. You must explicitly re-enable after verifying the new commit.
+2. **CI:** the `pr-guards` workflow (calling [molecule-ci `disable-auto-merge-on-push`](https://git.moleculesai.app/molecule-ai/molecule-ci/src/branch/main/.github/workflows/disable-auto-merge-on-push.yml)) fires on every push to an open PR. If auto-merge was already enabled, it's disabled and a comment is posted. You must explicitly re-enable after verifying the new commit.

 **Workflow rules that follow from the guards:**
 - Push **all** commits before running `gh pr merge --auto`.
@@ -180,9 +180,9 @@ and run CI manually.
 Code in this repo lands in molecule-core. Some related runtime artifacts
 live in their own repos:

-- [`Molecule-AI/molecule-ai-workspace-runtime`](https://github.com/Molecule-AI/molecule-ai-workspace-runtime) — Python adapter SDK (`molecule_runtime`) that runs inside containerized Molecule workspaces. Bridges Claude Code SDK / hermes / langgraph / etc. → A2A queue.
-- [`Molecule-AI/molecule-sdk-python`](https://github.com/Molecule-AI/molecule-sdk-python) — `A2AServer` + `RemoteAgentClient` for external agents that register over the public `/registry/register` flow.
-- [`Molecule-AI/molecule-mcp-claude-channel`](https://github.com/Molecule-AI/molecule-mcp-claude-channel) — Claude Code channel plugin. Bridges A2A traffic into a running Claude Code session via MCP `notifications/claude/channel`. Polling-based (no tunnel required); install with `claude --channels plugin:molecule@Molecule-AI/molecule-mcp-claude-channel`.
+- [`Molecule-AI/molecule-ai-workspace-runtime`](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-runtime) — Python adapter SDK (`molecule_runtime`) that runs inside containerized Molecule workspaces. Bridges Claude Code SDK / hermes / langgraph / etc. → A2A queue.
+- [`Molecule-AI/molecule-sdk-python`](https://git.moleculesai.app/molecule-ai/molecule-sdk-python) — `A2AServer` + `RemoteAgentClient` for external agents that register over the public `/registry/register` flow.
+- [`Molecule-AI/molecule-mcp-claude-channel`](https://git.moleculesai.app/molecule-ai/molecule-mcp-claude-channel) — Claude Code channel plugin. Bridges A2A traffic into a running Claude Code session via MCP `notifications/claude/channel`. Polling-based (no tunnel required); install with `claude --channels plugin:molecule@Molecule-AI/molecule-mcp-claude-channel`.

 When extending the **A2A surface** in molecule-core (`workspace-server/internal/handlers/a2a_proxy.go` etc.), consider whether the change has a downstream impact on the runtime SDK or the channel plugin — they're versioned independently but share the wire shape.

@@ -1,7 +1,7 @@
 <div align="center">

 <p>
-  <img src="./docs/assets/branding/molecule-icon.png" alt="Molecule AI Icon Logo" width="160" />
+  <img src="./docs/assets/branding/molecule-icon.svg" alt="Molecule AI" width="160" />
 </p>

 <p>
@@ -39,8 +39,8 @@
  <a href="./docs/agent-runtime/workspace-runtime.md"><strong>Workspace Runtime</strong></a>
 </p>

-[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/new/template?template=https://github.com/Molecule-AI/molecule-monorepo)
-[![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://github.com/Molecule-AI/molecule-monorepo)
+[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/new/template?template=https://git.moleculesai.app/molecule-ai/molecule-core)
+[![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://git.moleculesai.app/molecule-ai/molecule-core)

 </div>

@@ -53,8 +53,8 @@ Molecule AI is the most powerful way to govern an AI agent organization in produ
 It combines the parts that are usually scattered across demos, internal glue code, and framework-specific tooling into one product:

 - one org-native control plane for teams, roles, hierarchy, and lifecycle
-- one runtime layer that lets LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, and OpenClaw run side by side
-- one memory model that keeps recall, sharing, and skill evolution aligned with organizational boundaries
+- one runtime layer that lets **eight** agent runtimes — LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, **Hermes**, **Gemini CLI**, and OpenClaw — run side by side behind one workspace contract
+- one memory model that keeps recall, sharing, and skill evolution aligned with organizational boundaries (Memory v2 backed by pgvector for semantic recall)
 - one operational surface for observing, pausing, restarting, inspecting, and improving live workspaces

 Most teams can build a workflow, a strong single agent, a coding agent, or a custom multi-agent graph.
@@ -75,7 +75,7 @@ You do not wire collaboration paths by hand. Hierarchy defines the default commu

 ### 3. Runtime choice stops being a dead-end decision

-LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, and OpenClaw can all plug into the same workspace abstraction. Teams can standardize governance without forcing every group onto one runtime.
+LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, Hermes, Gemini CLI, and OpenClaw can all plug into the same workspace abstraction. Teams can standardize governance without forcing every group onto one runtime.

 ### 4. Memory is treated like infrastructure

@@ -117,6 +117,8 @@ Molecule AI is not trying to replace the frameworks below. It is the system that
 | **Claude Code** | Shipping on `main` | Real coding workflows, CLI-native continuity | Secure workspace abstraction, A2A delegation, org boundaries, shared control plane |
 | **CrewAI** | Shipping on `main` | Role-based crews | Persistent workspace identity, policy consistency, shared canvas and registry |
 | **AutoGen** | Shipping on `main` | Assistant/tool orchestration | Standardized deployment, hierarchy-aware collaboration, shared ops plane |
+| **Hermes 4** | Shipping on `main` | Hybrid reasoning, native tools, json_schema (NousResearch/hermes-agent) | Option B upstream hook, A2A bridge to OpenAI-compat API, multi-provider derivation |
+| **Gemini CLI** | Shipping on `main` | Google Gemini CLI continuity | Workspace lifecycle, A2A, hierarchy-aware collaboration, shared ops plane |
 | **OpenClaw** | Shipping on `main` | CLI-native runtime with its own session model | Workspace lifecycle, templates, activity logs, topology-aware collaboration |
 | **NemoClaw** | WIP on `feat/nemoclaw-t4-docker` | NVIDIA-oriented runtime path | Planned to join the same abstraction once merged; not yet part of `main` |

@@ -182,9 +184,10 @@ The result is not just “an agent that learns.” It is **an organization that

 ## What Ships In `main`

-### Canvas
+### Canvas (v4)

 - Next.js 15 + React Flow + Zustand
+- **warm-paper theme system** — light / dark / follow-system, SSR cookie + nonce'd boot script + ThemeProvider; terminal + code surfaces stay dark unconditionally
 - drag-to-nest team building
 - empty-state deployment + onboarding wizard
 - template palette
@@ -193,8 +196,9 @@ The result is not just “an agent that learns.” It is **an organization that

 ### Platform

-- Go/Gin control plane
-- workspace CRUD and provisioning
+- Go 1.25 / Gin control plane (80+ HTTP endpoints + Gorilla WebSocket fanout)
+- workspace CRUD and provisioning (pluggable Provisioner — Docker locally, EC2 + SSM in production)
+- **A2A response path is a typed discriminated union (RFC #2967)** — frozen dataclasses + total parser; 100% unit + adversarial fuzz coverage
 - registry and heartbeats
 - browser-safe A2A proxy
 - team expansion/collapse
@@ -204,10 +208,10 @@ The result is not just “an agent that learns.” It is **an organization that

 ### Runtime

-- unified `workspace/` image
-- adapter-driven execution
+- unified `workspace/` image; thin AMI in production (us-east-2)
+- adapter-driven execution across **8 runtimes** (Claude Code, Hermes, Gemini CLI, LangGraph, DeepAgents, CrewAI, AutoGen, OpenClaw)
 - Agent Card registration
-- awareness-backed memory integration
+- awareness-backed memory integration; **Memory v2 backed by pgvector** for semantic recall
 - plugin-mounted shared rules/skills
 - hot-reloadable local skills
 - coordinator-only delegation path
@@ -221,6 +225,21 @@ The result is not just “an agent that learns.” It is **an organization that
 - runtime tiers
 - direct workspace inspection through terminal and files

+### SaaS (via [`molecule-controlplane`](https://github.com/Molecule-AI/molecule-controlplane))
+
+- multi-tenant on AWS EC2 + Neon (per-tenant Postgres branch) + Cloudflare Tunnels (per-tenant, no public ports)
+- WorkOS AuthKit + Stripe Checkout + Customer Portal
+- AWS KMS envelope encryption (DB / Redis connection strings); AWS Secrets Manager for tenant bootstrap
+- `tenant_resources` audit table + 30-min boot-event-aware reconciler — every CF / AWS lifecycle event recorded, claim vs live state diffed
+
+### Bring your own Claude Code session (via [`molecule-mcp-claude-channel`](https://github.com/Molecule-AI/molecule-mcp-claude-channel))
+
+- Claude Code plugin that bridges Molecule A2A traffic into a local Claude Code session via MCP
+- subscribe to one or more workspaces; peer messages surface as conversation turns; replies route back through Molecule's A2A
+- no tunnel, no public endpoint — the plugin self-registers each watched workspace as `delivery_mode=poll` and long-polls `/activity?since_id=…`
+- multi-tenant friendly: one plugin install can watch workspaces across multiple Molecule tenants (`MOLECULE_PLATFORM_URLS` per-workspace)
+- install via the standard marketplace flow: `/plugin marketplace add Molecule-AI/molecule-mcp-claude-channel` → `/plugin install molecule-channel@molecule-mcp-claude-channel`
+
 ## Built For Teams That Need More Than A Demo

 Molecule AI is especially strong when you need to run:
@@ -233,24 +252,30 @@ Molecule AI is especially strong when you need to run:
 ## Architecture

 ```text
-Canvas (Next.js :3000)  <--HTTP / WS-->  Platform (Go :8080)  <---> Postgres + Redis
-         |                                          |
-         |                                          +--> Docker provisioner / bundles / templates / secrets
+Canvas (Next.js 15, warm-paper :3000)  <--HTTP / WS-->  Platform (Go 1.25 :8080)  <---> Postgres + Redis
+         |                                                           |
+         |                                                           +--> Provisioner: Docker (local) / EC2 + SSM (prod)
+         |                                                           +--> bundles · templates · secrets · KMS
         |
-         +-------------------- shows --------------------> workspaces, teams, tasks, traces, events
+         +------------------------- shows ------------------------> workspaces, teams, tasks, traces, events

-Workspace Runtime (Python image with adapters)
-  - LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / OpenClaw
-  - Agent Card + A2A server
-  - heartbeat + activity + awareness-backed memory
+Workspace Runtime (Python ≥3.11, image with adapters)
+  - 8 adapters: LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / Hermes / Gemini CLI / OpenClaw
+  - Agent Card + A2A server (typed-SSOT response path, RFC #2967)
+  - heartbeat + activity + awareness-backed memory (Memory v2 — pgvector semantic recall)
  - skills + plugins + hot reload
+
+SaaS Control Plane (molecule-controlplane, private)
+  - per-tenant EC2 + Neon (Postgres branch) + Cloudflare Tunnel
+  - WorkOS · Stripe · KMS · AWS Secrets Manager
+  - tenant_resources audit + 30-min reconciler
 ```

 ## Quick Start

 ```bash
-git clone https://github.com/Molecule-AI/molecule-monorepo.git
-cd molecule-monorepo
+git clone https://git.moleculesai.app/molecule-ai/molecule-core.git
+cd molecule-core

 cp .env.example .env
 # Defaults boot the stack locally out of the box. See .env.example for
@@ -303,7 +328,11 @@ Then open `http://localhost:3000`:

 ## Current Scope

-The current `main` branch already includes the core platform, canvas, memory model, six production adapters, skill lifecycle, and operational surfaces. Adjacent runtime work such as **NemoClaw** remains branch-level until merged, and this README keeps that distinction explicit on purpose.
+The current `main` branch ships the core platform, Canvas v4 (warm-paper themed), Memory v2 (pgvector semantic recall), the typed-SSOT A2A response path (RFC #2967), **eight production adapters** (Claude Code, Hermes, Gemini CLI, LangGraph, DeepAgents, CrewAI, AutoGen, OpenClaw), skill lifecycle, and operational surfaces.
+
+The companion private repo [`molecule-controlplane`](https://github.com/Molecule-AI/molecule-controlplane) provides the SaaS surface — multi-tenant orchestration on EC2 + Neon + Cloudflare Tunnels, KMS envelope encryption, WorkOS auth, Stripe billing, and a `tenant_resources` audit table with a 30-min reconciler.
+
+Adjacent runtime work such as **NemoClaw** remains branch-level until merged, and this README keeps that distinction explicit on purpose.

 ## License

@@ -1,7 +1,7 @@
 <div align="center">

 <p>
-  <img src="./docs/assets/branding/molecule-icon.png" alt="Molecule AI 图案 Logo" width="160" />
+  <img src="./docs/assets/branding/molecule-icon.svg" alt="Molecule AI" width="160" />
 </p>

 <p>
@@ -38,8 +38,8 @@
  <a href="./docs/agent-runtime/workspace-runtime.md"><strong>Workspace Runtime</strong></a>
 </p>

-[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/new/template?template=https://github.com/Molecule-AI/molecule-core)
-[![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://github.com/Molecule-AI/molecule-core)
+[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/new/template?template=https://git.moleculesai.app/molecule-ai/molecule-core)
+[![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://git.moleculesai.app/molecule-ai/molecule-core)

 </div>

@@ -52,8 +52,8 @@ Molecule AI 是目前最强的 AI Agent 组织治理方案之一，用来把 age
 它把过去分散在 demo、内部胶水代码和各类 framework 私有工具里的关键能力，收敛成一个产品：

 - 一套组织原生 control plane，管理团队、角色、层级和生命周期
- 一套 runtime abstraction，让 LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、OpenClaw 并存运行
- 一套与组织边界对齐的 memory 模型，把 recall、sharing 和 skill evolution 放进同一体系
+- 一套 runtime abstraction，让 **8 个** agent runtime —— LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、**Hermes**、**Gemini CLI**、OpenClaw —— 共用一套 workspace 契约
+- 一套与组织边界对齐的 memory 模型，把 recall、sharing 和 skill evolution 放进同一体系（Memory v2 由 pgvector 支撑语义召回）
 - 一套面向线上 workspace 的运维面，统一完成观测、暂停、重启、检查和持续改进

 今天很多团队能做好 workflow、单 agent、coding agent，或者自定义 multi-agent graph 中的一种。
@@ -74,7 +74,7 @@ Molecule AI 填的就是这个空白。

 ### 3. Runtime 选择不再是死路

-LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、OpenClaw 都可以挂到同一个 workspace abstraction 下。团队可以统一治理方式，而不必统一到底层 runtime。
+LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、Hermes、Gemini CLI、OpenClaw 都可以挂到同一个 workspace abstraction 下。团队可以统一治理方式，而不必统一到底层 runtime。

 ### 4. Memory 被当成基础设施来做

@@ -116,6 +116,8 @@ Molecule AI 并不是要替代下面这些 framework，而是把它们纳入更
 | **Claude Code** | `main` 已支持 | 真实编码工作流、CLI-native continuity | 安全 workspace 抽象、A2A delegation、组织边界、共享 control plane |
 | **CrewAI** | `main` 已支持 | 角色型 crew 模式清晰 | 持久 workspace 身份、统一策略、共享 Canvas 和 registry |
 | **AutoGen** | `main` 已支持 | assistant/tool orchestration | 统一部署、层级协作、共享运维平面 |
+| **Hermes 4** | `main` 已支持 | 混合推理、原生工具调用、json_schema 输出（NousResearch/hermes-agent） | Option B 上游 hook、A2A 桥接 OpenAI 兼容 API、多 provider 自动派生 |
+| **Gemini CLI** | `main` 已支持 | Google Gemini CLI 持续会话 | workspace 生命周期、A2A、层级感知协作、共享运维平面 |
 | **OpenClaw** | `main` 已支持 | CLI-native runtime，自有 session 模型 | workspace 生命周期、templates、activity logs、拓扑感知协作 |
 | **NemoClaw** | `feat/nemoclaw-t4-docker` 分支 WIP | NVIDIA 方向 runtime 路线 | 计划并入同一抽象层，但当前还不是 `main` 已合并能力 |

@@ -181,9 +183,10 @@ Molecule AI 并不是要替代下面这些 framework，而是把它们纳入更

 ## `main` 分支已经具备什么

-### Canvas
+### Canvas（v4）

 - Next.js 15 + React Flow + Zustand
+- **warm-paper 主题系统** —— light / dark / 跟随系统；SSR cookie + nonce'd boot 脚本 + ThemeProvider；终端与代码面板始终保持深色
 - drag-to-nest 团队构建
 - empty state + onboarding wizard
 - template palette
@@ -192,8 +195,9 @@ Molecule AI 并不是要替代下面这些 framework，而是把它们纳入更

 ### Platform

- Go/Gin control plane
- workspace CRUD 和 provisioning
+- Go 1.25 / Gin control plane（80+ HTTP 端点 + Gorilla WebSocket fanout）
+- workspace CRUD 和 provisioning（可插拔 Provisioner —— 本地 Docker、生产 EC2 + SSM）
+- **A2A 响应路径已收敛为类型化的判别联合（RFC #2967）** —— 冻结 dataclass + 全量 parser；100% 单元测试 + 对抗性 fuzz 覆盖
 - registry 与 heartbeat
 - 浏览器安全的 A2A proxy
 - team expansion/collapse
@@ -203,10 +207,10 @@ Molecule AI 并不是要替代下面这些 framework，而是把它们纳入更

 ### Runtime

- 统一 `workspace/` 镜像
- adapter 驱动执行
+- 统一 `workspace/` 镜像；生产环境采用 thin AMI（us-east-2）
+- adapter 驱动执行，覆盖 **8 个 runtime**（Claude Code、Hermes、Gemini CLI、LangGraph、DeepAgents、CrewAI、AutoGen、OpenClaw）
 - Agent Card 注册
- awareness-backed memory
+- awareness-backed memory；**Memory v2 由 pgvector 支撑**语义召回
 - plugin 挂载共享 rules/skills
 - 本地 skills 热加载
 - coordinator-only delegation 路径
@@ -220,6 +224,21 @@ Molecule AI 并不是要替代下面这些 framework，而是把它们纳入更
 - runtime tiers
 - 终端与文件层面的 workspace 直接排障

+### SaaS（由 [`molecule-controlplane`](https://github.com/Molecule-AI/molecule-controlplane) 提供）
+
+- 多租户运行在 AWS EC2 + Neon（每租户一个 Postgres branch）+ Cloudflare Tunnels（每租户一条隧道，对外不开任何端口）
+- WorkOS AuthKit + Stripe Checkout + Customer Portal
+- AWS KMS 信封加密（DB / Redis 连接串）；AWS Secrets Manager 负责租户 bootstrap
+- `tenant_resources` 审计表 + 30 分钟 boot-event-aware reconciler —— 每个 CF / AWS lifecycle 事件都有记录，每 30 分钟比对 claim 与实际状态
+
+### 在 Claude Code 里直接接入（由 [`molecule-mcp-claude-channel`](https://github.com/Molecule-AI/molecule-mcp-claude-channel) 提供）
+
+- 把 Molecule A2A 流量桥接到本地 Claude Code 会话的 MCP 插件
+- 订阅一个或多个 workspace；peer 的消息会以 user-turn 出现，回复会经 Molecule A2A 路由出去
+- 无需公网隧道、无需公开端点 —— 插件启动时自动把每个 watched workspace 注册成 `delivery_mode=poll`，长轮询 `/activity?since_id=…`
+- 多租户友好：单次安装即可同时 watch 跨多个 Molecule 租户的 workspace（`MOLECULE_PLATFORM_URLS` 按 workspace 配置）
+- 通过标准 marketplace 流程安装：`/plugin marketplace add Molecule-AI/molecule-mcp-claude-channel` → `/plugin install molecule-channel@molecule-mcp-claude-channel`
+
 ## 适合什么团队

 Molecule AI 特别适合下面这些场景：
@@ -232,23 +251,29 @@ Molecule AI 特别适合下面这些场景：
 ## 架构总览

 ```text
-Canvas (Next.js :3000)  <--HTTP / WS-->  Platform (Go :8080)  <---> Postgres + Redis
-         |                                          |
-         |                                          +--> Docker provisioner / bundles / templates / secrets
+Canvas (Next.js 15, warm-paper :3000)  <--HTTP / WS-->  Platform (Go 1.25 :8080)  <---> Postgres + Redis
+         |                                                           |
+         |                                                           +--> Provisioner: Docker (本地) / EC2 + SSM (生产)
+         |                                                           +--> bundles · templates · secrets · KMS
         |
-         +-------------------- 展示 --------------------> workspaces, teams, tasks, traces, events
+         +------------------------- 展示 ------------------------> workspaces, teams, tasks, traces, events

-Workspace Runtime (Python image with adapters)
-  - LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / OpenClaw
-  - Agent Card + A2A server
-  - heartbeat + activity + awareness-backed memory
+Workspace Runtime (Python ≥3.11，含 adapter 集合的镜像)
+  - 8 个 adapter: LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / Hermes / Gemini CLI / OpenClaw
+  - Agent Card + A2A server（typed-SSOT 响应路径，RFC #2967）
+  - heartbeat + activity + awareness-backed memory（Memory v2 —— pgvector 语义召回）
  - skills + plugins + hot reload
+
+SaaS Control Plane (molecule-controlplane，私有)
+  - 每租户 EC2 + Neon (Postgres branch) + Cloudflare Tunnel
+  - WorkOS · Stripe · KMS · AWS Secrets Manager
+  - tenant_resources 审计 + 30 分钟 reconciler
 ```

 ## 快速开始

 ```bash
-git clone https://github.com/Molecule-AI/molecule-core.git
+git clone https://git.moleculesai.app/molecule-ai/molecule-core.git
 cd molecule-core

 cp .env.example .env
@@ -296,7 +321,11 @@ npm run dev

 ## 当前范围说明

-当前 `main` 已经包含核心平台、Canvas、memory model、6 个正式 adapter、skill lifecycle 和主要运维面。像 **NemoClaw** 这样的相邻 runtime 路线仍然属于分支级工作，只有合并后才会进入正式支持列表，这里会明确区分。
+当前 `main` 已经包含核心平台、Canvas v4（warm-paper 主题）、Memory v2（pgvector 语义召回）、typed-SSOT A2A 响应路径（RFC #2967）、**8 个正式 adapter**（Claude Code、Hermes、Gemini CLI、LangGraph、DeepAgents、CrewAI、AutoGen、OpenClaw）、skill lifecycle，以及主要运维面。
+
+配套的私有仓库 [`molecule-controlplane`](https://github.com/Molecule-AI/molecule-controlplane) 提供 SaaS 层 —— 多租户编排（EC2 + Neon + Cloudflare Tunnels）、KMS 信封加密、WorkOS 鉴权、Stripe 计费，以及 `tenant_resources` 审计表加 30 分钟 reconciler。
+
+像 **NemoClaw** 这样的相邻 runtime 路线仍然属于分支级工作，只有合并后才会进入正式支持列表，这里会明确区分。

 ## License

@@ -3,7 +3,6 @@ import { cookies, headers } from "next/headers";
 import "./globals.css";
 import { AuthGate } from "@/components/AuthGate";
 import { CookieConsent } from "@/components/CookieConsent";
-import { PurchaseSuccessModal } from "@/components/PurchaseSuccessModal";
 import { ThemeProvider } from "@/lib/theme-provider";
 import {
  THEME_COOKIE,
@@ -87,12 +86,6 @@ export default async function RootLayout({
              vercel preview URL, apex) pass through unchanged. */}
          <AuthGate>{children}</AuthGate>
          <CookieConsent />
-          {/* Demo Mock #1: post-purchase success toast. Mounted at the
-              layout level so it persists across page state transitions
-              (loading → hydrated → error) without being unmounted and
-              losing its open-state. Reads ?purchase_success=1 from the
-              URL on first paint, then strips the param. */}
-          <PurchaseSuccessModal />
        </ThemeProvider>
      </body>
    </html>
@@ -1,10 +1,9 @@
 'use client';

-import { useEffect, useMemo, useCallback, useRef } from "react";
+import { useEffect, useMemo, useCallback } from "react";
 import { type Edge, MarkerType } from "@xyflow/react";
 import { api } from "@/lib/api";
 import { useCanvasStore } from "@/store/canvas";
-import { useSocketEvent } from "@/hooks/useSocketEvent";
 import type { ActivityEntry } from "@/types/activity";

 // ── Constants ─────────────────────────────────────────────────────────────────
@@ -12,6 +11,9 @@ import type { ActivityEntry } from "@/types/activity";
 /** 60-minute look-back window for delegation activity */
 export const A2A_WINDOW_MS = 60 * 60 * 1000;

+/** Polling interval — refresh edges every 60 seconds */
+export const A2A_POLL_MS = 60 * 1_000;
+
 /** Threshold for "hot" edges: < 5 minutes → animated + violet stroke */
 export const A2A_HOT_MS = 5 * 60 * 1_000;

@@ -129,20 +131,6 @@ export function buildA2AEdges(
 * `a2aEdges`. Canvas.tsx merges these with topology edges and passes the
 * combined list to ReactFlow.
 *
- * Update shape (issue #61 Stage 2, replaces the 60s polling loop):
- *  - On mount (when showA2AEdges): one HTTP fan-out per visible workspace
- *    (delegation rows, 60-min window). Bootstraps the local row buffer.
- *  - Steady state: subscribes to ACTIVITY_LOGGED via useSocketEvent.
- *    Each delegation event from a visible workspace is appended to the
- *    buffer; edges are re-derived via the existing buildA2AEdges helper.
- *  - showA2AEdges toggle off: clears edges + buffer.
- *  - Visible-ID-set change: re-bootstraps so a freshly-shown workspace
- *    backfills its 60-min history (existing visibleIdsKey selector
- *    behaviour preserved — that's the 2026-05-04 render-loop fix).
- *
- * No interval poll. The singleton ReconnectingSocket already owns
- * reconnect / backoff / health-check; useSocketEvent inherits those.
- *
 * Mount this inside CanvasInner (no ReactFlow hook dependency).
 */
 export function A2ATopologyOverlay() {
@@ -169,9 +157,7 @@ export function A2ATopologyOverlay() {
  // the symptom of this re-render storm.
  //
  // The fix is purely the dependency-stability change here; the fetch
-  // logic is unchanged. Post-#61 the polling-driven fetch is gone, but
-  // the visibleIdsKey gate is still required so a peer-discovery write
-  // doesn't trigger a wasteful re-bootstrap.
+  // logic is unchanged.
  const visibleIdsKey = useCanvasStore((s) =>
    s.nodes
      .filter((n) => !n.hidden)
@@ -185,42 +171,16 @@ export function A2ATopologyOverlay() {
    [visibleIdsKey]
  );

-  // Local rolling buffer of delegation rows. Pruned by A2A_WINDOW_MS on
-  // each rebuild so a long-lived session doesn't accumulate unbounded
-  // history. The buffer's high-water mark is approximately:
-  //    visibleIds.length × bootstrap-fetch-limit (500) + WS arrivals
-  // Real-world ceiling: ~3000 entries at the 60-min boundary, all of
-  // which buildA2AEdges aggregates into at most N² edges.
-  const bufferRef = useRef<ActivityEntry[]>([]);
-  // visibleIdsRef gives the WS handler the latest visible-ID set without
-  // re-subscribing on every render. The bus listener is registered
-  // exactly once per mount; subscriber-side filtering reads from this ref.
-  const visibleIdsRef = useRef(visibleIds);
-  visibleIdsRef.current = visibleIds;
-
-  // Re-derive overlay edges from the current buffer + push to store.
-  // Prunes by A2A_WINDOW_MS first so memory stays bounded across long
-  // sessions and the aggregation cost stays O(window-size).
-  const recomputeAndPush = useCallback(() => {
-    const cutoff = Date.now() - A2A_WINDOW_MS;
-    bufferRef.current = bufferRef.current.filter(
-      (r) => new Date(r.created_at).getTime() > cutoff
-    );
-    setA2AEdges(buildA2AEdges(bufferRef.current));
-  }, [setA2AEdges]);
-
-  // Bootstrap fan-out — one HTTP per visible workspace. Replaces the
-  // 60s polling loop entirely. Race-aware: any WS arrivals that landed
-  // in the buffer DURING the fetch (between the await and resume) are
-  // preserved by id-dedup-with-fetched-first ordering.
-  const bootstrap = useCallback(async () => {
+  // Fetch delegation activity for all visible workspaces and rebuild overlay edges.
+  const fetchAndUpdate = useCallback(async () => {
    if (visibleIds.length === 0) {
-      bufferRef.current = [];
      setA2AEdges([]);
      return;
    }
    try {
-      const fetchedRows = (
+      // Fan-out — one request per visible workspace.
+      // Per-request failures are swallowed so one broken workspace doesn't blank the overlay.
+      const allRows = (
        await Promise.all(
          visibleIds.map((id) =>
            api
@@ -232,76 +192,24 @@ export function A2ATopologyOverlay() {
        )
      ).flat();

-      // Merge: fetched rows first, then any in-flight WS arrivals that
-      // accumulated during the await. Dedup by id so rows that appear
-      // in both paths are not double-counted in the aggregation.
-      const merged = [...fetchedRows, ...bufferRef.current];
-      const seen = new Set<string>();
-      bufferRef.current = merged.filter((r) => {
-        if (seen.has(r.id)) return false;
-        seen.add(r.id);
-        return true;
-      });
-      recomputeAndPush();
+      setA2AEdges(buildA2AEdges(allRows));
    } catch {
      // Overlay failure is non-critical — canvas remains functional
    }
-  }, [visibleIds, setA2AEdges, recomputeAndPush]);
+  }, [visibleIds, setA2AEdges]);

  useEffect(() => {
    if (!showA2AEdges) {
-      // Clear edges + buffer immediately when toggled off
-      bufferRef.current = [];
+      // Clear edges immediately when toggled off
      setA2AEdges([]);
      return;
    }
-    void bootstrap();
-  }, [showA2AEdges, bootstrap, setA2AEdges]);

-  // Live-update path. Filters server-side ACTIVITY_LOGGED events down
-  // to delegation initiations from visible workspaces and appends each
-  // into the rolling buffer, re-deriving edges via buildA2AEdges.
-  //
-  // Only `method === "delegate"` rows count — the same filter
-  // buildA2AEdges applies — so delegate_result rows arriving over the
-  // wire don't double-count.
-  useSocketEvent((msg) => {
-    if (!showA2AEdges) return;
-    if (msg.event !== "ACTIVITY_LOGGED") return;
-
-    const p = (msg.payload || {}) as Record<string, unknown>;
-    if (p.activity_type !== "delegation") return;
-    if (p.method !== "delegate") return;
-
-    const wsId = msg.workspace_id;
-    if (!visibleIdsRef.current.includes(wsId)) return;
-
-    // Synthesise an ActivityEntry from the WS payload so buildA2AEdges
-    // (which the bootstrap path also feeds) handles it identically.
-    const entry: ActivityEntry = {
-      id:
-        (p.id as string) ||
-        `ws-push-${msg.timestamp || Date.now()}-${wsId}`,
-      workspace_id: wsId,
-      activity_type: "delegation",
-      source_id: (p.source_id as string | null) ?? null,
-      target_id: (p.target_id as string | null) ?? null,
-      method: "delegate",
-      summary: (p.summary as string | null) ?? null,
-      request_body: null,
-      response_body: null,
-      duration_ms: (p.duration_ms as number | null) ?? null,
-      status: (p.status as string) || "ok",
-      error_detail: null,
-      created_at:
-        (p.created_at as string) ||
-        msg.timestamp ||
-        new Date().toISOString(),
-    };
-
-    bufferRef.current = [...bufferRef.current, entry];
-    recomputeAndPush();
-  });
+    // Initial fetch, then poll every 60 s
+    void fetchAndUpdate();
+    const timer = setInterval(() => void fetchAndUpdate(), A2A_POLL_MS);
+    return () => clearInterval(timer);
+  }, [showA2AEdges, fetchAndUpdate, setA2AEdges]);

  // Pure side-effect — renders nothing
  return null;
@@ -3,7 +3,6 @@
 import { useState, useEffect, useCallback, useRef } from "react";
 import { useCanvasStore } from "@/store/canvas";
 import { api } from "@/lib/api";
-import { useSocketEvent } from "@/hooks/useSocketEvent";
 import { COMM_TYPE_LABELS } from "@/lib/design-tokens";

 interface Communication {
@@ -19,71 +18,32 @@ interface Communication {
  durationMs: number | null;
 }

-/** Workspace-server `ACTIVITY_LOGGED` payload shape. Pulled out so the
- *  WS handler below has a typed view of the same fields the HTTP
- *  bootstrap consumes — drift between the two paths is a class of bug
- *  AgentCommsPanel hit historically. */
-interface ActivityLoggedPayload {
-  id?: string;
-  activity_type?: string;
-  source_id?: string | null;
-  target_id?: string | null;
-  workspace_id?: string;
-  summary?: string | null;
-  status?: string;
-  duration_ms?: number | null;
-  created_at?: string;
-}
-
-/** Fan-out cap for the bootstrap HTTP fetch on mount / on visibility
- *  re-open. Kept at 3 (carried over from the 2026-05-04 fix) so a
- *  freshly-mounted overlay on a 15-workspace tenant only spends 3
- *  round-trips bootstrapping. Live updates after that arrive via the
- *  WS subscription below — no polling, no fan-out to maintain. */
-const BOOTSTRAP_FAN_OUT_CAP = 3;
-
-/** Cap on the rendered list. Bootstrap + every WS push prepends, the
- *  list is sliced to this size after each update. Mirrors the prior
- *  polling-loop behaviour. */
-const COMMS_RENDER_CAP = 20;
-
 /**
 * Overlay showing recent A2A communications between workspaces.
- *
- * Update shape (issue #61 Stage 1, replaces the 30s polling loop):
- *  - On mount (when visible): one HTTP bootstrap per online workspace,
- *    capped at BOOTSTRAP_FAN_OUT_CAP. Yields the initial recent-comms
- *    window without waiting for live events.
- *  - Steady state: subscribes to ACTIVITY_LOGGED via useSocketEvent.
- *    Each event with a matching activity_type from a visible online
- *    workspace gets synthesised into a Communication and prepended.
- *  - Visibility re-open: re-bootstraps so the user sees the freshest
- *    window even if WS was idle while collapsed.
- *
- * No interval poll. The singleton ReconnectingSocket in `store/socket.ts`
- * already owns reconnect/backoff/health-check, and `useSocketEvent`
- * inherits those guarantees. If WS is genuinely unhealthy, the overlay
- * shows the bootstrap snapshot until the next visibility re-open or
- * the next WS reconnect (which fires its own rehydrate burst).
+ * Renders as a floating log panel that auto-updates.
 */
 export function CommunicationOverlay() {
  const [comms, setComms] = useState<Communication[]>([]);
  const [visible, setVisible] = useState(true);
  const selectedNodeId = useCanvasStore((s) => s.selectedNodeId);
  const nodes = useCanvasStore((s) => s.nodes);
-  // nodesRef gives the WS handler current node-name resolution without
-  // re-subscribing on every node-list change. The bus listener is
-  // registered exactly once per mount; subscriber-side filtering reads
-  // the latest value via this ref.
  const nodesRef = useRef(nodes);
  nodesRef.current = nodes;

-  const bootstrapComms = useCallback(async () => {
+  const fetchComms = useCallback(async () => {
    try {
+      // Fan-out cap: each polled workspace = 1 round-trip. The platform
+      // rate limits at 600 req/min/IP; combined with heartbeats + other
+      // canvas polling, every workspace polled here costs ~2 req/min
+      // (1 every 30s × 1 per workspace). Capping at 3 keeps this
+      // overlay's footprint at 6 req/min worst case — well under
+      // budget even with 8+ workspaces visible. Caught 2026-05-04 when
+      // a user with 8+ workspaces (Design Director + 6 sub-agents +
+      // 3 standalones) saw sustained 429s in canvas console.
      const onlineNodes = nodesRef.current.filter((n) => n.data.status === "online");
      const allComms: Communication[] = [];

-      for (const node of onlineNodes.slice(0, BOOTSTRAP_FAN_OUT_CAP)) {
+      for (const node of onlineNodes.slice(0, 3)) {
        try {
          const activities = await api.get<Array<{
            id: string;
@@ -99,8 +59,8 @@ export function CommunicationOverlay() {

          for (const a of activities) {
            if (a.activity_type === "a2a_send" || a.activity_type === "a2a_receive") {
-              const sourceNode = nodesRef.current.find((n) => n.id === (a.source_id || a.workspace_id));
-              const targetNode = nodesRef.current.find((n) => n.id === (a.target_id || ""));
+              const sourceNode = nodes.find((n) => n.id === (a.source_id || a.workspace_id));
+              const targetNode = nodes.find((n) => n.id === (a.target_id || ""));
              allComms.push({
                id: a.id,
                sourceId: a.source_id || a.workspace_id,
@@ -116,12 +76,11 @@ export function CommunicationOverlay() {
            }
          }
        } catch {
-          // Per-workspace failures must not blank the panel — the same
-          // robustness the polling version had.
+          // Skip workspaces that fail
        }
      }

-      // Newest-first with id-dedup, capped at COMMS_RENDER_CAP.
+      // Sort by timestamp, newest first, dedupe
      const seen = new Set<string>();
      const sorted = allComms
        .sort((a, b) => b.timestamp.localeCompare(a.timestamp))
@@ -130,78 +89,29 @@ export function CommunicationOverlay() {
          seen.add(c.id);
          return true;
        })
-        .slice(0, COMMS_RENDER_CAP);
+        .slice(0, 20);

      setComms(sorted);
    } catch {
-      // Bootstrap failure is non-blocking — the WS subscription below
-      // will populate the panel as live events arrive.
+      // Silently handle API errors
    }
  }, []);

-  // Bootstrap once on mount + every time the user re-opens after a
-  // collapse. Closed-panel state intentionally drops live updates so
-  // the panel doesn't churn invisible state — the next open reloads.
  useEffect(() => {
+    // Gate polling on visibility — when the user collapses the overlay
+    // the data isn't being read, so the per-workspace fan-out becomes
+    // pure rate-limit overhead. Pre-fix this overlay polled regardless
+    // of whether the panel was shown, costing ~36 req/min from a
+    // hidden surface.
    if (!visible) return;
-    bootstrapComms();
-  }, [bootstrapComms, visible]);
-
-  // Live-update path. Filters server-side ACTIVITY_LOGGED events down
-  // to the comm-overlay-relevant subset and prepends each into the
-  // rendered list with the same dedup the bootstrap path uses.
-  //
-  // Scope guard: ignore events for workspaces not in the visible online
-  // set, so a user collapsing one workspace doesn't see its comms
-  // continue to scroll in. Same shape the bootstrap path applies.
-  useSocketEvent((msg) => {
-    if (!visible) return;
-    if (msg.event !== "ACTIVITY_LOGGED") return;
-
-    const p = (msg.payload || {}) as ActivityLoggedPayload;
-    const type = p.activity_type;
-    if (type !== "a2a_send" && type !== "a2a_receive" && type !== "task_update") return;
-
-    const wsId = msg.workspace_id;
-    const onlineSet = new Set(
-      nodesRef.current.filter((n) => n.data.status === "online").map((n) => n.id),
-    );
-    if (!onlineSet.has(wsId)) return;
-
-    const sourceId = p.source_id || wsId;
-    const targetId = p.target_id || "";
-    const sourceNode = nodesRef.current.find((n) => n.id === sourceId);
-    const targetNode = nodesRef.current.find((n) => n.id === targetId);
-
-    const incoming: Communication = {
-      id: p.id || `${msg.timestamp || Date.now()}:${sourceId}:${targetId}`,
-      sourceId,
-      targetId,
-      sourceName: sourceNode?.data.name || "Unknown",
-      targetName: targetNode?.data.name || "Unknown",
-      type: type as Communication["type"],
-      summary: p.summary || "",
-      status: p.status || "ok",
-      timestamp: p.created_at || msg.timestamp || new Date().toISOString(),
-      durationMs: p.duration_ms ?? null,
-    };
-
-    setComms((prev) => {
-      // Prepend, dedup by id, re-cap. Functional setState is necessary
-      // because two ACTIVITY_LOGGED events arriving in the same React
-      // batch would otherwise read a stale `comms` from the closure.
-      const seen = new Set<string>();
-      const merged = [incoming, ...prev]
-        .sort((a, b) => b.timestamp.localeCompare(a.timestamp))
-        .filter((c) => {
-          if (seen.has(c.id)) return false;
-          seen.add(c.id);
-          return true;
-        })
-        .slice(0, COMMS_RENDER_CAP);
-      return merged;
-    });
-  });
+    fetchComms();
+    // 30s cadence (was 10s). At 3-workspace fan-out that's 6 req/min
+    // worst case from this overlay. Combined with heartbeats (~30/min)
+    // and other canvas polling, leaves ample headroom under the 600/
+    // min/IP server-side rate limit even at 8+ workspace tenants.
+    const interval = setInterval(fetchComms, 30000);
+    return () => clearInterval(interval);
+  }, [fetchComms, visible]);

  if (!visible || comms.length === 0) {
    return (
@@ -1,175 +0,0 @@
-"use client";
-
-/**
- * PurchaseSuccessModal — demo-only post-purchase confirmation.
- *
- * Mounted on the canvas root (`app/page.tsx`). On first paint it inspects
- * `?purchase_success=1[&item=<name>]` on the current URL. If present, it
- * renders a centred modal styled after `ConfirmDialog`, schedules a 5s
- * auto-dismiss, and rewrites the URL via `history.replaceState` to drop
- * the params so a refresh after dismiss does NOT re-show the modal.
- *
- * Mock for the funding demo — there is no real billing surface behind
- * this. The marketplace "Purchase" button on the landing page redirects
- * here with the params; this modal is the only thing the user sees of
- * the "transaction".
- *
- * Styling matches the warm-paper @theme tokens (surface-sunken / line /
- * ink / good) so it tracks light + dark without per-mode overrides.
- */
-
-import { useEffect, useRef, useState } from "react";
-import { createPortal } from "react-dom";
-
-const AUTO_DISMISS_MS = 5000;
-
-function readPurchaseParams(): { open: boolean; item: string | null } {
-  if (typeof window === "undefined") return { open: false, item: null };
-  const sp = new URLSearchParams(window.location.search);
-  const flag = sp.get("purchase_success");
-  if (flag !== "1" && flag !== "true") return { open: false, item: null };
-  return { open: true, item: sp.get("item") };
-}
-
-function stripPurchaseParams() {
-  if (typeof window === "undefined") return;
-  const url = new URL(window.location.href);
-  url.searchParams.delete("purchase_success");
-  url.searchParams.delete("item");
-  // replaceState (not pushState) so back-button doesn't return to the
-  // pre-strip URL and re-trigger the modal.
-  window.history.replaceState({}, "", url.toString());
-}
-
-export function PurchaseSuccessModal() {
-  const [open, setOpen] = useState(false);
-  const [item, setItem] = useState<string | null>(null);
-  const [mounted, setMounted] = useState(false);
-  const dialogRef = useRef<HTMLDivElement>(null);
-
-  // Read the URL params once on mount. We don't subscribe to navigation —
-  // this modal is a one-shot for the demo redirect, not a persistent
-  // listener.
-  useEffect(() => {
-    setMounted(true);
-    const { open: shouldOpen, item: itemName } = readPurchaseParams();
-    if (shouldOpen) {
-      setOpen(true);
-      setItem(itemName);
-      // Clean the URL immediately so a refresh after the modal is closed
-      // (or even while it's still open) does NOT re-trigger it.
-      stripPurchaseParams();
-    }
-  }, []);
-
-  // Auto-dismiss timer + Escape handler.
-  useEffect(() => {
-    if (!open) return;
-    const t = window.setTimeout(() => setOpen(false), AUTO_DISMISS_MS);
-    const onKey = (e: KeyboardEvent) => {
-      if (e.key === "Escape") setOpen(false);
-    };
-    window.addEventListener("keydown", onKey);
-    // Focus the close button so keyboard users land on it after redirect.
-    const raf = requestAnimationFrame(() => {
-      dialogRef.current?.querySelector<HTMLButtonElement>("button")?.focus();
-    });
-    return () => {
-      window.clearTimeout(t);
-      window.removeEventListener("keydown", onKey);
-      cancelAnimationFrame(raf);
-    };
-  }, [open]);
-
-  if (!open || !mounted) return null;
-
-  const itemLabel = item ? decodeURIComponent(item) : "Your new agent";
-
-  return createPortal(
-    <div
-      className="fixed inset-0 z-[9999] flex items-center justify-center"
-      data-testid="purchase-success-modal"
-    >
-      {/* Backdrop — click closes, matches ConfirmDialog backdrop. */}
-      <div
-        className="absolute inset-0 bg-black/60 backdrop-blur-sm"
-        onClick={() => setOpen(false)}
-        aria-hidden="true"
-      />
-
-      <div
-        ref={dialogRef}
-        role="dialog"
-        aria-modal="true"
-        aria-labelledby="purchase-success-title"
-        className="relative bg-surface-sunken border border-line rounded-xl shadow-2xl shadow-black/50 max-w-[420px] w-full mx-4 overflow-hidden"
-      >
-        <div className="px-6 pt-6 pb-4">
-          <div className="flex items-start gap-4">
-            {/* Success glyph — uses --color-good so it tracks the theme.
-                Inline SVG over an emoji so it stays readable + on-brand
-                in both light and dark. */}
-            <div
-              className="flex h-10 w-10 flex-shrink-0 items-center justify-center rounded-full"
-              style={{
-                background:
-                  "color-mix(in srgb, var(--color-good) 15%, transparent)",
-                color: "var(--color-good)",
-              }}
-            >
-              <svg
-                width="22"
-                height="22"
-                viewBox="0 0 24 24"
-                fill="none"
-                aria-hidden="true"
-              >
-                <circle
-                  cx="12"
-                  cy="12"
-                  r="10"
-                  stroke="currentColor"
-                  strokeWidth="1.5"
-                />
-                <path
-                  d="M7.5 12.5L10.5 15.5L16.5 9.5"
-                  stroke="currentColor"
-                  strokeWidth="1.8"
-                  strokeLinecap="round"
-                  strokeLinejoin="round"
-                />
-              </svg>
-            </div>
-            <div className="flex-1">
-              <h3
-                id="purchase-success-title"
-                className="text-base font-semibold text-ink"
-              >
-                Purchase successful
-              </h3>
-              <p className="mt-1.5 text-[13px] leading-relaxed text-ink-mid">
-                <span className="font-medium text-ink">{itemLabel}</span> has
-                been added to your workspace. Provisioning starts in the
-                background — you can keep working while it spins up.
-              </p>
-            </div>
-          </div>
-        </div>
-
-        <div className="flex items-center justify-between gap-3 px-6 py-3 border-t border-line bg-surface/50">
-          <span className="font-mono text-[10.5px] uppercase tracking-[0.12em] text-ink-soft">
-            auto-dismiss · {AUTO_DISMISS_MS / 1000}s
-          </span>
-          <button
-            type="button"
-            onClick={() => setOpen(false)}
-            className="px-3.5 py-1.5 text-[13px] rounded-lg bg-accent hover:bg-accent-strong text-white transition-colors focus:outline-none focus-visible:ring-2 focus-visible:ring-offset-2 focus-visible:ring-offset-surface-sunken focus-visible:ring-accent/60"
-          >
-            Close
-          </button>
-        </div>
-      </div>
-    </div>,
-    document.body,
-  );
-}
@@ -41,10 +41,6 @@ vi.mock("@/store/canvas", () => ({
 // ── Imports (after mocks) ─────────────────────────────────────────────────────

 import { api } from "@/lib/api";
-import {
-  emitSocketEvent,
-  _resetSocketEventListenersForTests,
-} from "@/store/socket-events";
 import {
  buildA2AEdges,
  formatA2ARelativeTime,
@@ -346,151 +342,6 @@ describe("A2ATopologyOverlay component", () => {
    expect(mockGet.mock.calls.length).toBe(callsAfterMount);
  });

-  // ── #61 Stage 2: ACTIVITY_LOGGED subscription tests ────────────────────────
-  //
-  // Pin the post-#61 behaviour: WS push for delegation contributes to
-  // the overlay's edge buffer with NO additional HTTP fetch. Same shape
-  // as Stage 1 (CommunicationOverlay).
-
-  describe("#61 stage 2 — ACTIVITY_LOGGED subscription", () => {
-    beforeEach(() => {
-      _resetSocketEventListenersForTests();
-    });
-    afterEach(() => {
-      _resetSocketEventListenersForTests();
-    });
-
-    function emitDelegation(overrides: {
-      workspaceId?: string;
-      sourceId?: string;
-      targetId?: string;
-      method?: string;
-      activityType?: string;
-    } = {}) {
-      // Use Date.now() (real time, fake-timer-frozen) rather than the
-      // hardcoded NOW constant — buildA2AEdges prunes by Date.now() -
-      // A2A_WINDOW_MS, so a row dated against the wrong epoch silently
-      // falls outside the window and the test fails for a confusing
-      // reason ("edges array empty" vs "filter dropped my row").
-      const realNow = Date.now();
-      emitSocketEvent({
-        event: "ACTIVITY_LOGGED",
-        workspace_id: overrides.workspaceId ?? "ws-a",
-        timestamp: new Date(realNow).toISOString(),
-        payload: {
-          id: `act-${Math.random().toString(36).slice(2)}`,
-          activity_type: overrides.activityType ?? "delegation",
-          method: overrides.method ?? "delegate",
-          source_id: overrides.sourceId ?? "ws-a",
-          target_id: overrides.targetId ?? "ws-b",
-          status: "ok",
-          created_at: new Date(realNow - 30_000).toISOString(),
-        },
-      });
-    }
-
-    it("does NOT poll on a 60s interval after bootstrap (post-#61)", async () => {
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      mockGet.mockResolvedValue([] as any);
-      render(<A2ATopologyOverlay />);
-      await act(async () => { await Promise.resolve(); });
-      const callsAfterBootstrap = mockGet.mock.calls.length;
-      expect(callsAfterBootstrap).toBe(2); // ws-a + ws-b
-
-      // Pre-#61: a 60s clock tick would fire a fresh fan-out (2 more
-      // calls). Post-#61: no interval, no extra calls.
-      await act(async () => {
-        vi.advanceTimersByTime(120_000);
-      });
-      expect(mockGet.mock.calls.length).toBe(callsAfterBootstrap);
-    });
-
-    it("WS push for a delegation event from a visible workspace updates edges with NO HTTP call", async () => {
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      mockGet.mockResolvedValue([] as any);
-      render(<A2ATopologyOverlay />);
-      await act(async () => { await Promise.resolve(); await Promise.resolve(); });
-      mockGet.mockClear();
-      mockStoreState.setA2AEdges.mockClear();
-
-      await act(async () => {
-        emitDelegation({ sourceId: "ws-a", targetId: "ws-b" });
-      });
-
-      // Edges-set called with at least one a2a edge for the new push.
-      const calls = mockStoreState.setA2AEdges.mock.calls;
-      expect(calls.length).toBeGreaterThanOrEqual(1);
-      const lastCall = calls[calls.length - 1][0] as Array<{ id: string }>;
-      expect(lastCall.some((e) => e.id === "a2a-ws-a-ws-b")).toBe(true);
-
-      // Critical: no HTTP fetch fired during the WS path.
-      expect(mockGet).not.toHaveBeenCalled();
-    });
-
-    it("WS push for a non-delegation activity_type is ignored", async () => {
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      mockGet.mockResolvedValue([] as any);
-      render(<A2ATopologyOverlay />);
-      await act(async () => { await Promise.resolve(); });
-      mockStoreState.setA2AEdges.mockClear();
-
-      await act(async () => {
-        emitDelegation({ activityType: "a2a_send" });
-      });
-
-      // setA2AEdges must not be called by the WS handler — the only
-      // setA2AEdges calls in this test came from the initial bootstrap.
-      expect(mockStoreState.setA2AEdges).not.toHaveBeenCalled();
-    });
-
-    it("WS push for a delegate_result row is ignored (mirrors buildA2AEdges filter)", async () => {
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      mockGet.mockResolvedValue([] as any);
-      render(<A2ATopologyOverlay />);
-      await act(async () => { await Promise.resolve(); });
-      mockStoreState.setA2AEdges.mockClear();
-
-      await act(async () => {
-        emitDelegation({ method: "delegate_result" });
-      });
-
-      // delegate_result rows do not contribute to the edge count — they
-      // are completion signals, not initiations.
-      expect(mockStoreState.setA2AEdges).not.toHaveBeenCalled();
-    });
-
-    it("WS push from a hidden workspace is ignored", async () => {
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      mockGet.mockResolvedValue([] as any);
-      render(<A2ATopologyOverlay />);
-      await act(async () => { await Promise.resolve(); });
-      mockStoreState.setA2AEdges.mockClear();
-
-      await act(async () => {
-        emitDelegation({ workspaceId: "ws-hidden" });
-      });
-
-      expect(mockStoreState.setA2AEdges).not.toHaveBeenCalled();
-    });
-
-    it("WS push while showA2AEdges is false is ignored", async () => {
-      mockStoreState.showA2AEdges = false;
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      mockGet.mockResolvedValue([] as any);
-      render(<A2ATopologyOverlay />);
-      // The mount path with showA2AEdges=false calls setA2AEdges([])
-      // once — clear that to isolate the WS path.
-      mockStoreState.setA2AEdges.mockClear();
-
-      await act(async () => {
-        emitDelegation();
-      });
-
-      expect(mockStoreState.setA2AEdges).not.toHaveBeenCalled();
-      expect(mockGet).not.toHaveBeenCalled();
-    });
-  });
-
  it("re-fetches when the visible ID set actually changes", async () => {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    mockGet.mockResolvedValue([] as any);
@@ -36,10 +36,6 @@ vi.mock("@/hooks/useWorkspaceName", () => ({
  useWorkspaceName: () => () => "Test WS",
 }));

-import {
-  emitSocketEvent,
-  _resetSocketEventListenersForTests,
-} from "@/store/socket-events";
 import { ActivityTab } from "../tabs/ActivityTab";

 // ── Fixtures ──────────────────────────────────────────────────────────────────
@@ -362,191 +358,6 @@ describe("ActivityTab — refresh button", () => {
  });
 });

-// ── Suite 6.5: ACTIVITY_LOGGED subscription (#61 stage 3) ─────────────────────
-//
-// Pin the post-#61 behaviour: WS push extends the rendered list with NO
-// additional HTTP fetch. The 5s polling loop is gone; live updates
-// arrive over the WebSocket bus.
-
-describe("ActivityTab — #61 stage 3: ACTIVITY_LOGGED subscription", () => {
-  beforeEach(() => {
-    vi.clearAllMocks();
-    mockGet.mockResolvedValue([]);
-    _resetSocketEventListenersForTests();
-  });
-  afterEach(() => {
-    cleanup();
-    _resetSocketEventListenersForTests();
-  });
-
-  function emitActivity(overrides: {
-    workspaceId?: string;
-    activityType?: string;
-    summary?: string;
-    id?: string;
-  } = {}) {
-    const realNow = Date.now();
-    emitSocketEvent({
-      event: "ACTIVITY_LOGGED",
-      workspace_id: overrides.workspaceId ?? "ws-1",
-      timestamp: new Date(realNow).toISOString(),
-      payload: {
-        id: overrides.id ?? `act-${Math.random().toString(36).slice(2)}`,
-        activity_type: overrides.activityType ?? "agent_log",
-        source_id: null,
-        target_id: null,
-        method: null,
-        summary: overrides.summary ?? "live-pushed",
-        status: "ok",
-        created_at: new Date(realNow - 5_000).toISOString(),
-      },
-    });
-  }
-
-  it("WS push for matching workspace prepends to the list with NO HTTP call", async () => {
-    render(<ActivityTab workspaceId="ws-1" />);
-    await waitFor(() => {
-      expect(screen.getByText(/0 activities|no activity/i)).toBeTruthy();
-    });
-    mockGet.mockClear();
-
-    await act(async () => {
-      emitActivity({ summary: "live-row-from-bus" });
-    });
-
-    await waitFor(() => {
-      expect(screen.getByText(/live-row-from-bus/)).toBeTruthy();
-    });
-    expect(mockGet).not.toHaveBeenCalled();
-  });
-
-  it("WS push for a different workspace is ignored", async () => {
-    render(<ActivityTab workspaceId="ws-1" />);
-    await waitFor(() => screen.getByText(/no activity/i));
-
-    await act(async () => {
-      emitActivity({
-        workspaceId: "ws-other",
-        summary: "should-not-render-other-ws",
-      });
-    });
-
-    expect(screen.queryByText(/should-not-render-other-ws/)).toBeNull();
-  });
-
-  it("WS push respects the active filter — non-matching activity_type is ignored", async () => {
-    render(<ActivityTab workspaceId="ws-1" />);
-    await waitFor(() => screen.getByText(/no activity/i));
-
-    // Apply "Tasks" filter.
-    clickButton(/tasks/i);
-    await waitFor(() => {
-      expect(
-        screen.getByRole("button", { name: /tasks/i }).getAttribute("aria-pressed"),
-      ).toBe("true");
-    });
-
-    // Push an a2a_send (does NOT match task_update filter).
-    await act(async () => {
-      emitActivity({
-        activityType: "a2a_send",
-        summary: "should-not-render-filter-mismatch",
-      });
-    });
-
-    expect(
-      screen.queryByText(/should-not-render-filter-mismatch/),
-    ).toBeNull();
-  });
-
-  it("WS push respects the active filter — matching activity_type is rendered", async () => {
-    render(<ActivityTab workspaceId="ws-1" />);
-    await waitFor(() => screen.getByText(/no activity/i));
-
-    clickButton(/tasks/i);
-    await waitFor(() => {
-      expect(
-        screen.getByRole("button", { name: /tasks/i }).getAttribute("aria-pressed"),
-      ).toBe("true");
-    });
-
-    await act(async () => {
-      emitActivity({
-        activityType: "task_update",
-        summary: "task-filter-match",
-      });
-    });
-
-    await waitFor(() => {
-      expect(screen.getByText(/task-filter-match/)).toBeTruthy();
-    });
-  });
-
-  it("WS push while autoRefresh is paused is ignored", async () => {
-    render(<ActivityTab workspaceId="ws-1" />);
-    await waitFor(() => screen.getByText(/no activity/i));
-
-    // Toggle Live → Paused.
-    clickButton(/live/i);
-    await waitFor(() => {
-      expect(screen.getByText(/Paused/)).toBeTruthy();
-    });
-
-    await act(async () => {
-      emitActivity({ summary: "should-not-render-paused" });
-    });
-
-    expect(screen.queryByText(/should-not-render-paused/)).toBeNull();
-  });
-
-  it("WS push for a row already in the list is deduped (no double-render)", async () => {
-    // Bootstrap with one row — same id as the WS push to trigger dedup.
-    mockGet.mockResolvedValueOnce([
-      makeEntry({ id: "shared-id", summary: "bootstrap-summary" }),
-    ]);
-    render(<ActivityTab workspaceId="ws-1" />);
-    await waitFor(() => {
-      expect(screen.getByText(/bootstrap-summary/)).toBeTruthy();
-    });
-    mockGet.mockClear();
-
-    // Push a row with the SAME id but a different summary — must not
-    // render the new summary; original row stays.
-    await act(async () => {
-      emitActivity({
-        id: "shared-id",
-        summary: "should-not-replace-existing",
-      });
-    });
-
-    expect(screen.queryByText(/should-not-replace-existing/)).toBeNull();
-    // Also verify count didn't grow.
-    expect(screen.getByText(/1 activities/)).toBeTruthy();
-  });
-
-  it("does NOT poll on a 5s interval after mount (post-#61)", async () => {
-    vi.useFakeTimers();
-    try {
-      render(<ActivityTab workspaceId="ws-1" />);
-      // Drain the mount-time bootstrap promise.
-      await act(async () => {
-        await Promise.resolve();
-        await Promise.resolve();
-      });
-      const callsAfterBootstrap = mockGet.mock.calls.length;
-      expect(callsAfterBootstrap).toBeGreaterThanOrEqual(1);
-
-      // Pre-#61: a 30s clock advance fires 6 more polls. Post-#61: 0.
-      await act(async () => {
-        vi.advanceTimersByTime(30_000);
-      });
-      expect(mockGet.mock.calls.length).toBe(callsAfterBootstrap);
-    } finally {
-      vi.useRealTimers();
-    }
-  });
-});
-
 // ── Suite 7: Activity count ───────────────────────────────────────────────────

 describe("ActivityTab — activity count", () => {
@@ -1,28 +1,18 @@
 // @vitest-environment jsdom
 /**
- * CommunicationOverlay tests — pin both the 2026-05-04 fan-out cap fix
- * AND the 2026-05-07 polling → ACTIVITY_LOGGED-subscriber refactor
- * (issue #61 stage 1).
+ * CommunicationOverlay tests — pin the rate-limit fix shipped 2026-05-04.
 *
- * The overlay used to poll /workspaces/:id/activity?limit=5 on a 30s
- * interval per online workspace (capped at 3). Post-#61: it bootstraps
- * once on mount via the same HTTP path (cap of 3 retained), then
- * subscribes to ACTIVITY_LOGGED via the global socket bus for live
- * updates. No interval poll.
+ * The overlay polls /workspaces/:id/activity?limit=5 for each online
+ * workspace. Pre-fix it (a) polled regardless of visibility and (b)
+ * fanned out to 6 workspaces every 10s. With 8+ workspaces a user
+ * triggered sustained 429s (server-side rate limit is 600 req/min/IP).
 *
 * These tests pin:
- *  1. Bootstrap fan-out cap of 3 — even with 6 online nodes, only 3
- *     HTTP fetches on mount.
- *  2. Visibility gate — when collapsed, no HTTP fetches; re-open
- *     re-bootstraps.
- *  3. NO interval polling — advancing the clock past 30s does not fire
- *     additional HTTP calls.
- *  4. WS push extends the rendered list without firing any HTTP call.
- *  5. WS push for an offline workspace is ignored.
- *  6. WS push for a non-comm activity_type is ignored.
+ *  1. Fan-out cap of 3 — even with 6 online nodes, only 3 fetches
+ *  2. Visibility gate — when collapsed, no polling
 *
- * If a future refactor regresses any of these, CI fails before the
- * regression hits a paying tenant.
+ * If a future refactor pushes either dial back up, CI fails before
+ * the regression hits a paying tenant.
 */
 import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
 import { render, cleanup, act, fireEvent } from "@testing-library/react";
@@ -33,7 +23,7 @@ vi.mock("@/lib/api", () => ({
  api: { get: vi.fn() },
 }));

-// Six online nodes — enough to verify the bootstrap cap of 3.
+// Six online nodes — enough to verify the cap of 3.
 const mockStoreState = {
  selectedNodeId: null as string | null,
  nodes: [
@@ -66,10 +56,6 @@ vi.mock("@/lib/design-tokens", () => ({
 // ── Imports (after mocks) ─────────────────────────────────────────────────────

 import { api } from "@/lib/api";
-import {
-  emitSocketEvent,
-  _resetSocketEventListenersForTests,
-} from "@/store/socket-events";
 import { CommunicationOverlay } from "../CommunicationOverlay";

 const mockGet = vi.mocked(api.get);
@@ -80,34 +66,30 @@ beforeEach(() => {
  vi.useFakeTimers();
  mockGet.mockReset();
  mockGet.mockResolvedValue([]);
-  // Drop any subscribers the previous test left on the singleton bus —
-  // each render adds one via useSocketEvent.
-  _resetSocketEventListenersForTests();
 });

 afterEach(() => {
  cleanup();
  vi.useRealTimers();
-  _resetSocketEventListenersForTests();
 });

 // ── Tests ─────────────────────────────────────────────────────────────────────

-describe("CommunicationOverlay — bootstrap fan-out cap", () => {
-  it("bootstraps at most 3 of 6 online workspaces (rate-limit floor preserved post-#61)", async () => {
+describe("CommunicationOverlay — fan-out cap", () => {
+  it("polls at most 3 of 6 online workspaces (rate-limit floor)", async () => {
    await act(async () => {
      render(<CommunicationOverlay />);
    });
-    // Mount fires the bootstrap synchronously — pre-#61 this was the
-    // first poll cycle; post-#61 it's the only HTTP fetch (live updates
-    // arrive via WS push). 6 nodes → 3 fetches.
+    // Mount fires the first poll synchronously (no interval tick yet).
+    // Pre-fix: 6 calls. Post-fix: 3.
    expect(mockGet).toHaveBeenCalledTimes(3);
+    // Verify the calls are for the FIRST 3 online nodes (slice order).
    expect(mockGet).toHaveBeenCalledWith("/workspaces/ws-1/activity?limit=5");
    expect(mockGet).toHaveBeenCalledWith("/workspaces/ws-2/activity?limit=5");
    expect(mockGet).toHaveBeenCalledWith("/workspaces/ws-3/activity?limit=5");
  });

-  it("never bootstraps offline workspaces", async () => {
+  it("never polls offline workspaces", async () => {
    await act(async () => {
      render(<CommunicationOverlay />);
    });
@@ -117,39 +99,40 @@ describe("CommunicationOverlay — bootstrap fan-out cap", () => {
  });
 });

-describe("CommunicationOverlay — no interval polling (post-#61)", () => {
-  // The pre-#61 implementation re-fetched every 30s per workspace.
-  // Post-#61 the only HTTP path is the bootstrap on mount + on
-  // visibility-toggle. This test pins the absence of any interval
-  // poll: a 60s clock advance must not produce a second round of
-  // fetches.
-  it("does NOT poll on a 30s interval after bootstrap", async () => {
+describe("CommunicationOverlay — cadence", () => {
+  it("uses 30s interval cadence (was 10s pre-fix)", async () => {
    await act(async () => {
      render(<CommunicationOverlay />);
    });
-    expect(mockGet).toHaveBeenCalledTimes(3); // initial bootstrap
-    mockGet.mockClear();
+    expect(mockGet).toHaveBeenCalledTimes(3); // initial mount poll

-    // Advance 60s — well past any plausible cadence the prior version
-    // could have used.
+    // Advance 10s — pre-fix this would fire another poll. Post-fix: silent.
    await act(async () => {
-      vi.advanceTimersByTime(60_000);
+      vi.advanceTimersByTime(10_000);
    });
-    expect(mockGet).not.toHaveBeenCalled();
+    expect(mockGet).toHaveBeenCalledTimes(3);
+
+    // Advance to 30s — interval fires.
+    await act(async () => {
+      vi.advanceTimersByTime(20_000);
+    });
+    expect(mockGet).toHaveBeenCalledTimes(6); // +3 from second tick
  });
 });

 describe("CommunicationOverlay — visibility gate", () => {
-  // The visibility gate now does two things post-#61:
-  //   - while closed, the WS handler short-circuits (no setComms churn)
-  //   - re-opening triggers a fresh bootstrap so the list reflects
-  //     anything that happened while the panel was collapsed
+  // The visibility gate is the dial that drops collapsed-panel polling
+  // to ZERO. The cadence test above can't catch its removal — if a
+  // refactor dropped `if (!visible) return`, the cadence test would
+  // still pass because the effect would still fire every 30s.
  //
  // Direct probe: render with comms-returning mock so the panel
  // actually renders (close button only exists in the expanded panel,
  // not the collapsed button-state). Click close, advance the clock,
  // assert no further fetches.
-  it("stops fetching while collapsed and re-bootstraps on re-open", async () => {
+  it("stops polling after the user collapses the panel", async () => {
+    // Mock returns one a2a_send so comms.length > 0 → panel renders →
+    // close button accessible.
    mockGet.mockResolvedValue([
      {
        id: "act-1",
@@ -167,202 +150,29 @@ describe("CommunicationOverlay — visibility gate", () => {
    const { getByLabelText } = await act(async () => {
      return render(<CommunicationOverlay />);
    });
-    // Drain pending microtasks (resolves the await in bootstrap) so
-    // setComms lands and the panel renders. Don't advance time — it's
-    // not load-bearing for the gate test, but matches the pattern used
-    // pre-#61 for stability.
+    // Drain pending microtasks (resolves the await in fetchComms) so
+    // setComms lands and the panel renders. Don't advance time — that
+    // would fire the next interval tick and pollute the assertion.
    await act(async () => {
      await Promise.resolve();
      await Promise.resolve();
      await Promise.resolve();
    });
-    expect(mockGet).toHaveBeenCalledTimes(3); // initial bootstrap
+    // Initial mount polled 3 workspaces.
+    expect(mockGet).toHaveBeenCalledTimes(3);
    mockGet.mockClear();

-    // Click close. While closed, no fetches and no WS-driven updates.
+    // Click the close button. Synchronous getByLabelText avoids
+    // findBy's internal setTimeout (deadlocks under useFakeTimers).
    const closeBtn = getByLabelText("Close communications panel");
    await act(async () => {
      fireEvent.click(closeBtn);
    });
+
+    // Advance well past the 30s cadence — gate should suppress the tick.
    await act(async () => {
      vi.advanceTimersByTime(60_000);
    });
    expect(mockGet).not.toHaveBeenCalled();
-
-    // Re-open via the collapsed button. Must trigger a fresh bootstrap.
-    const openBtn = getByLabelText("Show communications panel");
-    await act(async () => {
-      fireEvent.click(openBtn);
-    });
-    await act(async () => {
-      await Promise.resolve();
-      await Promise.resolve();
-    });
-    expect(mockGet).toHaveBeenCalledTimes(3); // re-bootstrap on re-open
-  });
-});
-
-describe("CommunicationOverlay — WS subscription (#61 stage 1 core)", () => {
-  // The load-bearing post-#61 behaviour. Every test in this block must
-  // verify (a) the WS push DID update the rendered comms list, and
-  // (b) NO additional HTTP call was fired — the whole point of the
-  // refactor is to remove the polling-driven HTTP traffic.
-  function emitActivityLogged(overrides: Partial<{
-    workspaceId: string;
-    payload: Record<string, unknown>;
-  }> = {}) {
-    emitSocketEvent({
-      event: "ACTIVITY_LOGGED",
-      workspace_id: overrides.workspaceId ?? "ws-1",
-      timestamp: new Date().toISOString(),
-      payload: {
-        id: `act-${Math.random().toString(36).slice(2)}`,
-        activity_type: "a2a_send",
-        source_id: "ws-1",
-        target_id: "ws-2",
-        summary: "live push",
-        status: "ok",
-        duration_ms: 42,
-        created_at: new Date().toISOString(),
-        ...overrides.payload,
-      },
-    });
-  }
-
-  it("WS push for a comm activity_type extends the rendered list with NO additional HTTP call", async () => {
-    const { container } = await act(async () => {
-      return render(<CommunicationOverlay />);
-    });
-    expect(mockGet).toHaveBeenCalledTimes(3); // bootstrap
-    mockGet.mockClear();
-
-    await act(async () => {
-      emitActivityLogged({ payload: { summary: "hello" } });
-    });
-    await act(async () => {
-      await Promise.resolve();
-    });
-
-    // Two pins:
-    //   1. comms list reflects the live push (look for the summary text)
-    //   2. zero HTTP fetches fired during the WS path
-    expect(container.textContent).toContain("hello");
-    expect(mockGet).not.toHaveBeenCalled();
-  });
-
-  it("WS push for an offline workspace is ignored", async () => {
-    const { container } = await act(async () => {
-      return render(<CommunicationOverlay />);
-    });
-    mockGet.mockClear();
-
-    await act(async () => {
-      emitActivityLogged({
-        workspaceId: "ws-offline",
-        payload: { source_id: "ws-offline", summary: "should-not-render" },
-      });
-    });
-    await act(async () => {
-      await Promise.resolve();
-    });
-
-    expect(container.textContent).not.toContain("should-not-render");
-    expect(mockGet).not.toHaveBeenCalled();
-  });
-
-  it("WS push for a non-comm activity_type is ignored (e.g. delegation)", async () => {
-    const { container } = await act(async () => {
-      return render(<CommunicationOverlay />);
-    });
-    mockGet.mockClear();
-
-    await act(async () => {
-      emitActivityLogged({
-        payload: {
-          activity_type: "delegation",
-          summary: "should-not-render-delegation",
-        },
-      });
-    });
-    await act(async () => {
-      await Promise.resolve();
-    });
-
-    expect(container.textContent).not.toContain("should-not-render-delegation");
-    expect(mockGet).not.toHaveBeenCalled();
-  });
-
-  it("WS push while the panel is collapsed is ignored (no churn on hidden state)", async () => {
-    // Bootstrap with one comm so the panel renders → close button
-    // accessible. Then collapse, emit a WS push, re-open: the rendered
-    // list must come from the re-bootstrap, NOT from the WS-push that
-    // arrived during the closed state. Also: nothing visible while
-    // closed (the collapsed button shows only the count, not summaries).
-    mockGet.mockResolvedValue([
-      {
-        id: "act-bootstrap",
-        workspace_id: "ws-1",
-        activity_type: "a2a_send",
-        source_id: "ws-1",
-        target_id: "ws-2",
-        summary: "bootstrap-summary",
-        status: "ok",
-        duration_ms: 1,
-        created_at: new Date().toISOString(),
-      },
-    ]);
-    const { getByLabelText, container } = await act(async () => {
-      return render(<CommunicationOverlay />);
-    });
-    await act(async () => {
-      await Promise.resolve();
-      await Promise.resolve();
-    });
-
-    // Collapse.
-    const closeBtn = getByLabelText("Close communications panel");
-    await act(async () => {
-      fireEvent.click(closeBtn);
-    });
-
-    // Bootstrap mock returns nothing on the re-open path so we can
-    // distinguish "WS push leaked through the gate" from "re-bootstrap
-    // refilled the list."
-    mockGet.mockReset();
-    mockGet.mockResolvedValue([]);
-
-    await act(async () => {
-      emitActivityLogged({
-        payload: { summary: "leaked-while-closed" },
-      });
-    });
-    await act(async () => {
-      await Promise.resolve();
-    });
-
-    // Closed state: rendered DOM must not show any push-derived text.
-    expect(container.textContent).not.toContain("leaked-while-closed");
-  });
-
-  it("non-ACTIVITY_LOGGED events are ignored (e.g. WORKSPACE_OFFLINE)", async () => {
-    const { container } = await act(async () => {
-      return render(<CommunicationOverlay />);
-    });
-    mockGet.mockClear();
-
-    await act(async () => {
-      emitSocketEvent({
-        event: "WORKSPACE_OFFLINE",
-        workspace_id: "ws-1",
-        timestamp: new Date().toISOString(),
-        payload: { summary: "should-not-render-event" },
-      });
-    });
-    await act(async () => {
-      await Promise.resolve();
-    });
-
-    expect(container.textContent).not.toContain("should-not-render-event");
-    expect(mockGet).not.toHaveBeenCalled();
  });
 });
@@ -1,9 +1,8 @@
 "use client";

-import { useState, useEffect, useCallback, useRef } from "react";
+import { useState, useEffect, useCallback } from "react";
 import { api } from "@/lib/api";
 import { ConversationTraceModal } from "@/components/ConversationTraceModal";
-import { useSocketEvent } from "@/hooks/useSocketEvent";
 import { type ActivityEntry } from "@/types/activity";
 import { useWorkspaceName } from "@/hooks/useWorkspaceName";
 import { inferA2AErrorHint } from "./chat/a2aErrorHint";
@@ -49,15 +48,6 @@ export function ActivityTab({ workspaceId }: Props) {
  const [traceOpen, setTraceOpen] = useState(false);
  const resolveName = useWorkspaceName();

-  // Refs let the WS handler read the latest filter / autoRefresh
-  // selection without re-subscribing on every state change. The bus
-  // listener is registered exactly once per mount via useSocketEvent's
-  // ref-internal pattern; subscriber-side filtering reads from these.
-  const filterRef = useRef(filter);
-  filterRef.current = filter;
-  const autoRefreshRef = useRef(autoRefresh);
-  autoRefreshRef.current = autoRefresh;
-
  const loadActivities = useCallback(async () => {
    try {
      const typeParam = filter !== "all" ? `?type=${filter}` : "";
@@ -76,58 +66,11 @@ export function ActivityTab({ workspaceId }: Props) {
    loadActivities();
  }, [loadActivities]);

-  // Live-update path (issue #61 stage 3, replaces the 5s setInterval).
-  // ACTIVITY_LOGGED events from this workspace prepend to the rendered
-  // list — dedup by id so a server-side update + a poll reply don't
-  // double-render the same row.
-  //
-  // Honours the user's autoRefresh toggle: when paused, live updates
-  // are dropped until the user re-enables Live (or hits Refresh, which
-  // re-bootstraps via loadActivities).
-  //
-  // Filter awareness: matches the server-side `?type=<filter>`
-  // semantics so the panel doesn't show rows the user excluded.
-  useSocketEvent((msg) => {
-    if (!autoRefreshRef.current) return;
-    if (msg.event !== "ACTIVITY_LOGGED") return;
-    if (msg.workspace_id !== workspaceId) return;
-
-    const p = (msg.payload || {}) as Record<string, unknown>;
-    const activityType = (p.activity_type as string) || "";
-
-    const f = filterRef.current;
-    if (f !== "all" && activityType !== f) return;
-
-    const entry: ActivityEntry = {
-      id:
-        (p.id as string) ||
-        `ws-push-${msg.timestamp || Date.now()}-${msg.workspace_id}`,
-      workspace_id: msg.workspace_id,
-      activity_type: activityType,
-      source_id: (p.source_id as string | null) ?? null,
-      target_id: (p.target_id as string | null) ?? null,
-      method: (p.method as string | null) ?? null,
-      summary: (p.summary as string | null) ?? null,
-      request_body: (p.request_body as Record<string, unknown> | null) ?? null,
-      response_body:
-        (p.response_body as Record<string, unknown> | null) ?? null,
-      duration_ms: (p.duration_ms as number | null) ?? null,
-      status: (p.status as string) || "ok",
-      error_detail: (p.error_detail as string | null) ?? null,
-      created_at:
-        (p.created_at as string) ||
-        msg.timestamp ||
-        new Date().toISOString(),
-    };
-
-    setActivities((prev) => {
-      // Dedup by id — a row that arrived via the bootstrap fetch and
-      // also fires ACTIVITY_LOGGED from a delayed server-side hook
-      // must render exactly once.
-      if (prev.some((e) => e.id === entry.id)) return prev;
-      return [entry, ...prev];
-    });
-  });
+  useEffect(() => {
+    if (!autoRefresh) return;
+    const interval = setInterval(loadActivities, 5000);
+    return () => clearInterval(interval);
+  }, [loadActivities, autoRefresh]);

  return (
    <div className="flex flex-col h-full">
@@ -13,7 +13,6 @@ import { AttachmentPreview } from "./chat/AttachmentPreview";
 import { extractFilesFromTask } from "./chat/message-parser";
 import { AgentCommsPanel } from "./chat/AgentCommsPanel";
 import { appendActivityLine } from "./chat/activityLog";
-import { activityRowToMessages, type ActivityRowForHydration } from "./chat/historyHydration";
 import { runtimeDisplayName } from "@/lib/runtime-names";
 import { ConfirmDialog } from "@/components/ConfirmDialog";

@@ -50,38 +49,12 @@ interface A2AResponse {
  };
 }

-/** Detect activity-log rows that the workspace's own runtime fired
- *  against itself but were misclassified as canvas-source. The proper
- *  fix is the X-Workspace-ID header from `self_source_headers()` in
- *  workspace/platform_auth.py, which makes the platform record
- *  source_id = workspace_id. But three failure modes still leak a
- *  self-message into "My Chat":
- *
- *    1. Historical rows already in the DB with source_id=NULL.
- *    2. Workspace containers running pre-fix heartbeat.py / main.py
- *       (the fix only takes effect after an image rebuild + redeploy).
- *    3. Future internal triggers added without the helper.
- *
- *  This client-side filter recognises the heartbeat trigger by its
- *  exact prefix — the heartbeat assembles
- *
- *    "Delegation results are ready. Review them and take appropriate
- *     action:\n" + summary_lines + report_instruction
- *
- *  in workspace/heartbeat.py. The prefix is template-fixed so a
- *  string match is reliable. If the heartbeat copy ever changes,
- *  update this constant in the same commit.
- *
- *  This is a backstop, not the primary defence — the X-Workspace-ID
- *  header is. Filtering content is fragile to copy edits, so keep
- *  the list narrow. */
-const INTERNAL_SELF_MESSAGE_PREFIXES = [
-  "Delegation results are ready. Review them and take appropriate action",
-];
-
-function isInternalSelfMessage(text: string): boolean {
-  return INTERNAL_SELF_MESSAGE_PREFIXES.some((p) => text.startsWith(p));
-}
+// Internal-self-message filtering moved server-side in RFC #2945
+// PR-C/D — the platform's /chat-history endpoint applies the
+// IsInternalSelfMessage predicate before returning rows, so the
+// client no longer needs the local backstop on the history path.
+// The proper fix is still the X-Workspace-ID header (source_id=workspace_id);
+// the platform-side prefix filter handles the residual cases.

 // extractReplyText pulls the agent's text reply out of an A2A response.
 // Concatenates ALL text parts (joined with "\n") rather than returning
@@ -134,8 +107,19 @@ const INITIAL_HISTORY_LIMIT = 10;
 const OLDER_HISTORY_BATCH = 20;

 /**
- * Load chat history from the activity_logs database via the platform API.
- * Uses source=canvas to only get user-initiated messages (not agent-to-agent).
+ * Load chat history from the platform's typed /chat-history endpoint.
+ *
+ * Server-side rendering of activity_logs rows into ChatMessage shape
+ * lives in workspace-server/internal/messagestore/postgres_store.go
+ * (RFC #2945 PR-C/D). The server already applies the canvas-source
+ * filter, the internal-self-message predicate, the role decision
+ * (status=error vs agent-error prefix → system), and the v0/v1
+ * file-shape extraction. Canvas just renders what it receives.
+ *
+ * Wire shape (mirrors ChatMessage exactly, no per-row mapping needed):
+ *
+ *   GET /workspaces/:id/chat-history?limit=N&before_ts=T
+ *   200 → {"messages": ChatMessage[], "reached_end": boolean}
 *
 * Pagination:
 *  - Pass `limit` to bound the page size (newest-first from server).
@@ -143,10 +127,10 @@ const OLDER_HISTORY_BATCH = 20;
 *    timestamp. Combined with limit, this yields the next-older page
 *    when scrolling backward through history.
 *
- * `reachedEnd` is true when the server returned fewer rows than asked
- * for — caller uses this to disable further older-batch fetches.
- * (Counts row-level returns, not chat-bubble count: each row may
- * produce 1-2 bubbles.)
+ * `reachedEnd` is propagated from the server. The server computes it
+ * by comparing rowCount vs limit so a partial last page is correctly
+ * detected even when the row→bubble fan-out is non-1:1 (each row
+ * produces 1-2 bubbles).
 */
 async function loadMessagesFromDB(
  workspaceId: string,
@@ -154,25 +138,23 @@ async function loadMessagesFromDB(
  beforeTs?: string,
 ): Promise<{ messages: ChatMessage[]; error: string | null; reachedEnd: boolean }> {
  try {
-    const params = new URLSearchParams({
-      type: "a2a_receive",
-      source: "canvas",
-      limit: String(limit),
-    });
+    const params = new URLSearchParams({ limit: String(limit) });
    if (beforeTs) params.set("before_ts", beforeTs);
-    const activities = await api.get<ActivityRowForHydration[]>(
-      `/workspaces/${workspaceId}/activity?${params.toString()}`,
+    const resp = await api.get<{ messages: ChatMessage[]; reached_end: boolean }>(
+      `/workspaces/${workspaceId}/chat-history?${params.toString()}`,
    );

-    const messages: ChatMessage[] = [];
-    // Activities are newest-first, reverse for chronological order.
-    // Per-row mapping lives in chat/historyHydration.ts so it can be
-    // unit-tested without spinning up the full ChatTab component
-    // (regression cover for the timestamp-collapse bug).
-    for (const a of [...activities].reverse()) {
-      messages.push(...activityRowToMessages(a, isInternalSelfMessage));
-    }
-    return { messages, error: null, reachedEnd: activities.length < limit };
+    // Server emits oldest-first within the page (RFC #2945 PR-C-2
+    // post-fix: the server reverses the order of rows, not of
+    // individual messages, so the wire is display-ready). Canvas
+    // appends/prepends without reordering — this avoids the pair-flip
+    // bug a naive flat reverse causes when each row produces a
+    // (user, agent) pair with the same timestamp.
+    return {
+      messages: resp.messages ?? [],
+      error: null,
+      reachedEnd: resp.reached_end,
+    };
  } catch (err) {
    return {
      messages: [],
@@ -21,20 +21,39 @@ interface Props {
 // --- Agent Card Section ---

 function AgentCardSection({ workspaceId }: { workspaceId: string }) {
-  const [card, setCard] = useState<Record<string, unknown> | null>(null);
-  const [loading, setLoading] = useState(true);
+  // Initial card value comes from the canvas store — node.data.agentCard
+  // is hydrated by the platform stream when the workspace appears in the
+  // graph, so reading it here avoids a duplicate `GET /workspaces/${id}`
+  // (the parent ConfigTab.loadConfig already fetches workspace metadata,
+  // and refetching here adds a serialised RTT to the panel-open path —
+  // which contributed to the ~20s detail-panel load reported in core#11).
+  // Local state still tracks the edited/saved value so the editor flow
+  // is unchanged.
+  const storeCard = useCanvasStore((s) => {
+    // Defensive against test mocks that omit `nodes` (some test files
+    // stub the store with a minimal shape). In production `nodes` is
+    // always an array — empty or not — so the optional chaining only
+    // matters for the test path.
+    const node = s.nodes?.find?.((n) => n.id === workspaceId);
+    return (node?.data.agentCard as
+      | Record<string, unknown>
+      | null
+      | undefined) ?? null;
+  });
+  const [card, setCard] = useState<Record<string, unknown> | null>(storeCard);
  const [editing, setEditing] = useState(false);
  const [draft, setDraft] = useState("");
  const [saving, setSaving] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [success, setSuccess] = useState(false);

+  // If the store updates while this section is mounted (another tab
+  // pushed an update via the platform event stream), reflect that —
+  // unless the user is mid-edit, in which case we don't clobber their
+  // unsaved draft.
  useEffect(() => {
-    api.get<Record<string, unknown>>(`/workspaces/${workspaceId}`)
-      .then((ws) => setCard((ws.agent_card as Record<string, unknown>) || null))
-      .catch(() => {})
-      .finally(() => setLoading(false));
-  }, [workspaceId]);
+    if (!editing) setCard(storeCard);
+  }, [storeCard, editing]);

  const handleSave = async () => {
    setError(null);
@@ -53,9 +72,7 @@ function AgentCardSection({ workspaceId }: { workspaceId: string }) {

  return (
    <Section title="Agent Card" defaultOpen={false}>
-      {loading ? (
-        <div className="text-[10px] text-ink-soft">Loading...</div>
-      ) : editing ? (
+      {editing ? (
        <div className="space-y-2">
          <textarea
            aria-label="Agent card JSON editor"
@@ -221,47 +238,51 @@ export function ConfigTab({ workspaceId }: Props) {
    setLoading(true);
    setError(null);

-    // ALWAYS load workspace metadata first (runtime + model). These are the
-    // source of truth regardless of whether the runtime uses our config.yaml
-    // template. Without this the form falls back to empty/default values on
-    // a hermes workspace (which doesn't use our template), creating the
-    // appearance that the saved runtime is unset — and worse, clicking Save
-    // would silently flip `runtime` from `hermes` back to the dropdown
-    // default `LangGraph`. See GH #1894.
-    let wsMetadataRuntime = "";
-    let wsMetadataModel = "";
-    let wsMetadataTier: number | null = null;
-    try {
-      const ws = await api.get<{ runtime?: string; tier?: number }>(`/workspaces/${workspaceId}`);
-      wsMetadataRuntime = (ws.runtime || "").trim();
-      if (typeof ws.tier === "number") wsMetadataTier = ws.tier;
-    } catch { /* fall back to config.yaml */ }
-    try {
-      const m = await api.get<{ model?: string }>(`/workspaces/${workspaceId}/model`);
-      wsMetadataModel = (m.model || "").trim();
-    } catch { /* non-fatal */ }
+    // Load workspace metadata (runtime + model + provider) in parallel.
+    // These are independent GETs against three workspace-server endpoints
+    // and used to be awaited serially — for SaaS workspaces each call
+    // round-trips through an EIC SSH tunnel, so the previous serial
+    // pattern stacked 3-5s of tunnel-setup latency per call (core#11).
+    // Promise.all overlaps them; the per-call cost stays the same but
+    // wall time drops to max() instead of sum().
+    //
+    // Each leg has its own .catch handler that yields a sentinel value,
+    // matching the previous semantics:
+    //   - /workspaces/${id}: required source-of-truth for runtime+tier;
+    //     fall back to YAML if the GET fails (rare, network-class only).
+    //   - /workspaces/${id}/model: non-fatal; empty model lets the form
+    //     fall through to YAML runtime_config.model.
+    //   - /workspaces/${id}/provider: non-fatal; old workspace-servers
+    //     return 404, in which case provider="" and Save skips the PUT.
+    //
+    // See GH #1894 for the workspace-row-as-source-of-truth rationale
+    // that motivated splitting from a single config.yaml read.
+    const [wsRes, modelRes, providerRes] = await Promise.all([
+      api.get<{ runtime?: string; tier?: number }>(`/workspaces/${workspaceId}`)
+        .catch(() => ({} as { runtime?: string; tier?: number })),
+      api.get<{ model?: string }>(`/workspaces/${workspaceId}/model`)
+        .catch(() => ({} as { model?: string })),
+      api.get<{ provider?: string }>(`/workspaces/${workspaceId}/provider`)
+        .catch(() => null),
+    ]);
+    const wsMetadataRuntime = (wsRes.runtime || "").trim();
+    const wsMetadataModel = (modelRes.model || "").trim();
+    const wsMetadataTier: number | null =
+      typeof wsRes.tier === "number" ? wsRes.tier : null;
+    if (providerRes !== null) {
+      const loadedProvider = (providerRes.provider || "").trim();
+      setProvider(loadedProvider);
+      setOriginalProvider(loadedProvider);
+    } else {
+      setProvider("");
+      setOriginalProvider("");
+    }
    // originalModel is set further down once the YAML has been parsed —
    // we want it to reflect what the form ACTUALLY rendered, which may
    // be the YAML's runtime_config.model fallback when MODEL_PROVIDER
    // is empty. Setting it here from wsMetadataModel alone would be
    // wrong for hermes/pre-#240 workspaces.

-    // Load explicit provider override (Option B PR-5). Endpoint returns
-    // {provider: "", source: "default"} when no override is set, so the
-    // empty string is the legitimate "auto-derive" signal — don't treat
-    // it as a load error. Non-fatal: an older workspace-server that
-    // predates PR-2 returns 404 here; the form falls back to "" and
-    // Save just won't PUT the provider field.
-    try {
-      const p = await api.get<{ provider?: string }>(`/workspaces/${workspaceId}/provider`);
-      const loadedProvider = (p.provider || "").trim();
-      setProvider(loadedProvider);
-      setOriginalProvider(loadedProvider);
-    } catch {
-      setProvider("");
-      setOriginalProvider("");
-    }
-
    // Skip the config.yaml fetch entirely for runtimes that manage
    // their own config (external, hermes, etc.) — they don't have a
    // platform-side template, so the GET would 404. The catch block
@@ -1,13 +1,11 @@
 // @vitest-environment jsdom
 //
-// Pins the lazy-loading chat-history pagination added 2026-05-05.
+// Pins the lazy-loading chat-history pagination.
 //
-// Pre-fix: ChatTab fetched the newest 50 messages on every mount and
-// scrolled to bottom, paying full DOM cost up-front even when the user
-// only wanted to read the last few bubbles. Post-fix: initial load is
-// bounded to 10 newest, and an IntersectionObserver on a top sentinel
-// triggers loadOlder() (batch of 20 with `before_ts` cursor) when the
-// user scrolls up.
+// PR-C-2 (RFC #2945): canvas was migrated from /activity?type=a2a_receive
+// to /chat-history. Server now returns typed ChatMessage[] in
+// display-ready oldest-first order. These tests guard the canvas-side
+// pagination invariants against the new endpoint surface.
 //
 // Pinned branches:
 //   1. Initial fetch carries `limit=10` and NO before_ts (newest-first
@@ -20,11 +18,10 @@
 //      asserting the rendered bubble count matches the full page).
 //   4. The retry button after a failed initial load uses the same
 //      INITIAL_HISTORY_LIMIT (10), not the legacy 50.
-//
-// IntersectionObserver / scroll-anchor restoration is exercised by the
-// E2E synth-canary suite — pinning it in jsdom would require mocking
-// the observer and faking layout, which is brittler than trusting a
-// live-DOM canary against the staging tenant.
+//   5. before_ts cursor is the OLDEST timestamp from the current page,
+//      passed verbatim to walk backward.
+//   6. Inflight guard rejects duplicate IO triggers while a loadOlder
+//      fetch is in flight.

 import { describe, it, expect, vi, afterEach, beforeEach } from "vitest";
 import { render, screen, cleanup, waitFor, fireEvent } from "@testing-library/react";
@@ -33,24 +30,31 @@ import React from "react";
 afterEach(cleanup);

 // Both ChatTab sub-panels (MyChat + AgentComms) mount simultaneously so
-// keyboard tab order and aria-controls land on a real DOM. Both fire
-// /activity GETs on mount: MyChat's hits `type=a2a_receive&source=canvas`,
-// AgentComms's hits a different filter. Route the mock by URL so each
-// gets a sensible default and only MyChat's call is what the assertions
-// scrutinise.
-const myChatActivityCalls: string[] = [];
-let myChatNextResponse: { ok: true; rows: unknown[] } | { ok: false; err: Error } = {
-  ok: true,
-  rows: [],
-};
+// keyboard tab order and aria-controls land on a real DOM. MyChat's
+// loadMessagesFromDB hits /chat-history; AgentComms's polling hits a
+// different URL. Route the mock by URL so each gets a sensible default
+// and only MyChat's calls land in the assertion array.
+const myChatHistoryCalls: string[] = [];
+let myChatNextResponse:
+  | { ok: true; messages: unknown[]; reachedEnd?: boolean }
+  | { ok: false; err: Error } = { ok: true, messages: [] };
+
 const apiGet = vi.fn((path: string): Promise<unknown> => {
-  if (path.includes("type=a2a_receive") && path.includes("source=canvas")) {
-    myChatActivityCalls.push(path);
-    if (myChatNextResponse.ok) return Promise.resolve(myChatNextResponse.rows);
+  if (path.includes("/chat-history")) {
+    myChatHistoryCalls.push(path);
+    if (myChatNextResponse.ok) {
+      const reached_end =
+        myChatNextResponse.reachedEnd !== undefined
+          ? myChatNextResponse.reachedEnd
+          : myChatNextResponse.messages.length < 10;
+      return Promise.resolve({
+        messages: myChatNextResponse.messages,
+        reached_end,
+      });
+    }
    return Promise.reject(myChatNextResponse.err);
  }
-  // AgentComms / heartbeat / anything else — empty array is a safe
-  // default that won't blow up the corresponding component's .then().
+  // AgentComms / heartbeat / anything else — an empty array is a safe default.
  return Promise.resolve([]);
 });
 const apiPost = vi.fn();
@@ -84,8 +88,8 @@ const ioInstances: IOInstance[] = [];
 beforeEach(() => {
  apiGet.mockClear();
  apiPost.mockReset();
-  myChatActivityCalls.length = 0;
-  myChatNextResponse = { ok: true, rows: [] };
+  myChatHistoryCalls.length = 0;
+  myChatNextResponse = { ok: true, messages: [] };
  ioInstances.length = 0;
  class FakeIO {
    private inst: IOInstance;
@@ -101,20 +105,12 @@ beforeEach(() => {
      this.inst.disconnected = true;
    }
  }
-  // Install on every reachable global — different bundlers / module
-  // graphs can resolve `IntersectionObserver` via `window`, `globalThis`,
-  // or the bare global. Without all three, jsdom's own (pre-existing)
-  // stub silently wins and ioInstances stays empty.
  (window as unknown as { IntersectionObserver: unknown }).IntersectionObserver = FakeIO;
  (globalThis as unknown as { IntersectionObserver: unknown }).IntersectionObserver = FakeIO;
-  // jsdom doesn't implement scrollIntoView; ChatTab calls it after every
-  // messages update.
  Element.prototype.scrollIntoView = vi.fn();
 });

 function triggerIntersection(instanceIdx = -1) {
-  // -1 → the latest observer (the live one). Tests targeting an old
-  // (disconnected) instance pass a positive index.
  const inst = ioInstances.at(instanceIdx);
  if (!inst) throw new Error(`no IO instance at ${instanceIdx}`);
  inst.callback(
@@ -125,25 +121,30 @@ function triggerIntersection(instanceIdx = -1) {

 import { ChatTab } from "../ChatTab";

-function makeActivityRow(seq: number): Record<string, unknown> {
-  // Zero-pad seq into the minute slot so "seq=10" doesn't produce
-  // the invalid timestamp "00:010:00Z" (caught by the loadOlder URL
-  // assertion below — first version of the helper used `0${seq}` and
-  // the test failed on `before_ts` having an extra digit).
+// makeMessagePair returns a (user, agent) pair sharing a timestamp,
+// matching the wire shape /chat-history emits per activity_logs row.
+// Server-side reverseRowChunks ensures the wire is oldest-first across
+// rows but [user, agent] within each row.
+function makeMessagePair(seq: number): unknown[] {
+  // Zero-pad seq into the minute slot so seq=10 produces a valid
+  // timestamp (00:10:00Z, not 00:010:00Z).
  const mm = String(seq).padStart(2, "0");
-  return {
-    activity_type: "a2a_receive",
-    status: "ok",
-    created_at: `2026-05-05T00:${mm}:00Z`,
-    request_body: { params: { message: { parts: [{ kind: "text", text: `user msg ${seq}` }] } } },
-    response_body: { result: `agent reply ${seq}` },
-  };
+  const ts = `2026-05-05T00:${mm}:00Z`;
+  return [
+    { id: `u-${seq}`, role: "user", content: `user msg ${seq}`, timestamp: ts },
+    { id: `a-${seq}`, role: "agent", content: `agent reply ${seq}`, timestamp: ts },
+  ];
 }

-// Server returns newest-first; the helper builds a server-shape page
-// so the order in the rendered messages array matches production.
-function newestFirstPage(start: number, count: number): unknown[] {
-  return Array.from({ length: count }, (_, i) => makeActivityRow(start + count - 1 - i));
+// pageOldestFirst builds a wire-shape page (oldest-first within page)
+// of `count` row-pairs starting at seq=`start`. Mirrors the server's
+// post-reverseRowChunks emission order.
+function pageOldestFirst(start: number, count: number): unknown[] {
+  const out: unknown[] = [];
+  for (let i = 0; i < count; i++) {
+    out.push(...makeMessagePair(start + i));
+  }
+  return out;
 }

 const minimalData = {
@@ -153,28 +154,30 @@ const minimalData = {
 } as unknown as Parameters<typeof ChatTab>[0]["data"];

 describe("ChatTab lazy history pagination", () => {
-  it("initial fetch carries limit=10 (not the legacy 50)", async () => {
-    myChatNextResponse = { ok: true, rows: [makeActivityRow(1)] };
+  it("initial fetch carries limit=10 (not the legacy 50) and hits /chat-history", async () => {
+    myChatNextResponse = { ok: true, messages: makeMessagePair(1) };
    render(<ChatTab workspaceId="ws-1" data={minimalData} />);
-    await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
-    const url = myChatActivityCalls[0];
+    await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
+    const url = myChatHistoryCalls[0];
+    expect(url).toContain("/chat-history");
    expect(url).toContain("limit=10");
    expect(url).not.toContain("limit=50");
    // before_ts should NOT be set on the initial fetch — that's the
    // newest-first slice the user lands on.
    expect(url).not.toContain("before_ts");
+    // /chat-history filters source-canvas server-side; client should
+    // NOT pass type/source params (they belonged to /activity).
+    expect(url).not.toContain("type=a2a_receive");
+    expect(url).not.toContain("source=canvas");
  });

  it("hides the top sentinel when initial fetch returns fewer than the limit", async () => {
    // 3 < 10 → server says "no more older history exists"; sentinel
    // should NOT mount and the "Loading older messages…" line should
-    // never appear (it can't, since the sentinel is what triggers it).
-    myChatNextResponse = {
-      ok: true,
-      rows: [makeActivityRow(1), makeActivityRow(2), makeActivityRow(3)],
-    };
+    // never appear.
+    myChatNextResponse = { ok: true, messages: pageOldestFirst(1, 3) };
    render(<ChatTab workspaceId="ws-2" data={minimalData} />);
-    await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
+    await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
    await waitFor(() => {
      expect(screen.queryByText(/Loading chat history/i)).toBeNull();
    });
@@ -182,15 +185,15 @@ describe("ChatTab lazy history pagination", () => {
  });

  it("renders all messages when initial fetch returns exactly the limit", async () => {
-    // 10 == limit → server might have more older rows; sentinel SHOULD
-    // mount so the IO observer can fire loadOlder() on scroll-up. We
-    // verify by checking the rendered bubble count — if hasMore stayed
-    // true the sentinel render path doesn't crash and all 10 rows
-    // produced their pair of bubbles.
-    const fullPage = Array.from({ length: 10 }, (_, i) => makeActivityRow(i + 1));
-    myChatNextResponse = { ok: true, rows: fullPage };
+    // limit=10 row-pairs → 20 ChatMessages. reachedEnd should be FALSE
+    // so the sentinel mounts. Verified by bubble counts.
+    myChatNextResponse = {
+      ok: true,
+      messages: pageOldestFirst(1, 10),
+      reachedEnd: false,
+    };
    render(<ChatTab workspaceId="ws-3" data={minimalData} />);
-    await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
+    await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
    await waitFor(() => {
      expect(screen.queryByText(/Loading chat history/i)).toBeNull();
    });
@@ -202,54 +205,67 @@ describe("ChatTab lazy history pagination", () => {
    myChatNextResponse = { ok: false, err: new Error("network down") };
    render(<ChatTab workspaceId="ws-4" data={minimalData} />);
    const retry = await screen.findByText(/Retry/);
-    myChatNextResponse = { ok: true, rows: [makeActivityRow(1)] };
+    myChatNextResponse = { ok: true, messages: makeMessagePair(1) };
    fireEvent.click(retry);
-    await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
-    const retryUrl = myChatActivityCalls[1];
+    await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
+    const retryUrl = myChatHistoryCalls[1];
+    expect(retryUrl).toContain("/chat-history");
    expect(retryUrl).toContain("limit=10");
    expect(retryUrl).not.toContain("limit=50");
  });

  it("loadOlder fetches limit=20 with before_ts=oldest.timestamp", async () => {
-    // Initial page = 10 rows in newest-first order (seq 10..1). After
-    // the component reverses to oldest-first for display, messages[0]
-    // is built from seq=1 — the oldest — and its timestamp is what
-    // before_ts should carry.
-    myChatNextResponse = { ok: true, rows: newestFirstPage(1, 10) };
+    // Initial page = 10 row-pairs in oldest-first order (seq 1..10).
+    // The oldest (and so the cursor for loadOlder) is seq=1's
+    // timestamp 2026-05-05T00:01:00Z.
+    myChatNextResponse = {
+      ok: true,
+      messages: pageOldestFirst(1, 10),
+      reachedEnd: false,
+    };
    render(<ChatTab workspaceId="ws-load-older" data={minimalData} />);
-    await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
+    await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
    await waitFor(() => expect(ioInstances.length).toBeGreaterThan(0));

-    // Stage the older-batch response, then fire the IO callback.
-    myChatNextResponse = { ok: true, rows: newestFirstPage(0, 1) };
+    // Stage older-batch response, then fire IO callback.
+    myChatNextResponse = {
+      ok: true,
+      messages: pageOldestFirst(0, 1),
+      reachedEnd: true,
+    };
    triggerIntersection();

-    await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
-    const olderUrl = myChatActivityCalls[1];
+    await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
+    const olderUrl = myChatHistoryCalls[1];
+    expect(olderUrl).toContain("/chat-history");
    expect(olderUrl).toContain("limit=20");
    expect(olderUrl).toContain("before_ts=");
    expect(decodeURIComponent(olderUrl)).toContain("before_ts=2026-05-05T00:01:00Z");
  });

  it("inflight guard rejects a second IO trigger while first loadOlder is in flight", async () => {
-    myChatNextResponse = { ok: true, rows: newestFirstPage(1, 10) };
+    myChatNextResponse = {
+      ok: true,
+      messages: pageOldestFirst(1, 10),
+      reachedEnd: false,
+    };
    render(<ChatTab workspaceId="ws-inflight" data={minimalData} />);
-    await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
+    await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
    await waitFor(() => expect(ioInstances.length).toBeGreaterThan(0));

    // Hold the next loadOlder fetch open with a manual deferred so we
    // can fire the second trigger while the first is in-flight.
-    let release!: (rows: unknown[]) => void;
-    const deferred = new Promise<unknown[]>((res) => {
+    let release!: (resp: unknown) => void;
+    const deferred = new Promise<unknown>((res) => {
      release = res;
    });
    apiGet.mockImplementationOnce((path: string): Promise<unknown> => {
-      myChatActivityCalls.push(path);
+      myChatHistoryCalls.push(path);
      return deferred;
    });

    triggerIntersection(); // start loadOlder #1
-    await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
+    await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));

    // Second IO trigger lands while #1 is still pending.
    triggerIntersection();
@@ -258,79 +274,62 @@ describe("ChatTab lazy history pagination", () => {
    // Without the inflight guard, each of these would have started a
    // new fetch. With the guard, none of them do — call count stays 2.
    await new Promise((r) => setTimeout(r, 10));
-    expect(myChatActivityCalls.length).toBe(2);
+    expect(myChatHistoryCalls.length).toBe(2);

-    // Release the first fetch. Inflight clears in the finally block;
-    // a subsequent IO trigger is permitted again (verified by checking
-    // we can fire a follow-up after release without hanging the test).
-    release([]);
-    await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
+    // Release the first fetch with a valid wire response shape.
+    release({ messages: [], reached_end: true });
+    await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
  });

  it("empty older response clears the scroll anchor and unmounts the sentinel", async () => {
-    // The bug we're pinning: if loadOlder returns 0 rows, the
-    // scrollAnchorRef must be cleared so the next paint doesn't try to
-    // restore against a no-op prepend (which would fight the natural
-    // bottom-pin for any subsequent live message). hasMore flipping to
-    // false is the same flag-flip path; sentinel disappearing is the
-    // observable proxy.
-    myChatNextResponse = { ok: true, rows: newestFirstPage(1, 10) };
+    myChatNextResponse = {
+      ok: true,
+      messages: pageOldestFirst(1, 10),
+      reachedEnd: false,
+    };
    render(<ChatTab workspaceId="ws-anchor" data={minimalData} />);
-    await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
+    await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
    await waitFor(() => expect(ioInstances.length).toBeGreaterThan(0));

-    myChatNextResponse = { ok: true, rows: [] }; // empty → reachedEnd
+    myChatNextResponse = {
+      ok: true,
+      messages: [],
+      reachedEnd: true,
+    };
    triggerIntersection();
-    await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
+    await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));

-    // After reachedEnd the sentinel unmounts (hasMore=false). We can't
-    // peek scrollAnchorRef directly, but we can assert the consequence:
-    // scrollIntoView (the bottom-pin for live appends) is not blocked
-    // by a stale anchor. Trigger a re-render via an unrelated state
-    // change… in practice the safest assertion here is that the
-    // sentinel disappeared (proving the empty response propagated to
-    // hasMore correctly, which is the same flag-flip path as anchor
-    // clearing).
    await waitFor(() => {
      expect(screen.queryByText(/Loading older messages/i)).toBeNull();
    });
  });

  it("IntersectionObserver does not churn when older messages prepend", async () => {
-    // Whole-PR perf invariant: prepending older history (the load-bearing
-    // user gesture) must NOT tear down + re-arm the IO observer.
-    // Triggering loadOlder is the cleanest way to drive a messages
-    // mutation from inside the test, since live agent push goes through
-    // a Zustand store that's harder to drive reliably from jsdom.
-    //
-    // Pre-fix, loadOlder depended on `messages`, so every prepend
-    // recreated loadOlder → re-ran the IO effect → new observer. Each
-    // call to triggerIntersection() produced a fresh disconnected
-    // observer + a new live one. Post-fix, the observer survives.
-    myChatNextResponse = { ok: true, rows: newestFirstPage(1, 10) };
+    myChatNextResponse = {
+      ok: true,
+      messages: pageOldestFirst(1, 10),
+      reachedEnd: false,
+    };
    render(<ChatTab workspaceId="ws-stable-io" data={minimalData} />);
-    await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
+    await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
    await waitFor(() => expect(ioInstances.length).toBeGreaterThan(0));

-    // Snapshot the observer instance after first paint stabilises.
    const observerBefore = ioInstances.at(-1);
    expect(observerBefore).toBeDefined();
    expect(observerBefore!.disconnected).toBe(false);

    // Trigger three older-batch prepends. Each batch returns the full
-    // OLDER_HISTORY_BATCH (20 rows) so reachedEnd stays false and the
-    // sentinel keeps mounting. Pre-fix, each prepend mutated `messages`
-    // → recreated loadOlder → re-ran the IO effect → new observer.
+    // OLDER_HISTORY_BATCH (20 row-pairs = 40 messages) so reachedEnd
+    // stays false and the sentinel keeps mounting.
    for (let batch = 0; batch < 3; batch++) {
      myChatNextResponse = {
        ok: true,
-        rows: newestFirstPage(-(batch + 1) * 20, 20),
+        messages: pageOldestFirst(-(batch + 1) * 20, 20),
+        reachedEnd: false,
      };
-      const callsBefore = myChatActivityCalls.length;
+      const callsBefore = myChatHistoryCalls.length;
      triggerIntersection();
-      await waitFor(() =>
-        expect(myChatActivityCalls.length).toBe(callsBefore + 1),
-      );
+      await waitFor(() => expect(myChatHistoryCalls.length).toBe(callsBefore + 1));
    }

    // The original observer is still the live one — no churn.
@@ -7,32 +7,6 @@ export default defineConfig({
  test: {
    environment: 'node',
    exclude: ['e2e/**', 'node_modules/**', '**/dist/**'],
-    // CI-conditional test timeout (issue #96).
-    //
-    // Vitest's 5000ms default is too tight for the first test in any
-    // file under our CI shape: `npx vitest run --coverage` on the
-    // self-hosted Gitea Actions Docker runner. The cold-start cost
-    // (v8 coverage instrumentation init + JSDOM bootstrap + module-
-    // graph import for @/components/* and @/lib/* + first React
-    // render) consistently consumes 5-7 seconds for the first
-    // synchronous test in heavyweight component files
-    // (ActivityTab.test.tsx, CreateWorkspaceDialog.test.tsx,
-    // ConfigTab.provider.test.tsx) — even though every subsequent
-    // test in the same file completes in 100-1500ms.
-    //
-    // Empirically the worst observed first-test was 6453ms in a
-    // single file (CreateWorkspaceDialog). 30000ms gives ~5x
-    // headroom over that on CI; we still keep 5000ms locally so
-    // genuine waitFor races / hung promises stay sensitive in dev.
-    //
-    // Same vitest pattern documented at:
-    //   https://vitest.dev/config/testtimeout
-    //   https://vitest.dev/guide/coverage#profiling-test-performance
-    //
-    // Per-test duration is still emitted to the CI log; if a test
-    // ever silently approaches 25-30s under this raised ceiling that
-    // will surface as a duration regression and we revisit.
-    testTimeout: process.env.CI ? 30000 : 5000,
    // Coverage is instrumented but NOT yet a CI gate — first land
    // observability so we can see the baseline, then dial in
    // thresholds + a hard gate in a follow-up PR (#1815). Today's
@@ -212,8 +212,8 @@ services:
    #   docker compose pull canvas && docker compose up -d canvas
    # First-time local setup or testing unreleased changes — build from source:
    #   docker compose build canvas && docker compose up -d canvas
-    # Note: GHCR images are private — `docker login ghcr.io` required before pull.
-    image: ghcr.io/molecule-ai/canvas:latest
+    # Note: ECR images require AWS auth — `aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 153263036946.dkr.ecr.us-east-2.amazonaws.com` before pull.
+    image: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas:latest
    build:
      context: ./canvas
      dockerfile: Dockerfile
@@ -1,74 +0,0 @@
-# ADR-002: Local-build mode signalled by `MOLECULE_IMAGE_REGISTRY` presence
-
-* Status: Accepted (2026-05-07)
-* Issue: #63 (closes Task #194)
-* Decision: Hongming (CTO) + Claude Opus 4.7 (implementation)
-
-## Context
-
-Pre-2026-05-06, every Molecule deployment — both production tenants and OSS contributor laptops — pulled workspace-template-* container images from `ghcr.io/molecule-ai/`. Production tenants additionally set `MOLECULE_IMAGE_REGISTRY` to an AWS ECR mirror via Railway env / EC2 user-data, but the OSS default was the upstream GHCR org.
-
-On 2026-05-06 the `Molecule-AI` GitHub org was suspended (saved memory: `feedback_github_botring_fingerprint`). GHCR now returns **403 Forbidden** for every `molecule-ai/workspace-template-*` manifest. OSS contributors who clone `molecule-core` and run `go run ./workspace-server/cmd/server` cannot provision a workspace — every first provision fails with:
-
-```
-docker image "ghcr.io/molecule-ai/workspace-template-claude-code:latest" not found after pull attempt
-```
-
-Production tenants are unaffected (their `MOLECULE_IMAGE_REGISTRY` points at ECR, which we still control), but OSS onboarding is broken. Workspace template repos are intentionally separate from `molecule-core` (each runtime is OSS-shape and forkable), and they are mirrored to Gitea (`https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-<runtime>`) — but the provisioner has no path that consumes Gitea source directly.
-
-## Decision
-
-When `MOLECULE_IMAGE_REGISTRY` is **unset** (or empty), the provisioner switches to a **local-build mode** that:
-
-1. Looks up the workspace-template repo's HEAD sha on Gitea via a single API call.
-2. Checks whether a SHA-pinned local image (`molecule-local/workspace-template-<runtime>:<sha12>`) already exists; if so, reuses it.
-3. Otherwise shallow-clones the repo into `~/.cache/molecule/workspace-template-build/<runtime>/<sha12>/` and runs `docker build --platform=linux/amd64 -t <tag> .`.
-4. Hands the SHA-pinned tag to Docker for ContainerCreate, bypassing the registry-pull path entirely.
-
-When `MOLECULE_IMAGE_REGISTRY` is **set**, behavior is unchanged: pull the image from that registry. Existing prod tenants and self-hosters who mirror to a private registry are not affected.
-
-## Consequences
-
-### Positive
-
-* **Zero-config OSS onboarding** — `git clone molecule-core && go run ./workspace-server/cmd/server` boots end-to-end without any registry credentials.
-* **Production tenants protected** — same env var, same semantics in SaaS-mode. Migration is a no-op.
-* **No new env var** — extending an existing var's semantics ("where to pull, OR build locally if absent") rather than introducing `MOLECULE_LOCAL_BUILD=1` keeps the surface small.
-* **SHA-pinned cache** — repeat builds are O(API-call); only template-repo HEAD changes invalidate.
-* **Production-parity image** — amd64 emulation on Apple Silicon honours `feedback_local_must_mimic_production`. The provisioner's existing `defaultImagePlatform()` already forces amd64 for parity; building amd64 locally lets that decision stay consistent.
-
-### Negative
-
-* **Conflates two concerns** — `MOLECULE_IMAGE_REGISTRY` now signals BOTH "where to pull" AND "build locally if absent." A future operator who unsets it expecting a hard error will instead get a slow first-provision. Documented in the runbook.
-* **First-provision is slow on Apple Silicon** — 5–10 min via QEMU emulation on the cold path. Mitigated by SHA-cache (subsequent runs are <1s lookup + 0s build).
-* **Coverage gap** — only 4 of 9 runtimes are mirrored to Gitea today (`claude-code`, `hermes`, `langgraph`, `autogen`). The other 5 fail with an actionable "not mirrored" error. Mirroring those repos is a separate task.
-* **Implicit trust boundary** — operator running `go run` implicitly trusts `molecule-ai/molecule-ai-workspace-template-*` repos on Gitea. This is the same trust they would extend to the GHCR images today; not a new attack surface.
-
-## Alternatives considered
-
-1. **New env var `MOLECULE_LOCAL_BUILD=1`** — explicit, but requires OSS contributors to know it exists. Violates the zero-config goal.
-2. **Push pre-built images to a Gitea container registry, mirror tag from upstream** — operationally cleaner but: (a) Gitea's container-registry add-on isn't deployed on the operator host, (b) defeats the OSS-contributor goal of "hack on the source, see your changes," since they'd still pull a stale image.
-3. **Embed Dockerfiles in molecule-core itself, drop the standalone template repos** — would work but breaks the OSS-shape principle; templates are intentionally separable, anyone-can-fork artifacts.
-4. **Build native arch on Apple Silicon (arm64) and drop the platform pin in local-mode** — fast, but creates `linux/arm64` images that diverge from the amd64-only prod runtime. Local-vs-prod debug behavior would diverge. Rejected per `feedback_local_must_mimic_production`.
-
-## Security review
-
-* **Gitea repo URL allowlist** — runtime name must be in the `knownRuntimes` allowlist (defence-in-depth against a future code path that lets cfg.Runtime carry untrusted input). Repo prefix is hardcoded to `https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-`; forks can override via `MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX` (opt-in, default off).
-* **Token handling** — clones are anonymous over HTTPS by default (templates are public). `MOLECULE_GITEA_TOKEN`, if set, is passed via URL userinfo for the clone and as `Authorization: token` for the API call. The token is **masked in every log line** via `maskTokenInURL` / `maskTokenInString` and never appears in the cache dir path.
-* **No silent fallback** — if Gitea is unreachable or the runtime isn't mirrored, we return a clear error mentioning the repo URL and the missing runtime. We **never** fall back to GHCR/ECR (that would be a confusing bug for an OSS contributor who happened to have stale ECR creds in their docker config).
-* **Build-arg injection** — `docker build` is invoked with NO `--build-arg` from external input. Dockerfile is consumed as-is.
-* **Cache poisoning** — cache key is the Gitea HEAD sha + Dockerfile content; a force-push to the template repo's main branch regenerates the key on next run. Cache dir is per-user (`$HOME/.cache`), so cross-user attacks aren't relevant in single-user dev mode.
-
-## Versioning + back-compat
-
-* Existing prod tenants set `MOLECULE_IMAGE_REGISTRY=<ECR url>` → unchanged behavior.
-* Existing local installs that set the var → unchanged behavior.
-* Existing local installs that don't set it → switch to local-build path. Migration: none required (additive); first provision will take 5–10 min instead of failing.
-* No deprecations.
-
-## References
-
-* Issue #63 — feat(workspace-server): local-dev provisioner builds from Gitea source
-* Saved memory `feedback_local_must_mimic_production` — local docker must mimic prod, no bypasses
-* Saved memory `reference_post_suspension_pipeline` — full post-2026-05-06 stack shape
-* Saved memory `feedback_github_botring_fingerprint` — what got the org suspended
@@ -4,7 +4,7 @@ How a workspace-server code change reaches the prod tenant fleet — and how to

 > **⚠️ State note (2026-04-22):** this doc describes the **intended design**. As of this writing, the canary fleet described below is **not actually running** — no canary tenants are provisioned, `CANARY_TENANT_URLS` / `CANARY_ADMIN_TOKENS` / `CANARY_CP_SHARED_SECRET` are empty in repo secrets, and `canary-verify.yml` fails every run.
 >
-> Current merges gate on manual `promote-latest.yml` dispatches, not canary. See [molecule-controlplane/docs/canary-tenants.md](https://github.com/Molecule-AI/molecule-controlplane/blob/main/docs/canary-tenants.md) for the Phase 1 code work that's already shipped + the Phase 2 plan for actually standing up the fleet + a "should we even do this now?" decision framework.
+> Current merges gate on manual `promote-latest.yml` dispatches, not canary. See [molecule-controlplane/docs/canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/src/branch/main/docs/canary-tenants.md) for the Phase 1 code work that's already shipped + the Phase 2 plan for actually standing up the fleet + a "should we even do this now?" decision framework.
 >
 > **Account-specific identifiers (AWS account ID, IAM role name) referenced below in the original design have been redacted from this public doc.** The actual values — if they exist — are in `Molecule-AI/internal/runbooks/canary-fleet.md`. If you're implementing Phase 2, start there.
 >
@@ -1,7 +1,7 @@
 # Molecule AI — Comprehensive Technical Documentation

 > Definitive technical reference for the Molecule AI Agent Team platform.
-> Based on a full non-invasive scan of the [molecule-monorepo](https://github.com/Molecule-AI/molecule-monorepo) repository.
+> Based on a full non-invasive scan of the [molecule-monorepo](https://git.moleculesai.app/molecule-ai/molecule-monorepo) repository.

 ---

@@ -1149,11 +1149,11 @@ Molecule AI's workspace abstraction is **runtime-agnostic by design**. A workspa

 ## Links

- **GitHub**: https://github.com/Molecule-AI/molecule-monorepo
- **Architecture Docs**: https://github.com/Molecule-AI/molecule-monorepo/tree/main/docs/architecture
- **API Protocol**: https://github.com/Molecule-AI/molecule-monorepo/tree/main/docs/api-protocol
- **Agent Runtime**: https://github.com/Molecule-AI/molecule-monorepo/tree/main/docs/agent-runtime
- **Product Docs**: https://github.com/Molecule-AI/molecule-monorepo/tree/main/docs/product
+- **Source (Gitea)**: https://git.moleculesai.app/molecule-ai/molecule-monorepo
+- **Architecture Docs**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/architecture
+- **API Protocol**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/api-protocol
+- **Agent Runtime**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/agent-runtime
+- **Product Docs**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/product

 ---

@@ -79,7 +79,7 @@ For SOC2 / ISO 27001 / customer security questionnaires:

 ## Pointers

- KMS envelope code: [`molecule-controlplane/internal/crypto/kms.go`](https://github.com/Molecule-AI/molecule-controlplane/blob/main/internal/crypto/kms.go)
- Static-key fallback: [`molecule-controlplane/internal/crypto/aes.go`](https://github.com/Molecule-AI/molecule-controlplane/blob/main/internal/crypto/aes.go)
+- KMS envelope code: [`molecule-controlplane/internal/crypto/kms.go`](https://git.moleculesai.app/molecule-ai/molecule-controlplane/src/branch/main/internal/crypto/kms.go)
+- Static-key fallback: [`molecule-controlplane/internal/crypto/aes.go`](https://git.moleculesai.app/molecule-ai/molecule-controlplane/src/branch/main/internal/crypto/aes.go)
 - Tenant secrets handler: [`workspace-server/internal/crypto/aes.go`](../../workspace-server/internal/crypto/aes.go)
 - Tenant secrets schema: [database-schema.md](./database-schema.md#workspace_secrets)
@@ -0,0 +1,28 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64">
+  <style>
+    .bg { fill: #0a1120; }
+    .accent { fill: #7fe8d6; }
+    .accent-stroke { stroke: #7fe8d6; }
+    @media (prefers-color-scheme: light) {
+      .bg { fill: #f5f7fa; }
+      .accent { fill: #1a8a72; }
+      .accent-stroke { stroke: #1a8a72; }
+    }
+  </style>
+  <rect class="bg" width="64" height="64" rx="14"/>
+  <g class="accent-stroke" stroke-width="2.4" stroke-linecap="round" fill="none">
+    <line x1="32" y1="32" x2="12" y2="14"/>
+    <line x1="32" y1="32" x2="52" y2="18"/>
+    <line x1="32" y1="32" x2="10" y2="40"/>
+    <line x1="32" y1="32" x2="54" y2="44"/>
+    <line x1="32" y1="32" x2="32" y2="56"/>
+  </g>
+  <g class="accent">
+    <circle cx="32" cy="32" r="6.5"/>
+    <circle cx="12" cy="14" r="3.5"/>
+    <circle cx="52" cy="18" r="3.5"/>
+    <circle cx="10" cy="40" r="3.5"/>
+    <circle cx="54" cy="44" r="3.5"/>
+    <circle cx="32" cy="56" r="3.5"/>
+  </g>
+</svg>
@@ -0,0 +1,17 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64" role="img" aria-label="Molecule AI">
+  <g stroke="#7fe8d6" stroke-width="2.6" stroke-linecap="round" fill="none">
+    <line x1="32" y1="32" x2="12" y2="14"/>
+    <line x1="32" y1="32" x2="52" y2="18"/>
+    <line x1="32" y1="32" x2="10" y2="40"/>
+    <line x1="32" y1="32" x2="54" y2="44"/>
+    <line x1="32" y1="32" x2="32" y2="56"/>
+  </g>
+  <g fill="#7fe8d6">
+    <circle cx="32" cy="32" r="7"/>
+    <circle cx="12" cy="14" r="3.6"/>
+    <circle cx="52" cy="18" r="3.6"/>
+    <circle cx="10" cy="40" r="3.6"/>
+    <circle cx="54" cy="44" r="3.6"/>
+    <circle cx="32" cy="56" r="3.6"/>
+  </g>
+</svg>
@@ -299,8 +299,8 @@ Or use the Canvas UI: Workspace → Config → MCP Servers → Add browser MCP s

 **Try it free** — Molecule AI is open source and self-hostable. Get a workspace running in under 5 minutes.

-→ [Get started on GitHub →](https://github.com/Molecule-AI/molecule-core)
+→ [Get started on Gitea →](https://git.moleculesai.app/molecule-ai/molecule-core)

 ---

-*Have a browser automation use case you want to see covered? Open a discussion on [GitHub Discussions](https://github.com/Molecule-AI/molecule-core/discussions) — or file an issue with the `enhancement` label.*
+*Have a browser automation use case you want to see covered? File an issue with the `enhancement` label on the [molecule-core issue tracker](https://git.moleculesai.app/molecule-ai/molecule-core/issues).*
@@ -148,7 +148,7 @@ Then follow the [quick-start guide](/docs/guides/remote-workspaces.md).
 Or run the annotated example directly:

 ```bash
-git clone https://github.com/Molecule-AI/molecule-sdk-python
+git clone https://git.moleculesai.app/molecule-ai/molecule-sdk-python
 cd molecule-sdk-python/examples/remote-agent
 # Create workspace with runtime:external, grab the ID, then:
 WORKSPACE_ID=<your-id> PLATFORM_URL=https://acme.moleculesai.app python3 run.py
@@ -160,6 +160,6 @@ The agent appears on the canvas within seconds.

 → [Remote Workspaces Guide →](/docs/guides/remote-workspaces.md)
 → [External Agent Registration Reference →](/docs/guides/external-agent-registration.md)
-→ [molecule-sdk-python →](https://github.com/Molecule-AI/molecule-sdk-python)
+→ [molecule-sdk-python →](https://git.moleculesai.app/molecule-ai/molecule-sdk-python)

 *Phase 30 shipped in PRs #1075–#1083 and #1085–#1100 on `molecule-core`.*
@@ -133,4 +133,4 @@ With protocol-native A2A, you get:

 Molecule AI's external agent registration is production-ready. Documentation is live at [External Agent Registration Guide](https://docs.molecule.ai/docs/guides/external-agent-registration). The npm package for the MCP server is available at [`@molecule-ai/mcp-server`](https://www.npmjs.com/package/@molecule-ai/mcp-server).

-Read the full [A2A v1.0 protocol spec](https://github.com/Molecule-AI/molecule-core/blob/main/docs/api-protocol/a2a-protocol.md) on GitHub.
+Read the full [A2A v1.0 protocol spec](https://git.moleculesai.app/molecule-ai/molecule-core/src/branch/main/docs/api-protocol/a2a-protocol.md) in the molecule-core repository.
@@ -45,7 +45,7 @@ canonicalUrl: "https://docs.molecule.ai/blog/remote-workspaces"
  "proficiencyLevel": "Expert",
  "genre": ["technical documentation", "product announcement"],
  "sameAs": [
-    "https://github.com/Molecule-AI/molecule-core",
+    "https://git.moleculesai.app/molecule-ai/molecule-core",
    "https://molecule.ai"
  ]
 }
@@ -270,7 +270,7 @@ Configure it in your project's `.mcp.json` and any AI agent (Claude Code, Cursor

 → [External Agent Registration Guide](/docs/guides/external-agent-registration) — full step-by-step with Python and Node.js reference implementations

-→ [GitHub: molecule-core](https://github.com/Molecule-AI/molecule-core) — source and issues
+→ [Gitea: molecule-core](https://git.moleculesai.app/molecule-ai/molecule-core) — source and issues

 → [Phase 30 Launch Thread on X](https://x.com) — follow for updates

@@ -170,4 +170,4 @@ The `staging` branch is now on `a2a-sdk` 1.0.0. The `main` branch still carries

 If you're running `a2a-sdk` 0.3.x and planning the 1.0.0 migration, this post is the reference. The four breaking changes are well-contained, the migration is a single PR, and the eight smoke scenarios above will tell you whether the upgrade is clean before you merge.

-Questions? The [A2A protocol spec](https://github.com/google-a2a/a2a-specification) is the authoritative source. For Molecule AI's production A2A implementation, see [External Agent Registration](https://docs.molecule.ai/docs/guides/external-agent-registration) or open an issue in the [molecule-core](https://github.com/Molecule-AI/molecule-core) repo.
+Questions? The [A2A protocol spec](https://github.com/google-a2a/a2a-specification) is the authoritative source. For Molecule AI's production A2A implementation, see [External Agent Registration](https://docs.molecule.ai/docs/guides/external-agent-registration) or open an issue in the [molecule-core](https://git.moleculesai.app/molecule-ai/molecule-core) repo.
@@ -1,41 +1,5 @@
 # Local Development

-## Workspace Template Images: Local-Build Mode (Issue #63)
-
-OSS contributors who run `molecule-core` locally do **not** need to authenticate to GHCR or AWS ECR. When the `MOLECULE_IMAGE_REGISTRY` env var is **unset**, the platform automatically:
-
-1. Looks up the HEAD sha of `https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-<runtime>` (single API call, no clone).
-2. If a local image tagged `molecule-local/workspace-template-<runtime>:<sha12>` already exists, reuses it (cache hit).
-3. Otherwise, shallow-clones the repo into `~/.cache/molecule/workspace-template-build/<runtime>/<sha12>/` and runs `docker build --platform=linux/amd64 -t <tag> .`.
-4. Hands the SHA-pinned tag to Docker for `ContainerCreate`.
-
-**First-provision build time:** 5–10 min on Apple Silicon (amd64 emulation). Subsequent provisions hit the cache and start in seconds. Cache is invalidated automatically when the template repo's HEAD moves.
-
-**Currently mirrored on Gitea:** `claude-code`, `hermes`, `langgraph`, `autogen`. Other runtimes (`crewai`, `deepagents`, `codex`, `gemini-cli`, `openclaw`) fail with an actionable "not mirrored to Gitea" error pointing at the missing repo.
-
-**Production tenants are unaffected** — every prod tenant sets `MOLECULE_IMAGE_REGISTRY` to its private ECR mirror via Railway env / EC2 user-data, so the SaaS pull path stays identical.
-
-### Environment overrides
-
-| Var | Default | Use case |
-|-----|---------|----------|
-| `MOLECULE_IMAGE_REGISTRY` | (unset) | Set to a real registry URL to switch from local-build to SaaS-pull mode. |
-| `MOLECULE_LOCAL_BUILD_CACHE` | `~/.cache/molecule/workspace-template-build` | Override cache directory. |
-| `MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX` | `https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-` | Point at a fork. |
-| `MOLECULE_GITEA_TOKEN` | (unset) | Required only if your fork has private template repos. |
-
-### Verifying a switch from the GHCR-retag stopgap
-
-Pre-fix, OSS contributors worked around the suspended GHCR org by manually retagging an `:latest` image. After this change, that workaround is **redundant**: simply unset `MOLECULE_IMAGE_REGISTRY` (or leave it unset), boot the platform, and provision a workspace. Logs will show:
-
-```
-Provisioner: local-build mode → using locally-built image molecule-local/workspace-template-claude-code:<sha12> for runtime claude-code
-local-build: cloning https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-claude-code → ...
-local-build: docker build done in <duration>
-```
-
-If you still see `ghcr.io/molecule-ai/...` in the boot log, double-check `env | grep MOLECULE_IMAGE_REGISTRY` — a stale shell export from the pre-fix workaround could keep SaaS-mode active.
-
 ## Starting the Stack

 ```bash
@@ -1,147 +0,0 @@
-# Rate-limit observability runbook
-
-> Companion to issue #64 ("RATE_LIMIT default re-tune analysis"). After
-> #60 deployed the per-tenant `keyFor` keying, the right RATE_LIMIT
-> default became data-dependent. This runbook documents the metrics +
-> queries an operator should run to confirm whether the current 600
-> req/min/key default is correct, too tight, or too loose.
-
-## What's already exposed
-
-The workspace-server's existing Prometheus middleware
-(`workspace-server/internal/metrics/metrics.go`) tracks every request
-on every path:
-
-```
-molecule_http_requests_total{method, path, status}      counter
-molecule_http_request_duration_seconds_total{method,path,status}  counter
-```
-
-Path is the matched route pattern (`/workspaces/:id/activity` etc), so
-high-cardinality workspace UUIDs do not explode the label space.
-
-The rate limiter middleware (#60, `workspace-server/internal/middleware/ratelimit.go`)
-also stamps every response with `X-RateLimit-Limit`, `X-RateLimit-Remaining`,
-and `X-RateLimit-Reset`. Operators with browser-side or proxy-side
-header capture can read per-request bucket state directly.
-
-No new instrumentation is needed for #64's acceptance criteria. The
-metric surface is sufficient — this runbook just collects the queries.
-
-## Queries to run after #60 deploys
-
-### 1. Is the bucket actually firing 429s?
-
-```promql
-sum(rate(molecule_http_requests_total{status="429"}[5m]))
-```
-
-If this is zero on a given tenant, the bucket isn't being hit. If it's
-sustained > 1/min, dig in.
-
-### 2. Which routes attract 429s?
-
-```promql
-topk(
-  10,
-  sum by (path) (
-    rate(molecule_http_requests_total{status="429"}[5m])
-  )
-)
-```
-
-Expected shape post-#60:
- `/workspaces/:id/activity` should be near zero — the canvas no longer
-  polls it on a 30s/60s/5s cadence (PRs #69 / #71 / #76).
- Probe / health / heartbeat paths should be ~0 (those routes have a
-  separate IP-fallback bucket).
-
-If `/workspaces/:id/activity` 429s persist post-PRs-69/71/76 deploy, the
-canvas isn't running the WS-subscriber path — investigate WS health
-on that tenant.
-
-### 3. Per-bucket-key inference (no direct exposure today)
-
-The bucket map itself is in-memory only; we deliberately do **not**
-expose `org:<uuid>` ↔ remaining-tokens because that map can include
-SHA-256 hashes of bearer tokens. A tenant that wants per-key visibility
-should rely on response headers (`X-RateLimit-Remaining` on every
-response from a given session is the bucket's view of that session).
-
-If you genuinely need server-side per-bucket counts for triage,
-file a follow-up — the proper shape is a `/internal/ratelimit-stats`
-endpoint that emits **counts per key prefix only** (e.g. `org:`, `tok:`,
-`ip:`), never the key payloads. Don't roll that ad-hoc; it's a security
-review surface.
-
-## Decision tree for the re-tune
-
-After 14 days of production traffic on a tenant, look at the queries
-above and walk this tree:
-
-```
-Q1: Is the 429 rate sustained > 0.1/sec on any tenant?
-  ├─ NO  → The 600 default has comfortable headroom. Either keep it,
-  │        or lower it carefully (300) ONLY if you have a documented
-  │        reason (e.g. a misbehaving client we want to throttle harder).
-  │        Default to "no change" — see #64 for the math.
-  └─ YES → Q2.
-
-Q2: Is the 429 rate concentrated on ONE tenant or spread across many?
-  ├─ ONE tenant → Operator override: set RATE_LIMIT=1200 or 1800 on that
-  │               tenant's box. Document in the tenant's ops note. The
-  │               default does not need to change.
-  └─ MANY tenants → Q3.
-
-Q3: Are the 429s on a route that polls (e.g. /activity / /peers)?
-  ├─ YES → Confirm PRs #69, #71, #76 have actually deployed to those
-  │         tenants. If they have and 429s persist, the canvas may have
-  │         a regression — do not raise RATE_LIMIT. File a canvas issue.
-  └─ NO  → 429s on mutating routes mean genuine load. Raise the default
-            to 1200 in `workspace-server/internal/router/router.go:54`.
-            Same PR should attach: the metric chart, the time window,
-            and a paragraph explaining what changed in our traffic shape.
-```
-
-## Alert rule template (drop-in for Prometheus)
-
-```yaml
-# Sustained 429s — file is the SLO trip-wire. If this fires, walk the
-# decision tree above. NB: the issue#64 acceptance criterion is "two
-# weeks of metrics"; this alert is the inverse — it tells you something
-# changed before the two weeks are up.
-groups:
-  - name: workspace-server-ratelimit
-    rules:
-      - alert: WorkspaceServerRateLimit429Sustained
-        expr: |
-          sum by (instance) (
-            rate(molecule_http_requests_total{status="429"}[10m])
-          ) > 0.1
-        for: 30m
-        labels:
-          severity: warning
-          owner: workspace-server
-        annotations:
-          summary: "{{ $labels.instance }} sustained 429s — see ratelimit-observability runbook"
-          runbook: "https://git.moleculesai.app/molecule-ai/molecule-core/blob/main/docs/engineering/ratelimit-observability.md"
-```
-
-Threshold rationale: 0.1 req/s = 6/min sustained over 10min. Below
-that, a 429 is almost certainly a transient burst that the canvas's
-retry-once handler at `canvas/src/lib/api.ts:55` already absorbs. The
-30m `for:` keeps the alert from chattering on a brief blip.
-
-## Companion probe script
-
-For one-off triage when an operator can reproduce the problem in their
-own browser, `scripts/edge-429-probe.sh` (#62) reproduces a canvas-
-sized burst against a tenant subdomain and dumps each 429's response
-shape so the operator can distinguish workspace-server bucket overflow
-from CF/Vercel edge rate-limiting without dashboard access.
-
-```sh
-./scripts/edge-429-probe.sh hongming.moleculesai.app --burst 80 --out /tmp/edge.txt
-```
-
-The script's report header explains how to read the output.
@@ -215,7 +215,7 @@ Push mode (this guide) works today but requires an inbound-reachable URL — whi

 Your agent makes only outbound HTTPS calls to the platform, pulling messages from an inbox queue and posting replies back. Works behind any NAT/firewall, tolerates offline laptops, no tunnel needed.

-See the [design doc](https://github.com/Molecule-AI/internal/blob/main/product/external-workspaces-polling.md) (internal) and [implementation tracking issue](https://github.com/Molecule-AI/molecule-core/issues?q=polling+mode) once opened.
+See the [design doc](https://git.moleculesai.app/molecule-ai/internal/src/branch/main/product/external-workspaces-polling.md) (internal) and the implementation tracking issue (search for "polling mode" on the [molecule-core issue tracker](https://git.moleculesai.app/molecule-ai/molecule-core/issues)).

 ---

@@ -143,5 +143,5 @@ The agent appears on the canvas with a **purple REMOTE badge** within seconds. F
 ## Next Steps

 - **[External Agent Registration Guide →](/docs/guides/external-agent-registration)** — full endpoint reference, Python + Node.js examples, troubleshooting
- **[molecule-sdk-python →](https://github.com/Molecule-AI/molecule-sdk-python)** — SDK source, `RemoteAgentClient` API docs
- **[SDK Examples →](https://github.com/Molecule-AI/molecule-sdk-python/tree/main/examples/remote-agent)** — `run.py` demo script, annotated walkthrough
+- **[molecule-sdk-python →](https://git.moleculesai.app/molecule-ai/molecule-sdk-python)** — SDK source, `RemoteAgentClient` API docs
+- **[SDK Examples →](https://git.moleculesai.app/molecule-ai/molecule-sdk-python/src/branch/main/examples/remote-agent)** — `run.py` demo script, annotated walkthrough
@@ -61,7 +61,7 @@ molecule skills install arxiv-research --from community

 Community skills are reviewed by the Molecule AI team before being
 listed. Submit a skill for review by opening a PR against
-[`molecule-ai/skills`](https://github.com/Molecule-AI/skills).
+[`molecule-ai/skills`](https://git.moleculesai.app/molecule-ai/skills).

 ## Installing via config.yaml

@@ -151,7 +151,7 @@ molecule skills bundle my-custom-skill --output ./org-templates/my-role/
 ```

 **Publishing to the community:** Open a PR against
-[`molecule-ai/skills`](https://github.com/Molecule-AI/skills) with a
+[`molecule-ai/skills`](https://git.moleculesai.app/molecule-ai/skills) with a
 complete skill package. Community skills are reviewed for security and
 correctness before listing.

@@ -58,11 +58,8 @@ green — proves wire shape end-to-end against a real `hermes gateway run`
 subprocess + stub OpenAI-compat LLM. Caught + fixed a real `KeyError`
 in upstream `hermes_cli/tools_config.py` (PLATFORMS dict lookup
 crashed on plugin platforms) — fix on the patched fork branch
-(`molecule-ai/hermes-agent` `feat/platform-adapter-plugins`, commit
-`18e4849e`, hosted on Gitea at
-`https://git.moleculesai.app/molecule-ai/hermes-agent` — moved from the
-suspended `github.com/HongmingWang-Rabbit/hermes-agent`, see
-`molecule-ai/internal#72`). Upstream PR #18775 OPEN; CONFLICTING with main.
+(`HongmingWang-Rabbit/hermes-agent` `feat/platform-adapter-plugins`,
+commit `18e4849e`). Upstream PR #18775 OPEN; CONFLICTING with main.
 Not on critical path for our platform — patched fork is what the
 workspace image installs.

@@ -99,7 +96,7 @@ fork needed in production.
  `resolve_platform_id` for plugin-platform-safe deserialization, and
  `self.adapters[adapter.platform]` keying fix (caught by real-subprocess
  test before merge — see below).
- **Plugin package**: [Molecule-AI/hermes-platform-molecule-a2a](https://github.com/Molecule-AI/hermes-platform-molecule-a2a)
+- **Plugin package**: [Molecule-AI/hermes-platform-molecule-a2a](https://git.moleculesai.app/molecule-ai/hermes-platform-molecule-a2a)
  v0.1.0 — public, MIT-licensed. 11 unit tests + 8 in-process E2E
  + 4 real-subprocess E2E checkpoints all green.
 - **Workspace template patch**: [Molecule-AI/molecule-ai-workspace-template-hermes#32](https://github.com/Molecule-AI/molecule-ai-workspace-template-hermes/pull/32)
@@ -157,7 +154,7 @@ intermediate shim earns its complexity.
 ## Codex (OpenAI Codex CLI)

 **Status:** Template SHIPPED. Repo live at
-[`Molecule-AI/molecule-ai-workspace-template-codex`](https://github.com/Molecule-AI/molecule-ai-workspace-template-codex)
+[`Molecule-AI/molecule-ai-workspace-template-codex`](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-codex)
 (14 files, 1411 LOC, 12/12 tests). molecule-core registration in
 [PR #2512](https://github.com/Molecule-AI/molecule-core/pull/2512).
 E2E with real A2A traffic remains.
@@ -17,7 +17,7 @@ This path is aligned to the current repository and current UI. It gets you from
 ## The one-command path

 ```bash
-git clone https://github.com/Molecule-AI/molecule-monorepo.git
+git clone https://git.moleculesai.app/molecule-ai/molecule-monorepo.git
 cd molecule-monorepo
 ./scripts/dev-start.sh
 ```
@@ -42,7 +42,7 @@ If you'd rather run each component yourself — useful when you're iterating on
 ### Step 1: Clone the repository

 ```bash
-git clone https://github.com/Molecule-AI/molecule-monorepo.git
+git clone https://git.moleculesai.app/molecule-ai/molecule-monorepo.git
 cd molecule-monorepo
 ```

@@ -1,137 +0,0 @@
-# Runbook — Handlers Postgres Integration port-collision substrate
-
-**Status:** Resolved 2026-05-08 (PR for class B Hongming-owned CICD red sweep).
-
-## Symptom
-
-`Handlers Postgres Integration` workflow fails on staging push and PRs.
-Step `Apply migrations to Postgres service` shows:
-
-```
-psql: error: connection to server at "127.0.0.1", port 5432 failed: Connection refused
-```
-
-Job-cleanup step further down logs:
-
-```
-Cleaning up services for job Handlers Postgres Integration
-failed to remove container: Error response from daemon: No such container: <id>
-```
-
-…confirming the postgres service container was already gone before
-cleanup ran.
-
-## Root cause
-
-Our Gitea act_runner (operator host `5.78.80.188`,
-`/opt/molecule/runners/config.yaml`) sets:
-
-```yaml
-container:
-  network: host
-```
-
-…which act_runner applies to BOTH the job container AND every
-`services:` container in a workflow. Multiple workflow instances
-running concurrently across the 16 parallel runners each try to bind
-postgres on `0.0.0.0:5432`. The first wins; subsequent instances exit
-immediately with:
-
-```
-LOG:  could not bind IPv4 address "0.0.0.0": Address in use
-HINT: Is another postmaster already running on port 5432?
-FATAL: could not create any TCP/IP sockets
-```
-
-act_runner sets `AutoRemove:true` on service containers, so Docker
-garbage-collects them as soon as they exit. By the time the migrations
-step runs `pg_isready` / `psql`, the container is gone and connection
-refused.
-
-Reproduction (operator host):
-
-```bash
-docker run --rm -d --name pg-A --network host \
-  -e POSTGRES_PASSWORD=test postgres:15-alpine
-docker run -d --name pg-B --network host \
-  -e POSTGRES_PASSWORD=test postgres:15-alpine
-docker logs pg-B   # FATAL: could not create any TCP/IP sockets
-```
-
-## Why per-job override doesn't work
-
-The natural fix — per-job `container.network` override — is silently
-ignored by act_runner. The runner log emits:
-
-```
---network and --net in the options will be ignored.
-```
-
-This is a documented act_runner constraint: container network is a
-runner-wide setting, not per-job. Source: gitea/act_runner config docs
-+ vegardit/docker-gitea-act-runner issue #7.
-
-Flipping the global `container.network` to `bridge` would break every
-other workflow in the repo (cache server discovery,
-`molecule-monorepo-net` peer access during integration tests, etc.) —
-unacceptable blast radius for a per-test bug.
-
-## Fix shape
-
-`handlers-postgres-integration.yml` no longer uses `services: postgres:`.
-It launches a sibling postgres container manually on the existing
-`molecule-monorepo-net` bridge network with a per-run unique name:
-
-```yaml
-env:
-  PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
-  PG_NETWORK: molecule-monorepo-net
-
-steps:
-  - name: Start sibling Postgres on bridge network
-    run: |
-      docker run -d --name "${PG_NAME}" --network "${PG_NETWORK}" \
-        ...
-        postgres:15-alpine
-      PG_HOST=$(docker inspect "${PG_NAME}" \
-        --format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}")
-      echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV"
-
-  # … migrations + tests use ${PG_HOST}, not 127.0.0.1 …
-
-  - if: always() && …
-    name: Stop sibling Postgres
-    run: docker rm -f "${PG_NAME}" || true
-```
-
-The host-net job container can reach a bridge-net container via the
-bridge IP directly (verified manually, 2026-05-08). Two parallel runs
-use different names + different bridge IPs — no collision.
-
-## Future-proofing
-
-Other workflows that hit the same shape (any `services:` with a
-fixed-port image) will exhibit the same failure mode under
-host-network runner config. Translate using this same pattern:
-
-1. Drop the `services:` block.
-2. Use `${{ github.run_id }}-${{ github.run_attempt }}` for unique
-   container name.
-3. Launch on `molecule-monorepo-net` (already trusted bridge in
-   `docker-compose.infra.yml`).
-4. Read back the bridge IP via `docker inspect` and export as a step env.
-5. `if: always()` cleanup step at the end.
-
-If the count of such workflows grows, factor into a composite action
-(`./.github/actions/sibling-postgres`) so the substrate logic lives
-in one place.
-
-## Related
-
-- Issue #88 (closed by #92): localhost → 127.0.0.1 fix that unmasked
-  this collision; the IPv6 fix is correct, port collision is the new
-  layer.
-- Issue #94 created `molecule-monorepo-net` + `alpine:latest` as
-  prereqs.
-- Saved memory `feedback_act_runner_github_server_url` documents
-  another act_runner-vs-GHA divergence (server URL).
@@ -98,14 +98,14 @@ Each of the 8 adapter template repos contains:

 | Adapter | Repo |
 |---------|------|
-| claude-code | https://github.com/Molecule-AI/molecule-ai-workspace-template-claude-code |
-| langgraph | https://github.com/Molecule-AI/molecule-ai-workspace-template-langgraph |
-| crewai | https://github.com/Molecule-AI/molecule-ai-workspace-template-crewai |
-| autogen | https://github.com/Molecule-AI/molecule-ai-workspace-template-autogen |
-| deepagents | https://github.com/Molecule-AI/molecule-ai-workspace-template-deepagents |
-| hermes | https://github.com/Molecule-AI/molecule-ai-workspace-template-hermes |
-| gemini-cli | https://github.com/Molecule-AI/molecule-ai-workspace-template-gemini-cli |
-| openclaw | https://github.com/Molecule-AI/molecule-ai-workspace-template-openclaw |
+| claude-code | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-claude-code |
+| langgraph | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-langgraph |
+| crewai | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-crewai |
+| autogen | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-autogen |
+| deepagents | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-deepagents |
+| hermes | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-hermes |
+| gemini-cli | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-gemini-cli |
+| openclaw | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-openclaw |

 ## Adapter discovery (ADAPTER_MODULE)

@@ -244,7 +244,7 @@ correctness before pushing a `runtime-v*` tag.
 ## Writing a new adapter

 Use the GitHub template repo
-[`Molecule-AI/molecule-ai-workspace-template-starter`](https://github.com/Molecule-AI/molecule-ai-workspace-template-starter)
+[`molecule-ai/molecule-ai-workspace-template-starter`](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-starter) (note: the starter repo did not survive the 2026-05-06 GitHub-org-suspension migration; recreation tracked at internal#41)
 — it ships with the canonical Dockerfile + adapter.py skeleton + config.yaml
 schema + the `repository_dispatch: [runtime-published]` cascade receiver
 already wired up. No follow-up setup PR required.
@@ -256,7 +256,7 @@ gh repo create Molecule-AI/molecule-ai-workspace-template-<runtime> \
  --public \
  --description "Molecule AI workspace template: <runtime>"

-git clone https://github.com/Molecule-AI/molecule-ai-workspace-template-<runtime>
+git clone https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-<runtime>.git
 cd molecule-ai-workspace-template-<runtime>
 ```

@@ -286,7 +286,7 @@ After `git push`:
 If the canonical shape changes (e.g. `config.yaml` schema gets a new field,
 the `BaseAdapter` interface adds a method, the reusable CI workflow
 signature changes), update the
-[starter](https://github.com/Molecule-AI/molecule-ai-workspace-template-starter)
+[starter](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-starter) (recreation pending — see note above)
 **first**. Existing templates can either migrate at their own pace or be
 touched in a coordinated cleanup PR. Either way, future templates pick up
 the new shape from day one.
@@ -41,7 +41,6 @@
    {"name": "medo-smoke", "repo": "Molecule-AI/molecule-ai-org-template-medo-smoke", "ref": "main"},
    {"name": "molecule-worker-gemini", "repo": "Molecule-AI/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"},
    {"name": "reno-stars", "repo": "Molecule-AI/molecule-ai-org-template-reno-stars", "ref": "main"},
-    {"name": "ux-ab-lab", "repo": "Molecule-AI/molecule-ai-org-template-ux-ab-lab", "ref": "main"},
-    {"name": "mock-bigorg", "repo": "Molecule-AI/molecule-ai-org-template-mock-bigorg", "ref": "main"}
+    {"name": "ux-ab-lab", "repo": "Molecule-AI/molecule-ai-org-template-ux-ab-lab", "ref": "main"}
  ]
 }
@@ -11,7 +11,7 @@ There are three related scripts; pick the right one:
 |---|---|---|
 | `measure-coordinator-task-bounds.sh` | **Canonical** v1 harness for the RFC #2251 / Issue 4 reproduction. Provisions a PM coordinator + Researcher child via `claude-code-default` + `langgraph` templates, sends a synthesis-heavy A2A kickoff, observes elapsed time + activity trace. | OSS-shape platform — localhost or any `/workspaces`-shaped endpoint. Has tenant/admin-token guards for non-localhost runs. |
 | `measure-coordinator-task-bounds-runner.sh` | Generalised runner for the same measurement contract but with **arbitrary template + secret + model combinations** (Hermes/MiniMax, etc.). Useful for cross-runtime variants without modifying the canonical harness. | Same as above (local or SaaS via `MODE=saas`). |
-| `measure-coordinator-task-bounds.sh` (in [molecule-controlplane](https://github.com/Molecule-AI/molecule-controlplane)) | **Production-shape** variant that bootstraps a real staging tenant via `POST /cp/admin/orgs`, then runs the same measurement against `<slug>.staging.moleculesai.app`. | Staging controlplane only — refuses to run against production. |
+| `measure-coordinator-task-bounds.sh` (in [molecule-controlplane](https://git.moleculesai.app/molecule-ai/molecule-controlplane)) | **Production-shape** variant that bootstraps a real staging tenant via `POST /cp/admin/orgs`, then runs the same measurement against `<slug>.staging.moleculesai.app`. | Staging controlplane only — refuses to run against production. |

 See `reference_harness_pair_pattern` (auto-memory) for when to use which
 and the cross-repo design rationale.
@@ -278,7 +278,7 @@ include = ["molecule_runtime*"]
 README_TEMPLATE = """\
 # molecule-ai-workspace-runtime

-Shared workspace runtime for [Molecule AI](https://github.com/Molecule-AI/molecule-core)
+Shared workspace runtime for [Molecule AI](https://git.moleculesai.app/molecule-ai/molecule-core)
 agent adapters. Installed by every workspace template image
 (`workspace-template-claude-code`, `-langgraph`, `-hermes`, etc.) to provide
 A2A delegation, heartbeat, memory, plugin loading, and skill management.
@@ -396,7 +396,7 @@ If you don't need real-time push, the default poll path works
 universally with no extra setup; both modes converge on the same
 `inbox_pop` ack so messages never duplicate.

-See [`docs/workspace-runtime-package.md`](https://github.com/Molecule-AI/molecule-core/blob/main/docs/workspace-runtime-package.md)
+See [`docs/workspace-runtime-package.md`](https://git.moleculesai.app/molecule-ai/molecule-core/src/branch/main/docs/workspace-runtime-package.md)
 for the publish flow and architecture.
 """

@@ -17,23 +17,12 @@
 #
 # Used by .github/workflows/auto-promote-stale-alarm.yml. Logic lives
 # here (not inline in the workflow YAML) so we can:
-#   - Unit-test it with a fixture (see test-check-stale-promote-pr.sh)
+#   - Unit-test it with a stubbed `gh` (see test-check-stale-promote-pr.sh)
 #   - Run it ad-hoc by an operator: `scripts/check-stale-promote-pr.sh`
 #   - Reuse the same surface in any sibling workflow that needs the same
 #     check (SSOT — one detector, many callers).
 #
-# Requires: `curl`, `jq`. `GITEA_TOKEN` (or `GITHUB_TOKEN` / `GH_TOKEN`
-# for back-compat) in the workflow context. Reads `GITHUB_SERVER_URL`
-# / `GITEA_API_URL` for the Gitea base, defaulting to
-# https://git.moleculesai.app/api/v1.
-#
-# Post-2026-05-06 (Gitea migration, issue #75): the previous version
-# called `gh pr list/view/comment`, all of which hit GitHub.com's
-# GraphQL or /api/v3 REST shapes. Gitea exposes /api/v1/ only (no
-# GraphQL → 405, no /api/v3 → 404). So this script now talks to the
-# Gitea v1 API directly via curl. The fixture-driven unit tests are
-# unchanged — they bypass the live fetch via PR_FIXTURE and still pass
-# the historical (GitHub-shape) JSON which `detect_stale` consumes.
+# Requires: `gh` CLI, `jq`. `GH_TOKEN` env in the workflow context.

 set -euo pipefail

@@ -47,15 +36,14 @@ set -euo pipefail
 # alarming. Override via env for tests + edge ops.
 STALE_HOURS="${STALE_HOURS:-4}"

-# Repo defaults to GITHUB_REPOSITORY (act_runner sets this in workflow
-# context). Tests pass --repo explicitly.
+# Repo defaults to $GITHUB_REPOSITORY (empty if unset). Tests pass --repo explicitly.
 REPO="${GITHUB_REPOSITORY:-}"

 # Whether to post a comment to the PR. Off by default to avoid noise on
 # manual ad-hoc runs; the cron workflow turns it on.
 POST_COMMENT="${POST_COMMENT:-false}"

-# Where to read the open-PR JSON from. Empty = call Gitea live. Tests
+# Where to read the open-PR JSON from. Empty = call `gh` live. Tests
 # point this at a fixture file.
 PR_FIXTURE="${PR_FIXTURE:-}"

@@ -63,17 +51,6 @@ PR_FIXTURE="${PR_FIXTURE:-}"
 # the staleness math is deterministic.
 NOW_OVERRIDE="${NOW_OVERRIDE:-}"

-# Gitea API base. act_runner forwards github.server_url as
-# GITHUB_SERVER_URL; for the molecule-ai fleet that's
-# https://git.moleculesai.app. Append /api/v1 to get the REST root.
-# Override directly via GITEA_API_URL for tests / non-default hosts.
-GITEA_API_URL="${GITEA_API_URL:-${GITHUB_SERVER_URL:-https://git.moleculesai.app}/api/v1}"
-
-# Token. Workflow context sets GITHUB_TOKEN; we accept GITEA_TOKEN as
-# the explicit name and GH_TOKEN for back-compat with operator habits
-# from the GitHub era. First non-empty wins.
-GITEA_TOKEN="${GITEA_TOKEN:-${GITHUB_TOKEN:-${GH_TOKEN:-}}}"
-
 while [ $# -gt 0 ]; do
  case "$1" in
    --repo) REPO="$2"; shift 2 ;;
@@ -106,7 +83,7 @@ now_epoch() {
  fi
 }

-# Parse RFC3339 timestamps the way Gitea / GitHub emit them (e.g.
+# Parse RFC3339 timestamps the way GitHub emits them (e.g.
 # "2026-05-05T23:15:00Z"). gnu-date uses -d, bsd-date uses -j -f. Cover
 # both because the workflow runs on ubuntu-latest (gnu) but operators
 # may run this script on macOS (bsd).
@@ -129,100 +106,14 @@ to_epoch() {
 # Fetch open auto-promote PRs
 # -----------------------------------------------------------------------------

-# Gitea v1 returns PRs with the canonical Gitea shape (number, title,
-# created_at, html_url, mergeable, state). The previous GitHub-CLI
-# version returned a derived `mergeStateStatus` / `reviewDecision`
-# pair which only GitHub computes — Gitea doesn't expose them
-# natively. Rebuild equivalents:
-#
-#   mergeStateStatus = BLOCKED  ↔ Gitea: state==open AND mergeable==true
-#                                  AND no APPROVED review yet
-#                                  (i.e. branch protection is gating
-#                                  the auto-merge pending an approval)
-#   reviewDecision   = REVIEW_REQUIRED  ↔ Gitea: 0 APPROVED reviews
-#
-# This mirrors the SAME silent-block failure mode the GitHub version
-# detected: auto-merge armed, branch protection requires 1 review,
-# nobody's approved yet.
-#
-# Implementation: pull the open PR list base=main, then for each PR
-# pull /pulls/{n}/reviews and synthesize the GitHub-shape JSON the
-# rest of the script + the test fixtures consume.
 fetch_prs() {
  if [ -n "$PR_FIXTURE" ]; then
    cat "$PR_FIXTURE"
    return 0
  fi
-  if [ -z "$GITEA_TOKEN" ]; then
-    echo "::error::GITEA_TOKEN / GITHUB_TOKEN unset — cannot fetch PRs from $GITEA_API_URL" >&2
-    return 1
-  fi
-  local prs_json
-  prs_json="$(curl --fail-with-body -sS \
-    -H "Authorization: token ${GITEA_TOKEN}" \
-    -H "Accept: application/json" \
-    "${GITEA_API_URL}/repos/${REPO}/pulls?state=open&base=main&limit=50" \
-    2>/dev/null)" || {
-    echo "::error::Failed to fetch PRs from ${GITEA_API_URL}/repos/${REPO}/pulls" >&2
-    return 1
-  }
-
-  # Filter to head=staging (the auto-promote shape) and synthesize
-  # mergeStateStatus + reviewDecision per PR. Approval count via
-  # /pulls/{n}/reviews. Errors fall through to 0-approvals (treated
-  # as REVIEW_REQUIRED) preserving the existing "fail-safe — alarm if
-  # uncertain" semantic.
-  local synthesized="[]"
-  while IFS= read -r pr; do
-    [ -z "$pr" ] && continue
-    [ "$pr" = "null" ] && continue
-    local num
-    num="$(printf '%s' "$pr" | jq -r '.number')"
-    [ -z "$num" ] && continue
-    [ "$num" = "null" ] && continue
-    local approved_count
-    approved_count="$(curl --fail-with-body -sS \
-      -H "Authorization: token ${GITEA_TOKEN}" \
-      -H "Accept: application/json" \
-      "${GITEA_API_URL}/repos/${REPO}/pulls/${num}/reviews" 2>/dev/null \
-      | jq '[.[] | select(.state == "APPROVED" and (.dismissed // false) == false)] | length' \
-      2>/dev/null || echo 0)"
-    local mergeable
-    mergeable="$(printf '%s' "$pr" | jq -r '.mergeable')"
-    local merge_state="UNKNOWN"
-    local review_decision="REVIEW_REQUIRED"
-    if [ "$mergeable" = "true" ]; then
-      if [ "$approved_count" -ge 1 ]; then
-        merge_state="CLEAN"
-        review_decision="APPROVED"
-      else
-        # mergeable but no approving review — exactly the wedge state
-        # the alarm targets.
-        merge_state="BLOCKED"
-        review_decision="REVIEW_REQUIRED"
-      fi
-    else
-      # not mergeable (conflicts, behind, failed checks) — different
-      # failure mode, the author owns the fix; the alarm doesn't fire.
-      merge_state="DIRTY"
-      review_decision="REVIEW_REQUIRED"
-    fi
-    synthesized="$(printf '%s' "$synthesized" \
-      | jq -c --argjson pr "$pr" \
-              --arg ms "$merge_state" \
-              --arg rd "$review_decision" \
-              '. + [{
-                 number: $pr.number,
-                 title: $pr.title,
-                 createdAt: $pr.created_at,
-                 mergeStateStatus: $ms,
-                 reviewDecision: $rd,
-                 url: $pr.html_url
-              }]')"
-  done < <(printf '%s' "$prs_json" \
-    | jq -c '.[] | select(.head.ref == "staging")' 2>/dev/null)
-
-  printf '%s\n' "$synthesized"
+  gh pr list --repo "$REPO" \
+    --base main --head staging --state open \
+    --json number,title,createdAt,mergeStateStatus,reviewDecision,url
 }

 # -----------------------------------------------------------------------------
@@ -280,40 +171,18 @@ post_comment() {
  if [ "$POST_COMMENT" != "true" ]; then
    return 0
  fi
-  if [ -z "$GITEA_TOKEN" ]; then
-    echo "::warning::GITEA_TOKEN unset — cannot post stale-alarm comment on PR #$pr_num" >&2
-    return 0
-  fi
  # Idempotency: only one alarm comment per PR. Look for the marker
-  # string in existing comments before posting a new one. Gitea's
-  # /repos/{owner}/{repo}/issues/{n}/comments returns the same shape
-  # for issues + PRs (PRs are issues internally on Gitea, same as
-  # GitHub's REST).
+  # string in existing comments before posting a new one.
  local existing
-  existing="$(curl --fail-with-body -sS \
-    -H "Authorization: token ${GITEA_TOKEN}" \
-    -H "Accept: application/json" \
-    "${GITEA_API_URL}/repos/${REPO}/issues/${pr_num}/comments?limit=50" 2>/dev/null \
-    | jq -r '.[] | select(.body | test("scripts/check-stale-promote-pr.sh per issue #2975")) | .id' \
+  existing="$(gh pr view "$pr_num" --repo "$REPO" --json comments \
+    --jq '.comments[] | select(.body | test("scripts/check-stale-promote-pr.sh per issue #2975")) | .databaseId' \
    | head -n1)"
  if [ -n "$existing" ]; then
    echo "::notice::PR #$pr_num already has a stale-alarm comment ($existing) — not re-posting"
    return 0
  fi
-  local body
-  body="$(comment_body "$age_h")"
-  if curl --fail-with-body -sS \
-      -X POST \
-      -H "Authorization: token ${GITEA_TOKEN}" \
-      -H "Accept: application/json" \
-      -H "Content-Type: application/json" \
-      "${GITEA_API_URL}/repos/${REPO}/issues/${pr_num}/comments" \
-      -d "$(jq -nc --arg b "$body" '{body: $b}')" \
-      >/dev/null 2>&1; then
-    echo "::notice::Posted stale-alarm comment on PR #$pr_num (age=${age_h}h)"
-  else
-    echo "::warning::Failed to POST stale-alarm comment on PR #$pr_num" >&2
-  fi
+  comment_body "$age_h" | gh pr comment "$pr_num" --repo "$REPO" --body-file -
+  echo "::notice::Posted stale-alarm comment on PR #$pr_num (age=${age_h}h)"
 }

 # -----------------------------------------------------------------------------
@@ -6,29 +6,6 @@
 #   ./scripts/clone-manifest.sh <manifest.json> <ws-templates-dir> <org-templates-dir> <plugins-dir>
 #
 # Requires: git, jq (lighter than python3 — ~2MB vs ~50MB in Alpine)
-#
-# Auth (optional):
-#   When MOLECULE_GITEA_TOKEN is set, embed it as the basic-auth password so
-#   private Gitea repos clone successfully. When unset, clone anonymously
-#   (works only for repos that are public on git.moleculesai.app).
-#
-#   This is the path the publish-workspace-server-image.yml workflow uses:
-#   it injects AUTO_SYNC_TOKEN (devops-engineer persona PAT, repo:read on
-#   the molecule-ai org) so the in-CI pre-clone step succeeds for ALL
-#   manifest entries — including the 5 private workspace-template-* repos
-#   (codex, crewai, deepagents, gemini-cli, langgraph) and all 7
-#   org-template-* repos.
-#
-#   The token never enters the Docker image: this script runs in the
-#   trusted CI context BEFORE `docker buildx build`, populates
-#   .tenant-bundle-deps/, then `Dockerfile.tenant` COPYs from there with
-#   the .git directories already stripped (see line ~67 below).
-#
-#   For backward compatibility — and so a fresh clone works without
-#   secrets when (eventually) the workspace-template-* repos flip public —
-#   the unset path remains a plain anonymous HTTPS clone. That path will
-#   FAIL with "could not read Username" on private repos today; CI MUST
-#   set MOLECULE_GITEA_TOKEN.

 set -euo pipefail

@@ -68,30 +45,11 @@ clone_category() {
            continue
        fi

-        # Post-2026-05-06 GitHub-org-suspension: clone from Gitea instead.
-        # manifest.json paths still read "Molecule-AI/..." (the historic
-        # github.com slug); Gitea lowercases the org part to "molecule-ai/".
-        # Lowercase the org segment on the fly so we don't need to rewrite
-        # every manifest entry.
-        repo_gitea="$(echo "$repo" | awk -F/ '{ printf "%s", tolower($1); for (i=2; i<=NF; i++) printf "/%s", $i; print "" }')"
-
-        # Build the clone URL. When MOLECULE_GITEA_TOKEN is set (CI path)
-        # embed it as basic-auth so private repos succeed. The username
-        # part ("oauth2") is conventional and ignored by Gitea — only the
-        # token-as-password is verified.
-        if [ -n "${MOLECULE_GITEA_TOKEN:-}" ]; then
-            clone_url="https://oauth2:${MOLECULE_GITEA_TOKEN}@git.moleculesai.app/${repo_gitea}.git"
-            display_url="https://oauth2:***@git.moleculesai.app/${repo_gitea}.git"
-        else
-            clone_url="https://git.moleculesai.app/${repo_gitea}.git"
-            display_url="$clone_url"
-        fi
-
-        echo "  cloning $display_url -> $target_dir/$name (ref=$ref)"
+        echo "  cloning $repo -> $target_dir/$name (ref=$ref)"
        if [ "$ref" = "main" ]; then
-            git clone --depth=1 -q "$clone_url" "$target_dir/$name"
+            git clone --depth=1 -q "https://github.com/${repo}.git" "$target_dir/$name"
        else
-            git clone --depth=1 -q --branch "$ref" "$clone_url" "$target_dir/$name"
+            git clone --depth=1 -q --branch "$ref" "https://github.com/${repo}.git" "$target_dir/$name"
        fi
        CLONED=$((CLONED + 1))
        i=$((i + 1))
@@ -10,11 +10,11 @@
 #           → PyPI auto-bumps molecule-ai-workspace-runtime patch version
 #           → repository_dispatch fans out to 8 workspace-template-* repos
 #           → each template repo rebuilds and re-tags
-#             ghcr.io/molecule-ai/workspace-template-<runtime>:latest
+#             153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/workspace-template-<runtime>:latest
 #
 #   PATH 2: any merge to a workspace-template-* repo's main branch
 #           → that repo's publish-image.yml fires
-#           → ghcr.io/molecule-ai/workspace-template-<runtime>:latest
+#           → 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/workspace-template-<runtime>:latest
 #             gets re-tagged
 #
 #   provisioner.go:296 RuntimeImages[runtime] reads `:latest` at every
@@ -1,155 +0,0 @@
-#!/usr/bin/env bash
-# edge-429-probe.sh — capture 429 origin (workspace-server vs CF/Vercel edge)
-# during a simulated canvas-burst against a tenant subdomain.
-#
-# Issue molecule-core#62. The post-#60 verification step asks an
-# operator with CF/Vercel dashboard access to confirm whether the
-# layout-chunk 429s observed in DevTools were:
-#   (a) workspace-server bucket overflow (closes once #60 deploys), or
-#   (b) actual edge-layer rate-limiting (CF or Vercel).
-#
-# This script doesn't need dashboard access. It reproduces the burst
-# pattern locally and dumps every 429's response shape so the operator
-# can distinguish (a) from (b) by inspection: workspace-server emits a
-# JSON body, CF emits HTML, Vercel emits a different HTML. Headers tell
-# the same story (cf-ray vs x-vercel-*).
-#
-# Usage:
-#   ./scripts/edge-429-probe.sh <tenant-host> [--burst N] [--waves N] [--pause SECS] [--out file]
-#
-# Example:
-#   ./scripts/edge-429-probe.sh hongming.moleculesai.app --burst 80 --out /tmp/edge.txt
-#
-# The script is read-only against the target — it only issues GETs to
-# public-by-design endpoints. No mutating requests, no credential use.
-
-set -euo pipefail
-
-# ── Help / usage handling first, before positional capture ────────────────────
-case "${1:-}" in
-  -h|--help|"")
-    sed -n '/^# edge-429-probe.sh/,/^$/p' "$0" | sed 's/^# \{0,1\}//'
-    exit 0
-    ;;
-esac
-
-HOST="$1"; shift
-BURST=80
-WAVES=3
-WAVE_PAUSE=2
-OUT=""
-
-while [ "${1:-}" != "" ]; do
-  case "$1" in
-    --burst) BURST="$2"; shift 2 ;;
-    --waves) WAVES="$2"; shift 2 ;;
-    --pause) WAVE_PAUSE="$2"; shift 2 ;;
-    --out)   OUT="$2";   shift 2 ;;
-    -h|--help)
-      sed -n '/^# edge-429-probe.sh/,/^$/p' "$0" | sed 's/^# \{0,1\}//'
-      exit 0
-      ;;
-    *) echo "unknown arg: $1" >&2; exit 2 ;;
-  esac
-done
-
-# ── Endpoint discovery ────────────────────────────────────────────────────────
-echo "→ Discovering a layout-chunk URL from canvas root..." >&2
-ROOT_BODY=$(curl -fsSL --max-time 10 "https://${HOST}/" 2>/dev/null || true)
-LAYOUT_PATH=$(echo "$ROOT_BODY" \
-  | grep -oE '/_next/static/chunks/layout-[A-Za-z0-9_-]+\.js' \
-  | head -1 || true)
-if [ -z "$LAYOUT_PATH" ]; then
-  LAYOUT_PATH="/_next/static/chunks/layout-probe-not-found.js"
-  echo "  (no layout chunk discovered — using sentinel path; 404 on this is expected)" >&2
-else
-  echo "  layout chunk: $LAYOUT_PATH" >&2
-fi
-
-# Probe URL: a generic activity endpoint. The rate-limiter middleware
-# runs BEFORE workspace-id validation, so unauth/invalid-id requests
-# still hit the bucket.
-ACTIVITY_PATH="/workspaces/00000000-0000-0000-0000-000000000000/activity?probe=edge-429"
-
-# ── Fire one curl, write a single-line JSON-ish status record to stdout ──────
-# Inlined into xargs as a heredoc-style command rather than a function so
-# the function-export pitfalls (some shells lose `export -f` across xargs)
-# don't apply. Each output line is a parseable record; failed curls emit
-# a curl_err record so request volume is preserved.
-TMP_RESULTS="$(mktemp -t edge-429-probe.XXXXXX)"
-trap 'rm -f "$TMP_RESULTS"' EXIT
-
-run_burst() {
-  # $1 = path; $2 = label; $3 = wave_id
-  local path="$1" label="$2" wave="$3"
-  local i
-  for i in $(seq 1 "$BURST"); do
-    {
-      out=$(curl -sS --max-time 10 -o /dev/null \
-        -w 'status=%{http_code} size=%{size_download} time=%{time_total} server=%{header.server} cf_ray=%{header.cf-ray} x_vercel=%{header.x-vercel-id} retry_after=%{header.retry-after} content_type=%{header.content-type} x_ratelimit_limit=%{header.x-ratelimit-limit} x_ratelimit_remaining=%{header.x-ratelimit-remaining} x_ratelimit_reset=%{header.x-ratelimit-reset}\n' \
-        "https://${HOST}${path}" 2>/dev/null) || out="status=curl_err"
-      printf 'label=%s-%s-%s %s\n' "$label" "$wave" "$i" "$out" >> "$TMP_RESULTS"
-    } &
-  done
-  wait
-}
-
-emit() {
-  if [ -n "$OUT" ]; then
-    printf '%s\n' "$*" >> "$OUT"
-  else
-    printf '%s\n' "$*"
-  fi
-}
-
-if [ -n "$OUT" ]; then : > "$OUT"; fi
-
-emit "# edge-429-probe report"
-emit "# host=$HOST burst=$BURST waves=$WAVES pause=${WAVE_PAUSE}s"
-emit "# layout_path=$LAYOUT_PATH"
-emit "# activity_path=$ACTIVITY_PATH"
-emit "# generated=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
-emit ""
-
-for wave in $(seq 1 "$WAVES"); do
-  emit "## wave $wave"
-  : > "$TMP_RESULTS"
-  run_burst "$LAYOUT_PATH" "layout" "$wave"
-  run_burst "$ACTIVITY_PATH" "activity" "$wave"
-  while read -r line; do
-    emit "  $line"
-  done < "$TMP_RESULTS"
-  if [ "$wave" -lt "$WAVES" ]; then
-    sleep "$WAVE_PAUSE"
-  fi
-done
-
-emit ""
-emit "## summary — how to read the report"
-emit "#   status=429 + content_type starts with application/json + x_ratelimit_limit set"
-emit "#     => workspace-server bucket overflow. Closes when #60 deploys."
-emit "#   status=429 + cf_ray set + content_type=text/html"
-emit "#     => Cloudflare WAF / rate-limit. Audit dashboard rules per #62."
-emit "#   status=429 + x_vercel set + content_type=text/html"
-emit "#     => Vercel edge / Bot Fight Mode. Audit Vercel project per #62."
-emit "#   status=429 with no server/cf_ray/x_vercel"
-emit "#     => corporate proxy or VPN. Not actionable in this repo."
-
-if [ -n "$OUT" ]; then
-  echo "→ Report written to $OUT" >&2
-  # Match only data lines (begin with two-space indent + "label="),
-  # not the summary's reference text which also mentions "status=429".
-  # grep -c outputs "0" + exits 1 when zero matches; `|| true` masks
-  # the exit status so set -e doesn't trip without losing the count.
-  total=$(grep -c '^  label=' "$OUT" 2>/dev/null || true)
-  total429=$(grep -c '^  label=.*status=429' "$OUT" 2>/dev/null || true)
-  total=${total:-0}
-  total429=${total429:-0}
-  echo "→ Totals: ${total429} of ${total} requests returned 429" >&2
-  if [ "${total429}" -gt 0 ]; then
-    echo "→ Per-label 429 counts:" >&2
-    grep '^  label=.*status=429' "$OUT" \
-      | sed -E 's/^  label=([^-]+).*/  \1/' \
-      | sort | uniq -c >&2
-  fi
-fi
@@ -19,15 +19,9 @@ Exit codes:
    0  — no collisions
    1  — collision detected; output names the conflicting PR(s) for the author

-Designed to run from a Gitea Actions PR check. Reads PR metadata via direct
-HTTP calls to Gitea's REST API (`/api/v1/`), which on the molecule-ai fleet
-lives at https://git.moleculesai.app. Runs in under 10s against a typical PR.
-
-Post-2026-05-06 (Gitea migration, issue #75): the previous version called
-the GitHub CLI (``gh pr list``, ``gh pr diff``). On Gitea those calls hit
-either the GraphQL endpoint (HTTP 405) or /api/v3 (HTTP 404). This module
-now talks to /api/v1 directly via urllib so it works against any Gitea
-host without a `gh` install or extra dependencies.
+Designed to run from a GitHub Actions PR check. Reads PR metadata via the
+GitHub CLI (gh) which is preinstalled on ubuntu-latest runners. Runs in
+under 10s against a typical PR.
 """

 from __future__ import annotations
@@ -37,70 +31,12 @@ import os
 import re
 import subprocess
 import sys
-import urllib.error
-import urllib.parse
-import urllib.request
 from pathlib import Path

 MIGRATIONS_DIR = "workspace-server/migrations"
 MIGRATION_FILE_RE = re.compile(r"^(\d+)_[^/]+\.(up|down)\.sql$")


-def _gitea_api_url() -> str:
-    """Resolve the Gitea API base URL.
-
-    act_runner forwards github.server_url as GITHUB_SERVER_URL; for the
-    molecule-ai fleet that's https://git.moleculesai.app. Append /api/v1
-    to get the REST root. Override directly via GITEA_API_URL for tests
-    or non-default hosts.
-    """
-    env_override = os.environ.get("GITEA_API_URL", "").rstrip("/")
-    if env_override:
-        return env_override
-    server = os.environ.get("GITHUB_SERVER_URL", "https://git.moleculesai.app").rstrip("/")
-    return f"{server}/api/v1"
-
-
-def _gitea_token() -> str:
-    """Resolve the Gitea token from env. GITEA_TOKEN wins; falls back
-    to GITHUB_TOKEN (set by act_runner) and GH_TOKEN (operator habit
-    from the GitHub era)."""
-    return (
-        os.environ.get("GITEA_TOKEN")
-        or os.environ.get("GITHUB_TOKEN")
-        or os.environ.get("GH_TOKEN")
-        or ""
-    )
-
-
-def _gitea_get(path: str, params: dict[str, str] | None = None) -> bytes | None:
-    """GET against /api/v1; returns response body, or None on HTTP/network error.
-
-    Errors return None (not raise) because callers handle missing data
-    by emitting an actionable workflow message rather than crashing the
-    PR check on a transient API blip.
-    """
-    base = _gitea_api_url()
-    qs = ""
-    if params:
-        qs = "?" + urllib.parse.urlencode(params)
-    url = f"{base}/{path.lstrip('/')}{qs}"
-    req = urllib.request.Request(url)
-    token = _gitea_token()
-    if token:
-        req.add_header("Authorization", f"token {token}")
-    req.add_header("Accept", "application/json")
-    try:
-        with urllib.request.urlopen(req, timeout=20) as resp:  # noqa: S310
-            return resp.read()
-    except urllib.error.HTTPError as e:
-        sys.stderr.write(f"Gitea API HTTP {e.code} on {path}: {e.reason}\n")
-        return None
-    except (urllib.error.URLError, TimeoutError) as e:
-        sys.stderr.write(f"Gitea API network error on {path}: {e}\n")
-        return None
-
-
 def run(cmd: list[str], check: bool = True) -> str:
    """Run a subprocess and return stdout. Raise on non-zero when check=True."""
    result = subprocess.run(cmd, capture_output=True, text=True)
@@ -160,49 +96,32 @@ def open_prs_with_migration_prefix(
    repo: str, prefix: int, exclude_pr: int
 ) -> list[dict]:
    """Return open PRs (other than `exclude_pr`) that add a migration with
-    `prefix`. Walks open PRs via Gitea's `/repos/{owner}/{repo}/pulls` and
-    pulls each one's changed-file list via `/pulls/{n}/files`. The cost is
-    bounded by open-PR count, which is small (<100) on this repo. The
-    return shape mimics the GitHub CLI's `--json number,headRefName`:
-    ``[{"number": int, "headRefName": str}, ...]``.
+    `prefix`. Uses `gh pr diff` per PR — we only need to walk PRs that are
+    actually in flight, so the cost is bounded by open-PR count.
    """
-    body = _gitea_get(
-        f"repos/{repo}/pulls",
-        {"state": "open", "limit": "50"},
-    )
-    if body is None:
-        # Best-effort: a transient Gitea blip shouldn't fail the PR
-        # check (the base-branch collision check runs locally and is
-        # the more common failure mode).
-        return []
-    prs = json.loads(body)
+    out = run([
+        "gh", "pr", "list", "--repo", repo, "--state", "open",
+        "--json", "number,headRefName", "--limit", "100",
+    ])
+    prs = json.loads(out)
    matches: list[dict] = []
    for pr in prs:
        num = pr["number"]
        if num == exclude_pr:
            continue
-        # Gitea returns the head ref under .head.ref (REST shape);
-        # GitHub CLI's --json headRefName flattens it. Normalize on
-        # the way out so callers see the historical shape.
-        head_ref_name = (pr.get("head") or {}).get("ref", "")
-        files_body = _gitea_get(f"repos/{repo}/pulls/{num}/files", {"limit": "100"})
-        if files_body is None:
-            continue
        try:
-            files = json.loads(files_body)
-        except json.JSONDecodeError:
+            files = run([
+                "gh", "pr", "diff", str(num), "--repo", repo, "--name-only",
+            ], check=False)
+        except Exception:  # noqa: BLE001
            continue
-        for f in files:
-            # Gitea's /pulls/{n}/files returns objects with `.filename`
-            # (same as GitHub's REST). Older Gitea versions emit
-            # `.name` instead — handle both.
-            raw = f.get("filename") or f.get("name") or ""
+        for raw in files.splitlines():
            path = Path(raw.strip())
            if not path.name:
                continue
            m = MIGRATION_FILE_RE.match(path.name)
            if m and int(m.group(1)) == prefix:
-                matches.append({"number": num, "headRefName": head_ref_name})
+                matches.append(pr)
                break
    return matches

@@ -219,10 +138,7 @@ def main() -> int:
    pr_number = int(pr_number_env)
    base_ref = os.environ.get("BASE_REF", "origin/staging")
    head_ref = os.environ.get("HEAD_REF", "HEAD")
-    # Default kept lowercase to match the Gitea-canonical org name
-    # (post-2026-05-06 migration). Tests + workflow context override
-    # via GITHUB_REPOSITORY which act_runner sets per-run.
-    repo = os.environ.get("GITHUB_REPOSITORY", "molecule-ai/molecule-core")
+    repo = os.environ.get("GITHUB_REPOSITORY", "Molecule-AI/molecule-core")

    added = migrations_in_diff(base_ref, head_ref)
    if not added:
@@ -51,7 +51,7 @@ log "pulling latest images for: ${RUNTIMES[*]}"
 PULLED=()
 FAILED=()
 for rt in "${RUNTIMES[@]}"; do
-  IMG="ghcr.io/molecule-ai/workspace-template-$rt:latest"
+  IMG="153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/workspace-template-$rt:latest"
  if docker pull "$IMG" >/dev/null 2>&1; then
    log "  ✓ $rt"
    PULLED+=("$rt")
@@ -1,9 +1,10 @@
 #!/bin/bash
-# rollback-latest.sh — moves the :latest tag on ghcr.io/molecule-ai/platform
-# (and the matching tenant image) back to a prior :staging-<sha> digest
-# without rebuilding anything. Prod tenants auto-pull :latest every 5
-# min, so this is the fast path when a canary-verified image turns out
-# to have a runtime regression that canary didn't catch.
+# rollback-latest.sh — moves the :latest tag on the platform image
+# (and the matching tenant image) on AWS ECR back to a prior
+# :staging-<sha> digest without rebuilding anything. Prod tenants
+# auto-pull :latest every 5 min, so this is the fast path when a
+# canary-verified image turns out to have a runtime regression that
+# canary didn't catch.
 #
 # Usage:
 #   scripts/rollback-latest.sh <sha>
@@ -12,12 +13,14 @@
 # Prereqs:
 #   - crane on $PATH (brew install crane OR download from
 #     https://github.com/google/go-containerregistry/releases)
-#   - GHCR token exported as GITHUB_TOKEN with write:packages scope
+#   - aws CLI authenticated for region us-east-2 with ECR pull/push
+#     access to the molecule-ai/platform + platform-tenant repositories.
+#     `aws sts get-caller-identity` should succeed.
 #
 # What it does (per image — platform + tenant):
-#   crane digest ghcr.io/…:<sha>         # verify the target sha exists
-#   crane tag    ghcr.io/…:<sha> latest  # retag remotely, single API call
-#   crane digest ghcr.io/…:latest        # confirm the move
+#   crane digest <ecr>:<sha>         # verify the target sha exists
+#   crane tag    <ecr>:<sha> latest  # retag remotely, single API call
+#   crane digest <ecr>:latest        # confirm the move
 #
 # Exit codes: 0 = both retagged, 1 = tag missing / crane error, 2 = bad args.

@@ -30,21 +33,23 @@ if [ "${1:-}" = "" ]; then
 fi

 TARGET_SHA="$1"
-PLATFORM=ghcr.io/molecule-ai/platform
-TENANT=ghcr.io/molecule-ai/platform-tenant
+ECR_HOST=153263036946.dkr.ecr.us-east-2.amazonaws.com
+PLATFORM=$ECR_HOST/molecule-ai/platform
+TENANT=$ECR_HOST/molecule-ai/platform-tenant

 if ! command -v crane >/dev/null; then
  echo "ERROR: crane not installed. brew install crane" >&2
  exit 1
 fi
-if [ -z "${GITHUB_TOKEN:-}" ]; then
-  echo "ERROR: GITHUB_TOKEN unset. export it with write:packages scope." >&2
+if ! command -v aws >/dev/null; then
+  echo "ERROR: aws CLI not installed. brew install awscli" >&2
  exit 1
 fi

-# Log in once. crane stores creds in a config file keyed by registry;
-# re-running is cheap.
-printf '%s\n' "$GITHUB_TOKEN" | crane auth login ghcr.io -u "${GITHUB_ACTOR:-$(whoami)}" --password-stdin >/dev/null
+# Log in once. ECR auth is via short-lived password from `aws ecr
+# get-login-password`. crane stores creds in a config file keyed by
+# registry; re-running is cheap.
+aws ecr get-login-password --region us-east-2 | crane auth login "$ECR_HOST" -u AWS --password-stdin >/dev/null

 roll() {
  local image="$1"
@@ -1,252 +0,0 @@
-#!/usr/bin/env bash
-# tools/branch-protection/check_name_parity.sh — assert every required-
-# check name listed in apply.sh maps to a workflow job whose "always
-# emits this status" shape is intact.
-#
-# Closes #144 / encodes the saved memory
-# feedback_branch_protection_check_name_parity:
-#
-#   "Path filters (e.g., detect-changes → conditional skip) silently
-#    break branch protection because no job emits the protected
-#    sentinel status when path-filter returns false."
-#
-# Two safe shapes for a required-check job:
-#
-#   1. Single-job-with-per-step-if (path-filter case):
-#      The workflow has NO top-level `paths:` filter; the always-running
-#      job has steps gated on `if: needs.<gate>.outputs.<flag> == 'true'`
-#      so the no-op step alone fires when paths exclude the commit.
-#      Used by ci.yml's Platform/Canvas/Python/Shellcheck and by
-#      e2e-api.yml / e2e-staging-canvas.yml / runtime-prbuild-compat.yml.
-#
-#   2. Aggregator-with-needs+always() (matrix-refactor case):
-#      An aggregator job named after the protected check `needs:` the
-#      matrix children + uses `if: always()` + checks each child's
-#      result. (Not currently in this repo but supported.)
-#
-# Unsafe shape this script catches:
-#   - Workflow has top-level `paths:` filter AND the protected check
-#     name is on a single job. When paths-filter excludes a commit, the
-#     workflow doesn't fire — branch protection waits forever.
-#
-# Exit codes:
-#   0 — every required check name has at least one safe-shape match
-#   1 — a required name has no match OR matches an unsafe shape
-#   2 — script-internal error (apply.sh missing, awk failure, etc.)
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
-WORKFLOWS_DIR="$REPO_ROOT/.github/workflows"
-APPLY_SH="$SCRIPT_DIR/apply.sh"
-
-if [[ ! -f "$APPLY_SH" ]]; then
-  echo "check_name_parity: missing apply.sh at $APPLY_SH" >&2
-  exit 2
-fi
-if [[ ! -d "$WORKFLOWS_DIR" ]]; then
-  echo "check_name_parity: missing .github/workflows at $WORKFLOWS_DIR" >&2
-  exit 2
-fi
-
-# ─── Extract the union of required check names from apply.sh ──────
-# apply.sh has STAGING_CHECKS and MAIN_CHECKS heredocs; union them so
-# we audit any name that gates EITHER branch. Filters out blank lines
-# and the heredoc end marker. Sorted + uniq so the audit output is stable.
-#
-# Captures the heredoc end-marker dynamically from the `<<'MARKER'`
-# token on the opening line — the token can be `EOF` (production
-# apply.sh), `EOF2` (test fixtures with nested heredocs), or any other
-# bash-legal identifier. Without dynamic extraction, test fixtures
-# with nested heredocs would either skip-capture (wrong end marker)
-# or capture the inner end marker as a stray check name.
-#
-# Two-step approach to keep awk-portable across BSD awk (macOS) and
-# gawk (Linux): grep finds the heredoc-opening lines, sed extracts the
-# marker, then awk does the capture. Pure-awk attempts hit BSD-vs-GNU
-# regex/variable-init differences that regress silently — this shape
-# stays in POSIX-portable territory.
-extract_heredoc_block() {
-  local file="$1"
-  local marker="$2"
-  awk -v marker="$marker" '
-    $0 ~ "<<.?" marker { capture=1; next }
-    $0 == marker && capture { capture=0; next }
-    capture && NF { print }
-  ' "$file"
-}
-
-# Find every heredoc-end marker used in apply.sh (typically just EOF
-# in the production script, but EOF2 / TAG / ABC are all valid in
-# fixtures or future expansions). Each marker maps to one or more
-# heredoc blocks; we union all of them.
-markers=$(grep -E "<<['\"]?[A-Za-z0-9_]+['\"]?[[:space:]]*\\|\\|" "$APPLY_SH" \
-  | sed -E "s/.*<<['\"]?([A-Za-z0-9_]+)['\"]?.*/\\1/" \
-  | sort -u)
-
-required_names=""
-while IFS= read -r marker; do
-  [[ -z "$marker" ]] && continue
-  block=$(extract_heredoc_block "$APPLY_SH" "$marker")
-  if [[ -n "$block" ]]; then
-    required_names+="$block"$'\n'
-  fi
-done <<< "$markers"
-
-required_names=$(printf '%s' "$required_names" | sort -u | sed '/^$/d')
-
-if [[ -z "$required_names" ]]; then
-  echo "check_name_parity: failed to extract required check names from apply.sh" >&2
-  exit 2
-fi
-
-# ─── For each required name, find the workflow file that owns it ──
-# A workflow "owns" a name if any `name:` line in the file equals the
-# required name. We look at job-level names AND the workflow-level
-# `name:` (the latter prefixes "Analyze" jobs in codeql.yml).
-#
-# Then we check whether the owning workflow has a top-level `paths:`
-# filter. The unsafe shape is:
-#   - top-level paths: filter present
-#   - AND the named job is gated only at the workflow level (no per-
-#     step `if:` gates)
-#
-# Distinguishing "no `paths:` filter" from "paths: filter + per-step
-# gating" requires parsing the YAML semantics. We do it heuristically:
-#
-#   - "no top-level paths:"     → safe by construction (workflow always
-#                                  fires)
-#   - "paths: present"          → always flagged. If the file also has
-#                                  an `if: needs.<x>.outputs` gate it
-#                                  is reported as UNSAFE-MIX (gates
-#                                  can't run when the workflow never
-#                                  fires); otherwise UNSAFE-PATH-FILTER.
-#
-# Heuristic so it stays a portable bash + awk + grep tool — full YAML
-# parsing would need yq which isn't a dependency. The known unsafe
-# shape (workflow-level paths: AND no per-step if-gates) is what we're
-# trying to catch.
-
-failed=0
-declare -a unsafe_findings=()
-
-while IFS= read -r name; do
-  [[ -z "$name" ]] && continue
-  # Find every workflow file that contains a job with `name: <name>` or
-  # whose top-level workflow `name:` plus matrix substitution would
-  # produce <name>. Need to be careful about quoting — YAML allows
-  # `name: Foo`, `name: "Foo"`, `name: 'Foo'`. Strip quotes.
-  matches=()
-  while IFS= read -r f; do
-    # Look for an exact `name:` match (anywhere in the file). The
-    # workflow-level name line is at column 0; job-level names are
-    # indented. Either is acceptable for parity — what matters is
-    # whether the EMITTED check-run name is the one we required.
-    # Strip surrounding quotes/whitespace before comparing.
-    if awk -v want="$name" '
-      /^[[:space:]]*name:[[:space:]]*/ {
-        line = $0
-        sub(/^[[:space:]]*name:[[:space:]]*/, "", line)
-        # Strip surrounding " or '\''
-        gsub(/^["\047]|["\047]$/, "", line)
-        # Strip trailing whitespace + comment
-        sub(/[[:space:]]*#.*$/, "", line)
-        sub(/[[:space:]]+$/, "", line)
-        if (line == want) found = 1
-      }
-      END { exit !found }
-    ' "$f"; then
-      matches+=("$f")
-    fi
-  done < <(find "$WORKFLOWS_DIR" -name '*.yml' -o -name '*.yaml')
-
-  if [[ ${#matches[@]} -eq 0 ]]; then
-    # Special case — Analyze (go/javascript-typescript/python) is
-    # generated by codeql.yml's matrix expansion of `Analyze (${{
-    # matrix.language }})`. Don't flag those as missing if codeql.yml
-    # exists with the expected base name.
-    case "$name" in
-      "Analyze (go)"|"Analyze (javascript-typescript)"|"Analyze (python)")
-        # shellcheck disable=SC2016
-        # The literal `${{ matrix.language }}` is the GHA template
-        # syntax we're searching FOR — not a shell expansion. SC2016
-        # would have us add quotes that defeat the search.
-        if [[ -f "$WORKFLOWS_DIR/codeql.yml" ]] && \
-           grep -q 'name: Analyze (${{[[:space:]]*matrix.language[[:space:]]*}})' "$WORKFLOWS_DIR/codeql.yml"; then
-          matches=("$WORKFLOWS_DIR/codeql.yml")
-        fi
-        ;;
-    esac
-  fi
-
-  if [[ ${#matches[@]} -eq 0 ]]; then
-    unsafe_findings+=("MISSING: required check name '$name' has no matching workflow job")
-    failed=1
-    continue
-  fi
-
-  # For each owning workflow, classify safe vs unsafe.
-  for f in "${matches[@]}"; do
-    rel="${f#"$REPO_ROOT"/}"
-    # Heuristic: does the workflow have a top-level `paths:` filter?
-    # Top-level here means under the `on:` key, not under jobs.<x>.if.
-    # Workflow-level paths filters appear at indent depth 4 (under
-    # `push:` or `pull_request:`). Job-level `if:` paths-filter doesn't
-    # block the workflow from firing.
-    has_top_paths=0
-    if awk '
-      # Track whether we are inside the `on:` block. The `on:` block
-      # starts at column 0 (`on:` key) and ends when the next column-0
-      # key appears.
-      /^on:[[:space:]]*$/ { in_on = 1; next }
-      /^[a-zA-Z]/ && in_on { in_on = 0 }
-      in_on && /^[[:space:]]+paths:[[:space:]]*$/ { print "yes"; exit }
-      in_on && /^[[:space:]]+paths:[[:space:]]*\[/ { print "yes"; exit }
-    ' "$f" | grep -q yes; then
-      has_top_paths=1
-    fi
-
-    if [[ "$has_top_paths" -eq 0 ]]; then
-      # Safe: workflow always fires. If there are inner per-step if-
-      # gates (single-job-with-per-step-if pattern), the no-op step
-      # produces SUCCESS for the protected name — branch-protection-clean.
-      continue
-    fi
-
-    # Unsafe either way — the workflow has a top-level paths: filter.
-    # Look for any `if:` referencing a `needs.<job>.outputs.` value
-    # anywhere in the workflow file (the grep is file-wide, not scoped
-    # to the owning job) purely to pick the finding label: UNSAFE-MIX
-    # when one is present, UNSAFE-PATH-FILTER when none is.
-    #
-    # The regex is intentionally anchored loosely — actual workflow
-    # YAML writes per-step if-gates as `      - if: needs.X.outputs.Y`
-    # (with the `-` step-marker between the leading spaces and the
-    # `if`). Anchoring on `^[[:space:]]+if:` would miss those.
-    if grep -qE "if:[[:space:]]+needs\.[a-zA-Z_-]+\.outputs\." "$f"; then
-      # Per-step if-gates exist. Combined with top-level paths: this
-      # would be a buggy mix (the workflow might still skip entirely
-      # when paths exclude). Flag as unsafe — the safe pattern omits
-      # the top-level paths: filter altogether and gates per-step.
-      unsafe_findings+=("UNSAFE-MIX: $rel has top-level paths: AND per-step if-gates — when paths exclude the commit, the workflow doesn't fire and the required check '$name' is silently absent. Drop the top-level paths: filter; keep the per-step if-gates.")
-      failed=1
-    else
-      # Top-level paths: with no per-step if-gates: the canonical
-      # check-name parity bug.
-      unsafe_findings+=("UNSAFE-PATH-FILTER: $rel has top-level paths: filter and no per-step if-gates. When paths exclude the commit, no job emits the required check '$name' — branch protection waits forever. Either drop the paths: filter and add per-step if-gates against a detect-changes output, or add an aggregator-with-needs+always() job that emits '$name'.")
-      failed=1
-    fi
-  done
-done <<< "$required_names"
-
-if [[ "$failed" -eq 0 ]]; then
-  echo "check_name_parity: OK — every required check name maps to a safe workflow shape."
-  exit 0
-fi
-
-echo "check_name_parity: FOUND $((${#unsafe_findings[@]})) issue(s):" >&2
-for finding in "${unsafe_findings[@]}"; do
-  echo "  - $finding" >&2
-done
-exit 1
@@ -1,285 +0,0 @@
-#!/usr/bin/env bash
-# tools/branch-protection/test_check_name_parity.sh — unit tests for
-# check_name_parity.sh.
-#
-# Builds synthetic apply.sh + workflow files in a tmpdir for each case,
-# invokes a copy of the script placed in that tmpdir, and asserts
-# on exit code + stderr. Per feedback_assert_exact_not_substring we
-# pin the EXACT exit code AND a substring of the stderr that names the
-# offending workflow + name combo — so a "false-pass that prints the
-# wrong message" still fails the test.
-#
-# Run locally: bash tools/branch-protection/test_check_name_parity.sh
-# Run in CI:  same — added to ci.yml's shellcheck job's "E2E bash unit
-#             tests" step alongside test_model_slug.sh.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-SCRIPT_UNDER_TEST="$SCRIPT_DIR/check_name_parity.sh"
-
-if [[ ! -x "$SCRIPT_UNDER_TEST" ]]; then
-  echo "test_check_name_parity: script under test missing or not executable: $SCRIPT_UNDER_TEST" >&2
-  exit 2
-fi
-
-PASSED=0
-FAILED=0
-
-# Tracks the active tmpdir for the running case so the trap can clean
-# up even when assertions abort the case mid-flight.
-TMPDIR_FOR_CASE=""
-trap '[[ -n "$TMPDIR_FOR_CASE" && -d "$TMPDIR_FOR_CASE" ]] && rm -rf "$TMPDIR_FOR_CASE"' EXIT
-
-# Build a synthetic repo at $1 whose stub apply.sh lists $2 (one name
-# per line) as BOTH the staging and main required sets, and copy the
-# script under test into it. run_case adds the .github/workflows/ file.
-make_fake_repo() {
-  local root="$1"
-  local checks="$2"
-  mkdir -p "$root/tools/branch-protection"
-  mkdir -p "$root/.github/workflows"
-  cat > "$root/tools/branch-protection/apply.sh" <<EOF
-#!/usr/bin/env bash
-# Stub apply.sh — only the heredoc-shaped check lists matter for the
-# parity script. Other functions intentionally absent.
-
-read -r -d '' STAGING_CHECKS <<'EOF2' || true
-$checks
-EOF2
-
-read -r -d '' MAIN_CHECKS <<'EOF2' || true
-$checks
-EOF2
-EOF
-  chmod +x "$root/tools/branch-protection/apply.sh"
-  # Place the script-under-test alongside its sibling apply.sh so the
-  # script's REPO_ROOT walk finds the synthetic .github/workflows/.
-  cp "$SCRIPT_UNDER_TEST" "$root/tools/branch-protection/check_name_parity.sh"
-}
-
-run_case() {
-  local desc="$1"
-  local checks="$2"
-  local workflow_yaml="$3"   # contents to write
-  local workflow_filename="$4"
-  local expected_exit="$5"
-  local expected_stderr_substring="$6"
-  TMPDIR_FOR_CASE=$(mktemp -d)
-  make_fake_repo "$TMPDIR_FOR_CASE" "$checks"
-  printf '%s' "$workflow_yaml" > "$TMPDIR_FOR_CASE/.github/workflows/$workflow_filename"
-  local stderr_file
-  stderr_file=$(mktemp)
-  local actual_exit=0
-  bash "$TMPDIR_FOR_CASE/tools/branch-protection/check_name_parity.sh" 2>"$stderr_file" >/dev/null || actual_exit=$?
-  local stderr_content
-  stderr_content=$(cat "$stderr_file")
-  rm "$stderr_file"
-  if [[ "$actual_exit" -ne "$expected_exit" ]]; then
-    echo "FAIL: $desc"
-    echo "  expected exit: $expected_exit, got: $actual_exit"
-    echo "  stderr: $stderr_content"
-    FAILED=$((FAILED+1))
-    rm -rf "$TMPDIR_FOR_CASE"; TMPDIR_FOR_CASE=""
-    return
-  fi
-  # Empty expected substring → no assertion on stderr (used for the
-  # passing case where stderr should be empty / not interesting).
-  if [[ -n "$expected_stderr_substring" ]]; then
-    if ! grep -qF "$expected_stderr_substring" <<< "$stderr_content"; then
-      echo "FAIL: $desc"
-      echo "  expected stderr to contain: '$expected_stderr_substring'"
-      echo "  actual stderr: $stderr_content"
-      FAILED=$((FAILED+1))
-      rm -rf "$TMPDIR_FOR_CASE"; TMPDIR_FOR_CASE=""
-      return
-    fi
-  fi
-  echo "PASS: $desc"
-  PASSED=$((PASSED+1))
-  rm -rf "$TMPDIR_FOR_CASE"; TMPDIR_FOR_CASE=""
-}
-
-# Case 1: safe workflow — no top-level paths: filter, single job
-# emitting the required name. Should exit 0.
-run_case "safe: no paths filter, job emits required name" \
-  "Foo Build" \
-  "$(cat <<'EOF'
-name: Foo
-
-on:
-  push:
-    branches: [main]
-  pull_request:
-
-jobs:
-  foo:
-    name: Foo Build
-    runs-on: ubuntu-latest
-    steps:
-      - run: echo ok
-EOF
-)" \
-  "foo.yml" \
-  0 \
-  ""
-
-# Case 2: unsafe — top-level paths: filter AND no per-step if-gates.
-# This is the silent-block shape from the saved memory.
-run_case "unsafe: top-level paths: filter without per-step if-gates" \
-  "Bar Build" \
-  "$(cat <<'EOF'
-name: Bar
-
-on:
-  push:
-    branches: [main]
-    paths:
-      - 'bar/**'
-  pull_request:
-    paths:
-      - 'bar/**'
-
-jobs:
-  bar:
-    name: Bar Build
-    runs-on: ubuntu-latest
-    steps:
-      - run: echo ok
-EOF
-)" \
-  "bar.yml" \
-  1 \
-  "UNSAFE-PATH-FILTER"
-
-# Case 3: required name has no emitter at all.
-run_case "missing: required name not in any workflow" \
-  "Nonexistent Job" \
-  "$(cat <<'EOF'
-name: Other
-
-on:
-  pull_request:
-
-jobs:
-  other:
-    name: Other Job
-    runs-on: ubuntu-latest
-    steps:
-      - run: echo ok
-EOF
-)" \
-  "other.yml" \
-  1 \
-  "MISSING: required check name 'Nonexistent Job'"
-
-# Case 4: safe — top-level paths: filter is absent BUT per-step if-
-# gates are present (single-job-with-per-step-if pattern, what
-# ci.yml + e2e-api.yml use). Should exit 0.
-run_case "safe: per-step if-gates without top-level paths" \
-  "Baz Build" \
-  "$(cat <<'EOF'
-name: Baz
-
-on:
-  push:
-    branches: [main]
-  pull_request:
-
-jobs:
-  changes:
-    name: Detect changes
-    runs-on: ubuntu-latest
-    outputs:
-      baz: ${{ steps.check.outputs.baz }}
-    steps:
-      - id: check
-        run: echo "baz=true" >> "$GITHUB_OUTPUT"
-
-  baz:
-    needs: changes
-    name: Baz Build
-    runs-on: ubuntu-latest
-    steps:
-      - if: needs.changes.outputs.baz != 'true'
-        run: echo no-op
-      - if: needs.changes.outputs.baz == 'true'
-        run: echo real work
-EOF
-)" \
-  "baz.yml" \
-  0 \
-  ""
-
-# Case 5: unsafe-mix — top-level paths: AND per-step if-gates. The
-# script flags this distinctly because the workflow may STILL skip
-# entirely when paths exclude the commit (the per-step gates only
-# matter if the workflow actually fires).
-run_case "unsafe-mix: top-level paths: AND per-step if-gates" \
-  "Qux Build" \
-  "$(cat <<'EOF'
-name: Qux
-
-on:
-  push:
-    branches: [main]
-    paths:
-      - 'qux/**'
-  pull_request:
-    paths:
-      - 'qux/**'
-
-jobs:
-  changes:
-    name: Detect changes
-    runs-on: ubuntu-latest
-    outputs:
-      qux: ${{ steps.check.outputs.qux }}
-    steps:
-      - id: check
-        run: echo "qux=true" >> "$GITHUB_OUTPUT"
-
-  qux:
-    needs: changes
-    name: Qux Build
-    runs-on: ubuntu-latest
-    steps:
-      - if: needs.changes.outputs.qux == 'true'
-        run: echo build
-EOF
-)" \
-  "qux.yml" \
-  1 \
-  "UNSAFE-MIX"
-
-# Case 6: codeql.yml matrix — required names like "Analyze (go)" are
-# generated by `Analyze (${{ matrix.language }})`. Script must
-# special-case match this pattern.
-run_case "matrix: codeql Analyze (go) is recognised via matrix expansion" \
-  "$(printf 'Analyze (go)\nAnalyze (javascript-typescript)\nAnalyze (python)')" \
-  "$(cat <<'EOF'
-name: CodeQL
-
-on:
-  pull_request:
-
-jobs:
-  analyze:
-    name: Analyze (${{ matrix.language }})
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        language: [go, javascript-typescript, python]
-    steps:
-      - run: echo analyse
-EOF
-)" \
-  "codeql.yml" \
-  0 \
-  ""
-
-echo ""
-echo "================================================"
-echo "test_check_name_parity: $PASSED passed, $FAILED failed"
-echo "================================================"
-exit "$FAILED"
@@ -18,7 +18,7 @@
 #
 # Or inline via curl:
 #
-#     bash <(curl -fsSL https://raw.githubusercontent.com/Molecule-AI/molecule-core/main/tools/check-template-parity.sh) \
+#     bash <(curl -fsSL https://git.moleculesai.app/molecule-ai/molecule-core/raw/branch/main/tools/check-template-parity.sh) \
 #          install.sh start.sh
 #
 # Exit codes:
@@ -1,15 +1,7 @@
-# Platform-only image (no canvas). Used by publish-workspace-server-image
-# workflow for ECR. Tenant image uses Dockerfile.tenant instead.
+# Platform-only image (no canvas). Used by publish-platform-image workflow
+# for GHCR + Fly registry. Tenant image uses Dockerfile.tenant instead.
 #
-# Templates + plugins are pre-cloned by scripts/clone-manifest.sh (in CI
-# or on the operator host) into .tenant-bundle-deps/ — same pattern as
-# Dockerfile.tenant. See that file's header for the full rationale; the
-# short version is that post-2026-05-06 every workspace-template-* and
-# org-template-* repo on Gitea is private, so an in-image `git clone`
-# has no auth path that doesn't leak the Gitea token into a layer.
-#
-# Build context: repo root, with `.tenant-bundle-deps/` populated by the
-# workflow's "Pre-clone manifest deps" step (Task #173).
+# Build context: repo root.

 FROM golang:1.25-alpine AS builder
 WORKDIR /app
@@ -34,18 +26,21 @@ RUN CGO_ENABLED=0 GOOS=linux go build \
    -ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
    -o /memory-plugin ./cmd/memory-plugin-postgres

+# Clone templates + plugins at build time from manifest.json
+FROM alpine:3.20 AS templates
+RUN apk add --no-cache git jq
+COPY manifest.json /manifest.json
+COPY scripts/clone-manifest.sh /scripts/clone-manifest.sh
+RUN chmod +x /scripts/clone-manifest.sh && /scripts/clone-manifest.sh /manifest.json /workspace-configs-templates /org-templates /plugins
+
 FROM alpine:3.20
 RUN apk add --no-cache ca-certificates git tzdata wget
 COPY --from=builder /platform /platform
 COPY --from=builder /memory-plugin /memory-plugin
 COPY workspace-server/migrations /migrations
-# Templates + plugins (pre-cloned by scripts/clone-manifest.sh in the
-# trusted CI / operator-host context, .git already stripped). The Gitea
-# token used to clone them never enters this image — same shape as
-# Dockerfile.tenant.
-COPY .tenant-bundle-deps/workspace-configs-templates /workspace-configs-templates
-COPY .tenant-bundle-deps/org-templates /org-templates
-COPY .tenant-bundle-deps/plugins /plugins
+COPY --from=templates /workspace-configs-templates /workspace-configs-templates
+COPY --from=templates /org-templates /org-templates
+COPY --from=templates /plugins /plugins
 # Non-root runtime with Docker socket access for workspace provisioning.
 RUN addgroup -g 1000 platform && adduser -u 1000 -G platform -s /bin/sh -D platform
 EXPOSE 8080
@@ -3,34 +3,14 @@
 # Serves both the API (Go on :8080) and the UI (Node.js on :3000) in a
 # single container. Go reverse-proxies unknown routes to canvas.
 #
-# Templates + plugins are NOT cloned at build time. They are pre-cloned
-# in the trusted CI context (or operator host) by
-# `scripts/clone-manifest.sh` into `.tenant-bundle-deps/` and COPYed in.
-# The reason: post-2026-05-06, every workspace-template-* repo on Gitea
-# (codex, crewai, deepagents, gemini-cli, langgraph) plus all 7
-# org-template-* repos are private, so the Docker build can't `git clone`
-# from inside the build context — there's no auth path that doesn't leak
-# the Gitea token into an image layer. Pre-cloning keeps the token in
-# the CI environment only; the resulting image carries the cloned trees
-# with `.git` already stripped (see clone-manifest.sh).
+# Templates and plugins are cloned from standalone GitHub repos at build
+# time so the monorepo doesn't need to carry them. The repos are public; no auth.
 #
-# Build context: repo root, with `.tenant-bundle-deps/` populated by:
-#
-#     MOLECULE_GITEA_TOKEN=<persona-PAT> scripts/clone-manifest.sh \
-#       manifest.json \
-#       .tenant-bundle-deps/workspace-configs-templates \
-#       .tenant-bundle-deps/org-templates \
-#       .tenant-bundle-deps/plugins
-#
-# In CI this happens in publish-workspace-server-image.yml's "Pre-clone
-# manifest deps" step (uses AUTO_SYNC_TOKEN = devops-engineer persona).
-# For a manual operator-host build, source the same token from
-# /etc/molecule-bootstrap/agent-secrets.env first.
+# Build context: repo root.
 #
 #   docker buildx build --platform linux/amd64 \
 #     -f workspace-server/Dockerfile.tenant \
-#     -t <ECR>/molecule-ai/platform-tenant:latest \
-#     --build-arg GIT_SHA=<sha> --build-arg NEXT_PUBLIC_PLATFORM_URL= \
+#     -t registry.fly.io/molecule-tenant:latest \
 #     --push .

 # ── Stage 1: Go platform binary ──────────────────────────────────────
@@ -75,7 +55,14 @@ ENV NEXT_PUBLIC_PLATFORM_URL=$NEXT_PUBLIC_PLATFORM_URL
 ENV NEXT_PUBLIC_WS_URL=$NEXT_PUBLIC_WS_URL
 RUN npm run build

-# ── Stage 3: Runtime ──────────────────────────────────────────────────
+# ── Stage 3: Clone templates + plugins from manifest.json ─────────────
+FROM alpine:3.20 AS templates
+RUN apk add --no-cache git jq
+COPY manifest.json /manifest.json
+COPY scripts/clone-manifest.sh /scripts/clone-manifest.sh
+RUN chmod +x /scripts/clone-manifest.sh && /scripts/clone-manifest.sh /manifest.json /workspace-configs-templates /org-templates /plugins
+
+# ── Stage 4: Runtime ──────────────────────────────────────────────────
 FROM node:20-alpine
 RUN apk add --no-cache ca-certificates git tzdata openssh-client aws-cli

@@ -100,13 +87,10 @@ COPY --from=go-builder /platform /platform
 COPY --from=go-builder /memory-plugin /memory-plugin
 COPY workspace-server/migrations /migrations

-# Templates + plugins (pre-cloned by scripts/clone-manifest.sh in the
-# trusted CI / operator-host context, .git already stripped — see
-# .tenant-bundle-deps/ in the build context). The Gitea token used to
-# clone them never enters this image.
-COPY .tenant-bundle-deps/workspace-configs-templates /workspace-configs-templates
-COPY .tenant-bundle-deps/org-templates /org-templates
-COPY .tenant-bundle-deps/plugins /plugins
+# Templates + plugins (cloned from GitHub in stage 3)
+COPY --from=templates /workspace-configs-templates /workspace-configs-templates
+COPY --from=templates /org-templates /org-templates
+COPY --from=templates /plugins /plugins

 # Canvas standalone
 WORKDIR /canvas
@@ -1,89 +0,0 @@
-package main
-
-import "testing"
-
-// TestResolveBindHost pins the precedence: BIND_ADDR explicit > dev-mode
-// fail-open default of 127.0.0.1 > production-shape empty (all interfaces).
-//
-// Mutation-test invariant: removing the IsDevModeFailOpen() branch makes
-// "no_bindaddr_devmode_unset_admin" fail (returns "" instead of "127.0.0.1").
-// Removing the BIND_ADDR branch makes "explicit_bindaddr_*" cases fail.
-func TestResolveBindHost(t *testing.T) {
-	cases := []struct {
-		name       string
-		bindAddr   string
-		adminToken string
-		molEnv     string
-		want       string
-	}{
-		{
-			name:       "no_bindaddr_devmode_unset_admin",
-			bindAddr:   "",
-			adminToken: "",
-			molEnv:     "dev",
-			want:       "127.0.0.1",
-		},
-		{
-			name:       "no_bindaddr_devmode_unset_admin_full_word",
-			bindAddr:   "",
-			adminToken: "",
-			molEnv:     "development",
-			want:       "127.0.0.1",
-		},
-		{
-			name:       "no_bindaddr_admin_set_in_dev_env",
-			bindAddr:   "",
-			adminToken: "secret",
-			molEnv:     "dev",
-			want:       "", // ADMIN_TOKEN flips IsDevModeFailOpen to false → all interfaces
-		},
-		{
-			name:       "no_bindaddr_production_env",
-			bindAddr:   "",
-			adminToken: "",
-			molEnv:     "production",
-			want:       "", // production is not a dev value → all interfaces
-		},
-		{
-			name:       "no_bindaddr_unset_env",
-			bindAddr:   "",
-			adminToken: "",
-			molEnv:     "",
-			want:       "", // unset MOLECULE_ENV → not dev → all interfaces
-		},
-		{
-			name:       "explicit_bindaddr_loopback_overrides_devmode",
-			bindAddr:   "127.0.0.1",
-			adminToken: "",
-			molEnv:     "dev",
-			want:       "127.0.0.1",
-		},
-		{
-			name:       "explicit_bindaddr_wildcard_overrides_devmode_default",
-			bindAddr:   "0.0.0.0",
-			adminToken: "",
-			molEnv:     "dev",
-			want:       "0.0.0.0",
-		},
-		{
-			name:       "explicit_bindaddr_in_production",
-			bindAddr:   "10.0.5.7",
-			adminToken: "secret",
-			molEnv:     "production",
-			want:       "10.0.5.7",
-		},
-	}
-
-	for _, tc := range cases {
-		t.Run(tc.name, func(t *testing.T) {
-			t.Setenv("BIND_ADDR", tc.bindAddr)
-			t.Setenv("ADMIN_TOKEN", tc.adminToken)
-			t.Setenv("MOLECULE_ENV", tc.molEnv)
-			got := resolveBindHost()
-			if got != tc.want {
-				t.Errorf("resolveBindHost() = %q, want %q (BIND_ADDR=%q ADMIN_TOKEN=%q MOLECULE_ENV=%q)",
-					got, tc.want, tc.bindAddr, tc.adminToken, tc.molEnv)
-			}
-		})
-	}
-}
@@ -19,7 +19,6 @@ import (
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/handlers"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/imagewatch"
 	memwiring "github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/wiring"
-	"github.com/Molecule-AI/molecule-monorepo/platform/internal/middleware"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/pendinguploads"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/registry"
@@ -249,6 +248,19 @@ func main() {
 		})
 	}

+	// CP-mode orphan sweeper — SaaS counterpart to the Docker sweeper
+	// above. Re-issues cpProv.Stop for any workspace at status='removed'
+	// with a non-NULL instance_id, healing the deprovision split-write
+	// race documented in #2989: tenant marks status='removed' BEFORE
+	// calling CP DELETE, so a transient CP failure leaves the EC2
+	// running with no retry path. cpProv.Stop is idempotent against
+	// already-terminated instances; on success we clear instance_id.
+	if cpProv != nil {
+		go supervised.RunWithRecover(ctx, "cp-orphan-sweeper", func(c context.Context) {
+			registry.StartCPOrphanSweeper(c, cpProv)
+		})
+	}
+
 	// Pending-uploads GC sweep — deletes acked rows past their retention
 	// window plus unacked rows past expires_at. Without this the
 	// pending_uploads table grows unbounded; even with the 24h hard TTL,
@@ -320,23 +332,15 @@ func main() {
 	// Router
 	r := router.Setup(hub, broadcaster, prov, platformURL, configsDir, wh, channelMgr, memBundle)

-	// HTTP server with graceful shutdown.
-	//
-	// Bind host: in dev-mode (no ADMIN_TOKEN, MOLECULE_ENV=dev|development)
-	// the AdminAuth chain fails open by design; pairing that with a wildcard
-	// bind would expose unauth /workspaces to any same-LAN peer. Default to
-	// loopback when fail-open is active. Operators who need LAN exposure set
-	// BIND_ADDR=0.0.0.0 explicitly. Production (ADMIN_TOKEN set) is unchanged.
-	// See molecule-core#7.
-	bindHost := resolveBindHost()
+	// HTTP server with graceful shutdown
 	srv := &http.Server{
-		Addr:    fmt.Sprintf("%s:%s", bindHost, port),
+		Addr:    fmt.Sprintf(":%s", port),
 		Handler: r,
 	}

 	// Start server in goroutine
 	go func() {
-		log.Printf("Platform starting on %s:%s (dev-mode-fail-open=%v)", bindHost, port, middleware.IsDevModeFailOpen())
+		log.Printf("Platform starting on :%s", port)
 		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
 			log.Fatalf("Server failed: %v", err)
 		}
@@ -371,29 +375,6 @@ func envOr(key, fallback string) string {
 	return fallback
 }

-// resolveBindHost picks the listener interface for the HTTP server.
-//
-// Precedence:
-//  1. BIND_ADDR — explicit operator override (any value, including "0.0.0.0").
-//  2. dev-mode fail-open active → "127.0.0.1" (loopback only).
-//  3. otherwise → "" (Go binds every interface; existing prod/self-host shape).
-//
-// Coupling the loopback default to middleware.IsDevModeFailOpen() means the
-// two safety levers — bind narrowness and auth strength — move together. A
-// production deploy (ADMIN_TOKEN set) keeps binding to all interfaces because
-// the auth chain is doing its job; a dev Mac (no ADMIN_TOKEN, MOLECULE_ENV=dev)
-// is reachable only via loopback because the auth chain is fail-open. See
-// molecule-core#7 for the original LAN exposure finding.
-func resolveBindHost() string {
-	if v := os.Getenv("BIND_ADDR"); v != "" {
-		return v
-	}
-	if middleware.IsDevModeFailOpen() {
-		return "127.0.0.1"
-	}
-	return ""
-}
-
 func findConfigsDir() string {
 	candidates := []string{
 		"workspace-configs-templates",
@@ -413,56 +413,11 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
 		return http.StatusOK, respBody, nil
 	}

-	// Mock-runtime short-circuit. Workspaces with runtime='mock' have
-	// no container, no EC2, no URL — every reply is synthesised here
-	// from a small canned-variant pool. Built for the "200-workspace
-	// mock org" demo: a CEO/VPs/Managers/ICs hierarchy that renders
-	// at scale on the canvas without burning real LLM credits or
-	// provisioning 200 EC2 instances. See mock_runtime.go for the
-	// full rationale + reply shape contract.
-	//
-	// Position: AFTER poll-mode (mock isn't a delivery mode, it's a
-	// runtime; treating poll-set-on-mock as poll matches operator
-	// intent if anyone ever does that), BEFORE resolveAgentURL (mock
-	// has no URL — going through resolveAgentURL would 404 on the
-	// SELECT url since the row is provisioned as NULL).
-	if status, respBody, handled := h.handleMockA2A(ctx, workspaceID, callerID, body, a2aMethod, logActivity); handled {
-		return status, respBody, nil
-	}
-
 	agentURL, proxyErr := h.resolveAgentURL(ctx, workspaceID)
 	if proxyErr != nil {
 		return 0, nil, proxyErr
 	}

-	// Pre-flight container-health check (#36). The dispatchA2A path below
-	// does Docker-DNS forwarding to `ws-<wsShort>:8000` and only catches a
-	// missing/dead container REACTIVELY via maybeMarkContainerDead in
-	// handleA2ADispatchError. That works but costs the caller a full
-	// network-timeout (2-30s) before the structured 503 surfaces.
-	//
-	// When we KNOW the workspace is container-backed (h.docker != nil + we
-	// rewrite to Docker-DNS form below), do a single proactive
-	// RunningContainerName lookup. If the container is genuinely missing,
-	// short-circuit with the same structured 503 + async restart that
-	// maybeMarkContainerDead would produce — but immediately, without the
-	// network round-trip.
-	//
-	// Three outcomes of provisioner.RunningContainerName(ctx, h.docker, id):
-	//   ("ws-<id>", nil) → forward as today.
-	//   ("",        nil) → container is genuinely not running. Fast-503.
-	//   ("",        err) → transient daemon error. Fall through to optimistic
-	//                       forward — matches Provisioner.IsRunning's
-	//                       (true, err) "fail-soft as alive" contract.
-	//
-	// Same SSOT as findRunningContainer (#10/#12). See AST gate
-	// TestProxyA2A_RoutesThroughProvisionerSSOT.
-	if h.provisioner != nil && platformInDocker && strings.HasPrefix(agentURL, "http://"+provisioner.ContainerName(workspaceID)+":") {
-		if proxyErr := h.preflightContainerHealth(ctx, workspaceID); proxyErr != nil {
-			return 0, nil, proxyErr
-		}
-	}
-
 	startTime := time.Now()
 	resp, cancelFwd, err := h.dispatchA2A(ctx, workspaceID, agentURL, body, callerID)
 	if cancelFwd != nil {
@@ -198,60 +198,6 @@ func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspace
 	return true
 }

-// preflightContainerHealth runs a proactive Provisioner.IsRunning check
-// (#36) before dispatching the a2a forward. Routed through provisioner's
-// SSOT IsRunning, which itself wraps RunningContainerName — same source
-// as findRunningContainer in the plugins handler (#10/#12).
-//
-// Returns nil when the forward should proceed:
-//   - container is running, OR
-//   - daemon errored transiently (matches IsRunning's (true, err)
-//     "fail-soft as alive" contract — let the optimistic forward run
-//     and reactive maybeMarkContainerDead catch a real failure).
-//
-// Returns a structured 503 + triggers the same async restart that
-// maybeMarkContainerDead would produce, when:
-//   - container is genuinely not running (NotFound / Exited / Created…).
-//
-// The point of running this BEFORE the forward is to save the caller
-// 2-30s of network-timeout cost when the container is missing — a common
-// shape post-EC2-replace (see molecule-controlplane#20 incident
-// 2026-05-07) where the reconciler hasn't respawned the agent yet.
-func (h *WorkspaceHandler) preflightContainerHealth(ctx context.Context, workspaceID string) *proxyA2AError {
-	running, err := h.provisioner.IsRunning(ctx, workspaceID)
-	if err != nil {
-		// Transient daemon error. Provisioner.IsRunning returns (true, err)
-		// in this case — fall through to the optimistic forward, reactive
-		// maybeMarkContainerDead handles a real failure later.
-		log.Printf("ProxyA2A preflight: IsRunning transient error for %s: %v (proceeding with forward)", workspaceID, err)
-		return nil
-	}
-	if running {
-		// Container is running — forward as today.
-		return nil
-	}
-	// Container is genuinely not running. Mark offline + trigger restart
-	// (same effect as maybeMarkContainerDead's branch), and return the
-	// structured 503 immediately so the caller skips the forward.
-	log.Printf("ProxyA2A preflight: container for %s is not running — marking offline and triggering restart (#36)", workspaceID)
-	if _, dbErr := db.DB.ExecContext(ctx,
-		`UPDATE workspaces SET status = $1, updated_at = now() WHERE id = $2 AND status NOT IN ('removed', 'provisioning')`,
-		models.StatusOffline, workspaceID); dbErr != nil {
-		log.Printf("ProxyA2A preflight: failed to mark workspace %s offline: %v", workspaceID, dbErr)
-	}
-	db.ClearWorkspaceKeys(ctx, workspaceID)
-	h.broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceOffline), workspaceID, map[string]interface{}{})
-	go h.RestartByID(workspaceID)
-	return &proxyA2AError{
-		Status: http.StatusServiceUnavailable,
-		Response: gin.H{
-			"error":      "workspace container not running — restart triggered",
-			"restarting": true,
-			"preflight":  true, // distinguishes from reactive containerDead path
-		},
-	}
-}
-
 // logA2AFailure records a failed A2A attempt to activity_logs in a detached
 // goroutine (the request context may already be done by the time it runs).
 func (h *WorkspaceHandler) logA2AFailure(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string, err error, durationMs int) {
@@ -1,194 +0,0 @@
-package handlers
-
-import (
-	"context"
-	"errors"
-	"go/ast"
-	"go/parser"
-	"go/token"
-	"testing"
-
-	"github.com/DATA-DOG/go-sqlmock"
-	"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
-	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
-)
-
-// preflightLocalProv is a controllable LocalProvisionerAPI stub for the
-// preflight tests (#36). Other API methods panic to guard against tests
-// that should be using a different stub.
-type preflightLocalProv struct {
-	running    bool
-	err        error
-	calls      int
-	calledWith []string
-}
-
-func (p *preflightLocalProv) IsRunning(_ context.Context, workspaceID string) (bool, error) {
-	p.calls++
-	p.calledWith = append(p.calledWith, workspaceID)
-	return p.running, p.err
-}
-func (p *preflightLocalProv) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) {
-	panic("preflightLocalProv: Start not implemented")
-}
-func (p *preflightLocalProv) Stop(_ context.Context, _ string) error {
-	panic("preflightLocalProv: Stop not implemented")
-}
-func (p *preflightLocalProv) ExecRead(_ context.Context, _, _ string) ([]byte, error) {
-	panic("preflightLocalProv: ExecRead not implemented")
-}
-func (p *preflightLocalProv) RemoveVolume(_ context.Context, _ string) error {
-	panic("preflightLocalProv: RemoveVolume not implemented")
-}
-func (p *preflightLocalProv) VolumeHasFile(_ context.Context, _, _ string) (bool, error) {
-	panic("preflightLocalProv: VolumeHasFile not implemented")
-}
-func (p *preflightLocalProv) WriteAuthTokenToVolume(_ context.Context, _, _ string) error {
-	panic("preflightLocalProv: WriteAuthTokenToVolume not implemented")
-}
-
-// TestPreflight_ContainerRunning_ReturnsNil — IsRunning(true,nil): forward
-// proceeds. preflight returns nil → caller continues to dispatchA2A.
-func TestPreflight_ContainerRunning_ReturnsNil(t *testing.T) {
-	_ = setupTestDB(t)
-	stub := &preflightLocalProv{running: true, err: nil}
-	h := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
-	h.provisioner = stub
-
-	if err := h.preflightContainerHealth(context.Background(), "ws-running-123"); err != nil {
-		t.Fatalf("preflight should return nil when container running, got %+v", err)
-	}
-	if stub.calls != 1 {
-		t.Errorf("IsRunning should be called exactly once, got %d", stub.calls)
-	}
-	if len(stub.calledWith) != 1 || stub.calledWith[0] != "ws-running-123" {
-		t.Errorf("IsRunning should be called with workspace id, got %v", stub.calledWith)
-	}
-}
-
-// TestPreflight_ContainerNotRunning_StructuredFastFail — IsRunning(false,nil):
-// preflight returns structured 503 with restarting=true + preflight=true, AND
-// triggers the offline-flip + WORKSPACE_OFFLINE broadcast + async restart.
-// This is the load-bearing case — saves the caller 2-30s of network timeout.
-func TestPreflight_ContainerNotRunning_StructuredFastFail(t *testing.T) {
-	mock := setupTestDB(t)
-	_ = setupTestRedis(t)
-	stub := &preflightLocalProv{running: false, err: nil}
-	h := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
-	h.provisioner = stub
-
-	// Expect the offline-flip UPDATE.
-	mock.ExpectExec(`UPDATE workspaces SET status =`).
-		WithArgs(models.StatusOffline, "ws-dead-456").
-		WillReturnResult(sqlmock.NewResult(0, 1))
-	// Broadcaster's INSERT INTO structure_events fires too — best-effort
-	// log entry for the WORKSPACE_OFFLINE event. Match permissively.
-	mock.ExpectExec(`INSERT INTO structure_events`).
-		WillReturnResult(sqlmock.NewResult(0, 1))
-
-	proxyErr := h.preflightContainerHealth(context.Background(), "ws-dead-456")
-	if proxyErr == nil {
-		t.Fatal("preflight should return *proxyA2AError when container not running")
-	}
-	if proxyErr.Status != 503 {
-		t.Errorf("expected 503, got %d", proxyErr.Status)
-	}
-	if got := proxyErr.Response["restarting"]; got != true {
-		t.Errorf("response should mark restarting=true, got %v", got)
-	}
-	if got := proxyErr.Response["preflight"]; got != true {
-		t.Errorf("response should mark preflight=true so callers can distinguish from reactive containerDead, got %v", got)
-	}
-	if got := proxyErr.Response["error"]; got != "workspace container not running — restart triggered" {
-		t.Errorf("error message mismatch, got %q", got)
-	}
-
-	// Note: broadcaster firing is exercised by the production path's
-	// h.broadcaster.RecordAndBroadcast call but not asserted here — the
-	// real *events.Broadcaster doesn't expose received events for inspection.
-	// The DB UPDATE expectation is sufficient to pin the offline-flip path.
-}
-
-// TestPreflight_TransientError_FailsSoftAsAlive — IsRunning(true,err): the
-// (true, err) "fail-soft" contract — preflight returns nil so the optimistic
-// forward runs; reactive maybeMarkContainerDead handles a real failure later.
-// This pin is critical: a flaky daemon must NOT trigger a restart cascade.
-func TestPreflight_TransientError_FailsSoftAsAlive(t *testing.T) {
-	_ = setupTestDB(t)
-	stub := &preflightLocalProv{running: true, err: errors.New("docker daemon EOF")}
-	h := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
-	h.provisioner = stub
-
-	if err := h.preflightContainerHealth(context.Background(), "ws-flaky-789"); err != nil {
-		t.Fatalf("preflight should return nil on transient error (fail-soft), got %+v", err)
-	}
-	// No DB UPDATE expected — sqlmock would complain about unexpected calls
-	// at test cleanup if the offline-flip path fired.
-}
-
-// TestProxyA2A_Preflight_RoutesThroughProvisionerSSOT — AST gate (#36 mirror
-// of #12's gate). Pins the invariant that preflightContainerHealth uses the
-// SSOT Provisioner.IsRunning helper, NOT a parallel docker.ContainerInspect
-// of its own.
-//
-// Mutation invariant: if a future PR replaces h.provisioner.IsRunning with
-// a direct cli.ContainerInspect call, this test fails. That's the signal to
-// either (a) extend Provisioner.IsRunning's contract OR (b) document why
-// this call site needs to differ. Either way, the drift gets a reviewer's
-// attention instead of shipping silently.
-func TestProxyA2A_Preflight_RoutesThroughProvisionerSSOT(t *testing.T) {
-	fset := token.NewFileSet()
-	file, err := parser.ParseFile(fset, "a2a_proxy_helpers.go", nil, parser.ParseComments)
-	if err != nil {
-		t.Fatalf("parse a2a_proxy_helpers.go: %v", err)
-	}
-
-	var fn *ast.FuncDecl
-	ast.Inspect(file, func(n ast.Node) bool {
-		f, ok := n.(*ast.FuncDecl)
-		if !ok || f.Name.Name != "preflightContainerHealth" {
-			return true
-		}
-		fn = f
-		return false
-	})
-	if fn == nil {
-		t.Fatal("preflightContainerHealth not found — was it renamed? update this gate or the SSOT routing assumption")
-	}
-
-	var (
-		callsIsRunning             bool
-		callsContainerInspectRaw   bool
-		callsRunningContainerNameDirect bool
-	)
-	ast.Inspect(fn.Body, func(n ast.Node) bool {
-		call, ok := n.(*ast.CallExpr)
-		if !ok {
-			return true
-		}
-		sel, ok := call.Fun.(*ast.SelectorExpr)
-		if !ok {
-			return true
-		}
-		switch sel.Sel.Name {
-		case "IsRunning":
-			callsIsRunning = true
-		case "ContainerInspect":
-			callsContainerInspectRaw = true
-		case "RunningContainerName":
-			// Direct RunningContainerName is also acceptable SSOT — but
-			// preferring IsRunning keeps the (bool, error) contract that
-			// already exists in the helper API surface.
-			callsRunningContainerNameDirect = true
-		}
-		return true
-	})
-
-	if !callsIsRunning && !callsRunningContainerNameDirect {
-		t.Errorf("preflightContainerHealth must call provisioner.IsRunning OR provisioner.RunningContainerName for the SSOT health check — see molecule-core#36. Found neither.")
-	}
-	if callsContainerInspectRaw {
-		t.Errorf("preflightContainerHealth carries a direct ContainerInspect call. This is the parallel-impl drift molecule-core#36 fixed. " +
-			"Either route through provisioner.IsRunning OR — if a new use case truly needs a different inspect — extend the helper's contract first and update this gate to allow the specific delta.")
-	}
-}
@@ -108,18 +108,6 @@ type eicTunnelPool struct {
 	// First acquirer takes the slot; later ones wait on the channel.
 	pendingSetups map[string]chan struct{}
 	stopJanitor   chan struct{}
-	// janitorInterval is captured at pool construction from the
-	// package-level poolJanitorInterval var. Captured (not re-read on
-	// every tick) so a test that swaps the package var via t.Cleanup
-	// after a global pool's janitor is already running can't race
-	// with that goroutine's ticker read. The global pool is created
-	// lazily once per process via sync.Once; before this capture
-	// landed, every test that touched poolJanitorInterval after the
-	// global pool's first-touch raced the janitor (caught by -race
-	// on staging tip 249dbc6a — TestPooledWithEICTunnel_PanicPoisonsEntry).
-	// Tests still get the new value on a freshPool() because they
-	// set the package var BEFORE calling newEICTunnelPool().
-	janitorInterval time.Duration
 }

 var (
@@ -139,16 +127,11 @@ func getEICTunnelPool() *eicTunnelPool {

 // newEICTunnelPool constructs an empty pool. Kept separate so tests can
 // build isolated pools without sharing the singleton.
-//
-// Captures poolJanitorInterval at construction time so the janitor
-// goroutine doesn't race with t.Cleanup-driven swaps of the package
-// var. See the janitorInterval field comment for the failure mode.
 func newEICTunnelPool() *eicTunnelPool {
 	return &eicTunnelPool{
-		entries:         map[string]*pooledTunnel{},
-		pendingSetups:   map[string]chan struct{}{},
-		stopJanitor:     make(chan struct{}),
-		janitorInterval: poolJanitorInterval,
+		entries:       map[string]*pooledTunnel{},
+		pendingSetups: map[string]chan struct{}{},
+		stopJanitor:   make(chan struct{}),
 	}
 }

@@ -307,11 +290,8 @@ func (p *eicTunnelPool) evictLRUIfFullLocked(skipInstance string) {
 // janitor periodically scans for entries that are idle AND expired,
 // closing their tunnels. Runs forever (per pool lifetime); cancelled
 // by close(p.stopJanitor) for tests that build short-lived pools.
-//
-// Reads p.janitorInterval (captured at construction) instead of the
-// package-level poolJanitorInterval — see janitorInterval field comment.
 func (p *eicTunnelPool) janitor() {
-	t := time.NewTicker(p.janitorInterval)
+	t := time.NewTicker(poolJanitorInterval)
 	defer t.Stop()
 	for {
 		select {
@@ -0,0 +1,136 @@
+package handlers
+
+// eic_tunnel_pool_setup.go — production setup shim.
+//
+// setupRealEICTunnel decomposes the existing realWithEICTunnel into
+// its slow half (build the tunnel) and its caller half (run fn). The
+// pool calls the slow half once and shares the resulting session
+// across N callers, holding cleanup until the last release.
+//
+// Why decompose instead of refactoring realWithEICTunnel: the
+// existing function and its test stub-vars (withEICTunnel,
+// sendSSHPublicKey, openTunnelCmd) are load-bearing for the
+// dispatch tests. Extracting a sibling setup function preserves the
+// existing single-shot path verbatim — the pool wraps it by calling
+// realWithEICTunnel through a thin adapter, leaving the tested
+// surface unchanged.
+//
+// The pool's acquire() invokes poolSetupTunnel, which is a `var`
+// pointing to setupRealEICTunnel for production and a counting stub
+// for tests.
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+	"time"
+)
+
+// setupRealEICTunnel is the slow path that the pool consumes when
+// no warm entry exists. Mirrors realWithEICTunnel's setup half but
+// returns the session + cleanup instead of running fn inline.
+//
+// The cleanup func owns the tunnel subprocess, ephemeral key dir,
+// and a one-time wait. A repeated sequential call is a no-op, which
+// backs up the pool's one-call-per-session guarantee; the guard is a
+// plain bool, though, so concurrent calls are NOT safe.
+func setupRealEICTunnel(ctx context.Context, instanceID string) (
+	eicSSHSession, func(), error) {
+
+	if instanceID == "" {
+		return eicSSHSession{}, nil,
+			fmt.Errorf("workspace has no instance_id — not a SaaS EC2 workspace")
+	}
+	osUser := os.Getenv("WORKSPACE_EC2_OS_USER")
+	if osUser == "" {
+		osUser = "ubuntu"
+	}
+	region := os.Getenv("AWS_REGION")
+	if region == "" {
+		region = "us-east-2"
+	}
+
+	keyDir, err := os.MkdirTemp("", "molecule-eic-pool-*")
+	if err != nil {
+		return eicSSHSession{}, nil, fmt.Errorf("keydir mkdir: %w", err)
+	}
+	keyPath := keyDir + "/id"
+	if out, kerr := exec.CommandContext(ctx, "ssh-keygen",
+		"-t", "ed25519", "-f", keyPath, "-N", "", "-q",
+		"-C", "molecule-eic-pool",
+	).CombinedOutput(); kerr != nil {
+		_ = os.RemoveAll(keyDir)
+		return eicSSHSession{}, nil,
+			fmt.Errorf("ssh-keygen: %w (%s)", kerr, strings.TrimSpace(string(out)))
+	}
+	pubKey, err := os.ReadFile(keyPath + ".pub")
+	if err != nil {
+		_ = os.RemoveAll(keyDir)
+		return eicSSHSession{}, nil, fmt.Errorf("read pubkey: %w", err)
+	}
+
+	if err := sendSSHPublicKey(ctx, region, instanceID, osUser,
+		strings.TrimSpace(string(pubKey))); err != nil {
+		_ = os.RemoveAll(keyDir)
+		return eicSSHSession{}, nil, fmt.Errorf("send-ssh-public-key: %w", err)
+	}
+
+	localPort, err := pickFreePort()
+	if err != nil {
+		_ = os.RemoveAll(keyDir)
+		return eicSSHSession{}, nil, fmt.Errorf("pick free port: %w", err)
+	}
+
+	tunnel := openTunnelCmd(eicSSHOptions{
+		InstanceID:     instanceID,
+		OSUser:         osUser,
+		Region:         region,
+		LocalPort:      localPort,
+		PrivateKeyPath: keyPath,
+	})
+	tunnel.Env = os.Environ()
+	if err := tunnel.Start(); err != nil {
+		_ = os.RemoveAll(keyDir)
+		return eicSSHSession{}, nil, fmt.Errorf("open-tunnel start: %w", err)
+	}
+
+	if err := waitForPort(ctx, "127.0.0.1", localPort, 10*time.Second); err != nil {
+		if tunnel.Process != nil {
+			_ = tunnel.Process.Kill()
+		}
+		_ = tunnel.Wait()
+		_ = os.RemoveAll(keyDir)
+		return eicSSHSession{}, nil, fmt.Errorf("tunnel never listened: %w", err)
+	}
+
+	cleanedUp := false
+	cleanup := func() {
+		if cleanedUp {
+			return
+		}
+		cleanedUp = true
+		if tunnel.Process != nil {
+			_ = tunnel.Process.Kill()
+		}
+		_ = tunnel.Wait()
+		_ = os.RemoveAll(keyDir)
+	}
+
+	return eicSSHSession{
+		keyPath:    keyPath,
+		localPort:  localPort,
+		osUser:     osUser,
+		instanceID: instanceID,
+	}, cleanup, nil
+}
+
+// init wires the pool into the package-level withEICTunnel var so
+// every read/write/list/delete EIC op uses pooled tunnels by default.
+// Test files that need single-shot behaviour can swap withEICTunnel
+// back via the existing stubWithEICTunnel pattern, OR set poolTTL=0
+// to disable pooling without rebinding the var.
+func init() {
+	initEICTunnelPool()
+}
@@ -1,223 +0,0 @@
-package handlers
-
-// mock_runtime.go — "mock" runtime: a virtual workspace that has no
-// container, no EC2, no LLM, just hardcoded canned A2A replies. Built
-// for the funding-demo "200-workspace mock org" so hongming can show
-// investors a CEO/VPs/Managers/ICs hierarchy at scale without burning
-// 200 EC2 instances or 200 Anthropic keys.
-//
-// Wire model:
-//   - org template declares `runtime: mock` on every workspace
-//   - createWorkspaceTree skips provisioning, sets status='online'
-//     directly (mirrors the `external` short-circuit, minus the URL +
-//     awaiting_agent dance)
-//   - proxyA2ARequest short-circuits on a mock-runtime target and
-//     returns a canned JSON-RPC reply; never calls resolveAgentURL,
-//     never opens an HTTP connection, never touches Docker/EC2
-//
-// The reply is JSON-RPC 2.0 + a2a-sdk v0.3 shape so the canvas's
-// extractAgentText / extractTextsFromParts read it without any
-// special-casing. We rotate over a small variant pool so a screen
-// full of replies doesn't all read identical — gives the demo a bit
-// of life without pretending to be a real agent.
-
-import (
-	"context"
-	"crypto/sha1"
-	"database/sql"
-	"encoding/binary"
-	"encoding/json"
-	"errors"
-	"fmt"
-	"log"
-	"net/http"
-	"strings"
-	"time"
-
-	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
-	"github.com/gin-gonic/gin"
-	"github.com/google/uuid"
-)
-
-// MockRuntimeName is the canonical runtime string a workspace row
-// carries to opt into the canned-reply short-circuit. Kept as a const
-// so the proxy's runtime-check + the org-import skip-block reference
-// the same literal.
-const MockRuntimeName = "mock"
-
-// mockReplyVariants is the pool of canned strings the mock runtime
-// rotates through. Picked to read like a busy-but-short reply from a
-// real human in a hierarchy — a CEO would NOT respond with "On it!",
-// but for the demo every node is shown to be reachable, so we lean
-// into the variety. Variant selection is deterministic per
-// (workspaceID, request-id) pair so a screen recording replays the
-// same reply for the same input.
-var mockReplyVariants = []string{
-	"On it!",
-	"Got it, on it now.",
-	"On it, boss.",
-	"Working on it.",
-	"Acknowledged — on it.",
-	"On it, will report back.",
-	"Roger that, on it.",
-	"Copy that. On it.",
-	"On it — ETA shortly.",
-	"On it. Standby for update.",
-}
-
-// pickMockReply returns a canned reply for the given workspaceID +
-// requestID. Deterministic so the same (workspace, message-id) pair
-// always picks the same variant — useful for screen recordings and
-// flake-free e2e snapshots. Falls back to variant[0] if the inputs
-// are empty.
-func pickMockReply(workspaceID, requestID string) string {
-	if len(mockReplyVariants) == 0 {
-		return "On it!"
-	}
-	if workspaceID == "" && requestID == "" {
-		return mockReplyVariants[0]
-	}
-	h := sha1.Sum([]byte(workspaceID + ":" + requestID))
-	idx := int(binary.BigEndian.Uint32(h[0:4]) % uint32(len(mockReplyVariants)))
-	return mockReplyVariants[idx]
-}
-
-// lookupRuntime returns the workspace's runtime string. Empty when the
-// row is missing / DB hiccup so callers fall through to the existing
-// dispatch path (which will then 404 / 502 normally). Fail-open here
-// because a transient DB error must not silently flip a real workspace
-// into mock-mode and start handing out canned replies in place of
-// genuine agent traffic.
-func lookupRuntime(ctx context.Context, workspaceID string) string {
-	var runtime sql.NullString
-	err := db.DB.QueryRowContext(ctx,
-		`SELECT runtime FROM workspaces WHERE id = $1`, workspaceID,
-	).Scan(&runtime)
-	if err != nil {
-		if !errors.Is(err, sql.ErrNoRows) {
-			log.Printf("ProxyA2A: lookupRuntime(%s) failed (%v) — falling through to dispatch path", workspaceID, err)
-		}
-		return ""
-	}
-	if !runtime.Valid {
-		return ""
-	}
-	return runtime.String
-}
-
-// buildMockA2AResponse synthesises a JSON-RPC 2.0 success envelope that
-// matches the a2a-sdk v0.3 reply shape the canvas's extractAgentText
-// already understands: `{result: {parts: [{kind: "text", text: ...}]}}`.
-// `requestID` is the JSON-RPC `id` of the inbound request — A2A
-// implementations echo it on the reply so callers can correlate. We
-// extract it from the normalized payload in the caller and pass it in
-// here so this function stays JSON-only (no payload parsing).
-//
-// Returns marshalled bytes ready to write straight to the HTTP body.
-// Marshal failure is logged + a tiny fallback envelope returned, since
-// failing the whole request because of a JSON encoding hiccup on a
-// constant-shaped payload would defeat the "mock always works" guarantee.
-func buildMockA2AResponse(workspaceID, requestID, replyText string) []byte {
-	if requestID == "" {
-		requestID = uuid.New().String()
-	}
-	envelope := map[string]any{
-		"jsonrpc": "2.0",
-		"id":      requestID,
-		"result": map[string]any{
-			"parts": []map[string]any{
-				{"kind": "text", "text": replyText},
-			},
-		},
-	}
-	out, err := json.Marshal(envelope)
-	if err != nil {
-		log.Printf("ProxyA2A: mock-runtime response marshal failed for %s: %v — emitting fallback", workspaceID, err)
-		// Hand-rolled minimal envelope. Safe because every value is a
-		// hardcoded constant string with no characters that need
-		// escaping in a JSON string literal.
-		fallback := fmt.Sprintf(
-			`{"jsonrpc":"2.0","id":%q,"result":{"parts":[{"kind":"text","text":%q}]}}`,
-			requestID, replyText,
-		)
-		return []byte(fallback)
-	}
-	return out
-}
-
-// extractRequestID pulls the JSON-RPC `id` out of an already-normalized
-// A2A payload. Returns "" when the field is absent or not a string —
-// caller substitutes a fresh UUID. Tolerant of every shape
-// normalizeA2APayload could produce.
-func extractRequestID(body []byte) string {
-	var top map[string]json.RawMessage
-	if err := json.Unmarshal(body, &top); err != nil {
-		return ""
-	}
-	raw, ok := top["id"]
-	if !ok {
-		return ""
-	}
-	var s string
-	if json.Unmarshal(raw, &s) == nil {
-		return s
-	}
-	// JSON-RPC permits numeric IDs too; canvas issues UUIDs but be
-	// defensive against alternative SDKs.
-	var n json.Number
-	if json.Unmarshal(raw, &n) == nil {
-		return n.String()
-	}
-	return ""
-}
-
-// handleMockA2A is the proxy short-circuit for mock-runtime workspaces.
-// Returns (status, body, true) when the target is mock — caller writes
-// the response and returns. Returns (_, _, false) when the target is
-// not mock — caller continues to the real dispatch path.
-//
-// Side-effects: writes a synthetic activity_logs row via logA2ASuccess
-// when logActivity is true so the canvas's "Agent Comms" tab shows the
-// mock reply in the trace alongside real-agent traffic. Without this
-// the demo would render messages on the canvas chat panel but a peer
-// node clicking through to its activity tab would see an empty list.
-func (h *WorkspaceHandler) handleMockA2A(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string, logActivity bool) (int, []byte, bool) {
-	if lookupRuntime(ctx, workspaceID) != MockRuntimeName {
-		return 0, nil, false
-	}
-	requestID := extractRequestID(body)
-	replyText := pickMockReply(workspaceID, requestID)
-	respBody := buildMockA2AResponse(workspaceID, requestID, replyText)
-
-	// Tiny artificial delay so the canvas chat UI has time to render
-	// the user's outgoing bubble before the agent reply appears.
-	// Without it the reply lands the same animation frame and feels
-	// robotic. 80ms is too fast to look "real" but masks the React
-	// double-render race that drops the user bubble entirely on slow
-	// machines (observed locally on M1 Air, 2026-05-07). Below 200ms
-	// keeps a 200-node demo snappy when investors fan out 30 messages
-	// at once.
-	time.Sleep(80 * time.Millisecond)
-
-	if logActivity {
-		// Reuse the existing success-logger so the activity feed shape
-		// is identical to a real agent reply. Status 200 + duration 0
-		// is the "synthesised reply" marker; activity_logs.duration_ms
-		// being 0 is harmless (real fast paths can hit 0 too).
-		h.logA2ASuccess(ctx, workspaceID, callerID, body, respBody, a2aMethod, http.StatusOK, 0)
-	}
-	return http.StatusOK, respBody, true
-}
-
-// IsMockRuntime is a small public helper for callers outside this
-// package (tests, the org importer) that need to ask the question
-// without depending on the unexported constant. Trims + lower-cases
-// so a typoed YAML cell like "  Mock " still resolves correctly.
-func IsMockRuntime(runtime string) bool {
-	return strings.EqualFold(strings.TrimSpace(runtime), MockRuntimeName)
-}
-
-// gin import is unused at file scope but kept as a tag so a future
-// addition of a thin HTTP handler (e.g. POST /workspaces/:id/mock/replies
-// for an admin-set custom reply pool) doesn't need an import re-order.
-var _ = gin.H{}
@@ -1,266 +0,0 @@
-package handlers
-
-// mock_runtime_test.go — locks the contract for the mock-runtime
-// short-circuit added for the funding-demo "200-workspace mock org"
-// template. Three invariants:
-//
-//   1. ProxyA2A on a workspace with runtime='mock' must return 200
-//      with a JSON-RPC reply containing one text part. NO HTTP
-//      dispatch, NO resolveAgentURL DB read (mock workspaces have
-//      no URL — that read would 404 and break the demo).
-//
-//   2. The reply text must be one of the canned variants and must be
-//      deterministic for a given (workspace_id, request_id) pair so
-//      screen recordings replay identically.
-//
-//   3. Workspaces with runtime != 'mock' must NOT be affected — the
-//      mock check fails fast and falls through to the existing
-//      dispatch path. Same kind of regression guard the poll-mode
-//      tests carry.
-
-import (
-	"bytes"
-	"encoding/json"
-	"net/http"
-	"net/http/httptest"
-	"testing"
-	"time"
-
-	"github.com/DATA-DOG/go-sqlmock"
-	"github.com/gin-gonic/gin"
-)
-
-// TestProxyA2A_MockRuntime_ReturnsCannedReply is the happy-path
-// contract. A workspace flagged runtime='mock' must:
-//   - return 200 with JSON-RPC envelope {result:{parts:[{kind:text,text:...}]}}
-//   - not dispatch HTTP (no SELECT url SQL expected)
-//   - reply text is one of mockReplyVariants
-func TestProxyA2A_MockRuntime_ReturnsCannedReply(t *testing.T) {
-	mock := setupTestDB(t)
-	setupTestRedis(t)
-	broadcaster := newTestBroadcaster()
-	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
-
-	const wsID = "ws-mock-canned"
-
-	// Budget check fires before runtime lookup (same as the poll-mode
-	// short-circuit) — keeps mock workspaces honest if a tenant ever
-	// sets a budget on one. Unlikely on a demo, but the guard stays
-	// uniform so future "monthly_spend on mock = 0" assertions don't
-	// drift.
-	expectBudgetCheck(mock, wsID)
-
-	// lookupDeliveryMode runs first — return push so the poll
-	// short-circuit doesn't fire and we hit the mock check.
-	mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id").
-		WithArgs(wsID).
-		WillReturnRows(sqlmock.NewRows([]string{"delivery_mode"}).AddRow("push"))
-
-	// lookupRuntime SELECT — returns 'mock', triggering the canned-reply
-	// short-circuit. CRITICAL: NO ExpectQuery for `SELECT url, status
-	// FROM workspaces` (resolveAgentURL's query). If the short-circuit
-	// fails to fire, sqlmock will surface "unexpected query" on the URL
-	// SELECT and the test fails loudly — that's the dispatch-leak detector.
-	mock.ExpectQuery("SELECT runtime FROM workspaces WHERE id").
-		WithArgs(wsID).
-		WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("mock"))
-
-	// Activity log: logA2ASuccess writes the synthetic reply to
-	// activity_logs so the canvas's Agent Comms tab shows it alongside
-	// real-agent traffic.
-	mock.ExpectExec("INSERT INTO activity_logs").
-		WillReturnResult(sqlmock.NewResult(0, 1))
-
-	w := httptest.NewRecorder()
-	c, _ := gin.CreateTestContext(w)
-	c.Params = gin.Params{{Key: "id", Value: wsID}}
-
-	body := `{"jsonrpc":"2.0","id":"req-mock-1","method":"message/send","params":{"message":{"role":"user","parts":[{"kind":"text","text":"hello mock"}]}}}`
-	c.Request = httptest.NewRequest("POST", "/workspaces/"+wsID+"/a2a", bytes.NewBufferString(body))
-	c.Request.Header.Set("Content-Type", "application/json")
-
-	handler.ProxyA2A(c)
-
-	// logA2ASuccess fires async — give it a moment to settle so
-	// ExpectationsWereMet doesn't flake.
-	time.Sleep(200 * time.Millisecond)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-	var resp map[string]interface{}
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("response is not valid JSON: %v", err)
-	}
-	if resp["jsonrpc"] != "2.0" {
-		t.Errorf("response.jsonrpc = %v, want 2.0", resp["jsonrpc"])
-	}
-	if resp["id"] != "req-mock-1" {
-		t.Errorf("response.id = %v, want %q (echoed from request)", resp["id"], "req-mock-1")
-	}
-	result, _ := resp["result"].(map[string]interface{})
-	if result == nil {
-		t.Fatalf("response.result missing or wrong type: %v", resp["result"])
-	}
-	parts, _ := result["parts"].([]interface{})
-	if len(parts) != 1 {
-		t.Fatalf("expected exactly one part, got %d: %v", len(parts), parts)
-	}
-	part, _ := parts[0].(map[string]interface{})
-	if part["kind"] != "text" {
-		t.Errorf("part.kind = %v, want text", part["kind"])
-	}
-	text, _ := part["text"].(string)
-	if text == "" {
-		t.Error("part.text is empty — canned reply not populated")
-	}
-	// Reply must be one of the variants.
-	matched := false
-	for _, v := range mockReplyVariants {
-		if v == text {
-			matched = true
-			break
-		}
-	}
-	if !matched {
-		t.Errorf("reply text %q is not in mockReplyVariants", text)
-	}
-
-	if err := mock.ExpectationsWereMet(); err != nil {
-		t.Errorf("unmet sqlmock expectations: %v", err)
-	}
-}
-
-// TestProxyA2A_NonMockRuntime_NoShortCircuit verifies the symmetric
-// contract: a workspace with a real runtime (claude-code, hermes, etc.)
-// must NOT be affected by the mock check — it falls through to the
-// real dispatch path. Without this guard, a regression in
-// lookupRuntime could silently flip every workspace into mock-mode
-// and start handing out canned replies in place of real-agent traffic.
-func TestProxyA2A_NonMockRuntime_NoShortCircuit(t *testing.T) {
-	mock := setupTestDB(t)
-	mr := setupTestRedis(t)
-	allowLoopbackForTest(t)
-	broadcaster := newTestBroadcaster()
-	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
-
-	const wsID = "ws-real-runtime"
-
-	dispatched := false
-	agentServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		dispatched = true
-		w.Header().Set("Content-Type", "application/json")
-		w.Write([]byte(`{"jsonrpc":"2.0","id":"1","result":{"status":"ok"}}`))
-	}))
-	defer agentServer.Close()
-	mr.Set("ws:"+wsID+":url", agentServer.URL)
-
-	expectBudgetCheck(mock, wsID)
-
-	// poll-mode SELECT — return push so we proceed past the poll
-	// short-circuit.
-	mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id").
-		WithArgs(wsID).
-		WillReturnRows(sqlmock.NewRows([]string{"delivery_mode"}).AddRow("push"))
-
-	// runtime SELECT — return claude-code so the mock check falls
-	// through.
-	mock.ExpectQuery("SELECT runtime FROM workspaces WHERE id").
-		WithArgs(wsID).
-		WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("claude-code"))
-
-	mock.ExpectExec("INSERT INTO activity_logs").
-		WillReturnResult(sqlmock.NewResult(0, 1))
-
-	w := httptest.NewRecorder()
-	c, _ := gin.CreateTestContext(w)
-	c.Params = gin.Params{{Key: "id", Value: wsID}}
-	body := `{"jsonrpc":"2.0","id":"real-1","method":"message/send","params":{"message":{"role":"user","parts":[{"kind":"text","text":"hi"}]}}}`
-	c.Request = httptest.NewRequest("POST", "/workspaces/"+wsID+"/a2a", bytes.NewBufferString(body))
-	c.Request.Header.Set("Content-Type", "application/json")
-
-	handler.ProxyA2A(c)
-
-	time.Sleep(50 * time.Millisecond)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-	if !dispatched {
-		t.Error("non-mock runtime: expected the agent server to receive the request, but it did not — mock short-circuit may be over-firing")
-	}
-	if err := mock.ExpectationsWereMet(); err != nil {
-		t.Errorf("unmet sqlmock expectations: %v", err)
-	}
-}
-
-// TestPickMockReply_Deterministic locks the determinism contract:
-// the same (workspaceID, requestID) input must yield the same variant
-// every call. Required for screen recordings + flake-free e2e
-// snapshots.
-func TestPickMockReply_Deterministic(t *testing.T) {
-	cases := []struct {
-		ws, req string
-	}{
-		{"ws-1", "req-A"},
-		{"ws-1", "req-B"},
-		{"ws-2", "req-A"},
-		{"", ""},
-	}
-	for _, tc := range cases {
-		first := pickMockReply(tc.ws, tc.req)
-		for i := 0; i < 10; i++ {
-			next := pickMockReply(tc.ws, tc.req)
-			if next != first {
-				t.Errorf("pickMockReply(%q,%q) is not deterministic: got %q then %q",
-					tc.ws, tc.req, first, next)
-			}
-		}
-	}
-}
-
-// TestIsMockRuntime_TrimsAndCaseInsensitive — typos and stray
-// whitespace in YAML must still resolve to mock so a single
-// runtime: " Mock " entry doesn't silently get dispatched.
-func TestIsMockRuntime_TrimsAndCaseInsensitive(t *testing.T) {
-	cases := map[string]bool{
-		"mock":      true,
-		"MOCK":      true,
-		"  Mock  ":  true,
-		"mocky":     false,
-		"":          false,
-		"external":  false,
-		"claude-code": false,
-	}
-	for in, want := range cases {
-		if got := IsMockRuntime(in); got != want {
-			t.Errorf("IsMockRuntime(%q) = %v, want %v", in, got, want)
-		}
-	}
-}
-
-// TestBuildMockA2AResponse_EchoesRequestID — JSON-RPC requires the
-// reply id to match the request id so callers can correlate. Mock
-// must hold this contract or canvas's correlation logic breaks.
-func TestBuildMockA2AResponse_EchoesRequestID(t *testing.T) {
-	out := buildMockA2AResponse("ws-x", "req-echo-7", "On it!")
-	var resp map[string]interface{}
-	if err := json.Unmarshal(out, &resp); err != nil {
-		t.Fatalf("response is not valid JSON: %v", err)
-	}
-	if resp["id"] != "req-echo-7" {
-		t.Errorf("id = %v, want req-echo-7", resp["id"])
-	}
-	if resp["jsonrpc"] != "2.0" {
-		t.Errorf("jsonrpc = %v, want 2.0", resp["jsonrpc"])
-	}
-	result, _ := resp["result"].(map[string]interface{})
-	parts, _ := result["parts"].([]interface{})
-	if len(parts) != 1 {
-		t.Fatalf("expected 1 part, got %d", len(parts))
-	}
-	p, _ := parts[0].(map[string]interface{})
-	if p["text"] != "On it!" {
-		t.Errorf("part.text = %v, want On it!", p["text"])
-	}
-}
@@ -250,21 +250,6 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
 		h.broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceOnline), id, map[string]interface{}{
 			"name": ws.Name, "external": true,
 		})
-	} else if IsMockRuntime(runtime) {
-		// Mock-runtime workspaces have no container, no EC2, no URL —
-		// the proxyA2ARequest short-circuit synthesises every reply
-		// from a canned variant pool (see mock_runtime.go). Status
-		// goes straight to 'online' so the canvas renders the node
-		// as reachable + the chat tab's send button is enabled. No
-		// URL is set; the proxy never tries to resolve one for mock
-		// runtimes. Built for the funding-demo "200-workspace mock
-		// org" template — visual scale without real backend cost.
-		if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = $1 WHERE id = $2`, models.StatusOnline, id); err != nil {
-			log.Printf("Org import: mock workspace status update failed for %s: %v", ws.Name, err)
-		}
-		h.broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceOnline), id, map[string]interface{}{
-			"name": ws.Name, "mock": true, "runtime": runtime,
-		})
 	} else if h.workspace.HasProvisioner() {
 		// Provision container — either backend (CP for SaaS, local Docker
 		// for self-hosted) is fine. Pre-2026-05-05 this gate was
@@ -690,23 +675,7 @@ func (h *OrgHandler) recurseChildrenForImport(ws OrgWorkspace, parentID string,
 		if err := h.createWorkspaceTree(child, &parentID, childAbsX, childAbsY, slotX, slotY, defaults, orgBaseDir, results, provisionSem); err != nil {
 			return err
 		}
-		// Pacing exists to throttle Docker container-spawn thundering
-		// during a self-hosted import. Mock-runtime children spawn no
-		// container — no Docker pressure, no LLM bursts, just DB
-		// inserts + a broadcast. Skipping the 2s sleep collapses a
-		// 200-workspace mock-org import from ~7min → ~5s, which is
-		// the difference between a snappy demo and a "did it freeze?"
-		// staring contest. Real (containerful) runtimes still pace.
-		// Inheritance: if the child itself doesn't declare a runtime,
-		// fall back to defaults.runtime — the org template sets
-		// runtime: mock once at the org level, not on every IC node.
-		childRuntime := child.Runtime
-		if childRuntime == "" {
-			childRuntime = defaults.Runtime
-		}
-		if !IsMockRuntime(childRuntime) {
-			time.Sleep(workspaceCreatePacingMs * time.Millisecond)
-		}
+		time.Sleep(workspaceCreatePacingMs * time.Millisecond)
 	}
 	return nil
 }
@@ -4,7 +4,6 @@ import (
 	"bytes"
 	"context"
 	"io"
-	"log"
 	"os"
 	"path/filepath"
 	"strings"
@@ -178,42 +177,16 @@ func strDefault(m map[string]interface{}, key, fallback string) string {
 	return fallback
 }

-// findRunningContainer returns the live container name for workspaceID, or ""
-// when the container is genuinely not running OR the daemon errored
-// transiently. Routed through provisioner.RunningContainerName as the SSOT
-// (molecule-core#10) so this handler agrees with healthsweep on the same
-// inputs. Transient daemon errors are logged distinctly so triage doesn't
-// confuse a flaky daemon with a stopped container.
 func (h *PluginsHandler) findRunningContainer(ctx context.Context, workspaceID string) string {
-	name, err := provisioner.RunningContainerName(ctx, h.docker, workspaceID)
-	if err != nil {
-		log.Printf("plugins: docker inspect transient error for %s: %v (treating as not-running for this request)", workspaceID, err)
+	if h.docker == nil { // no Docker client configured, so there is nothing to inspect
 		return ""
 	}
-	return name
-}
-
-// isExternalRuntime reports whether the workspace's runtime is the
-// `external` (remote-pull) shape introduced in Phase 30. External
-// workspaces have no local container — `POST /plugins` (push-install via
-// docker exec) doesn't apply to them; they pull via the download endpoint
-// instead. Returns false (allow-install) if the lookup is unwired or
-// errors — failing open here is safe because the downstream
-// findRunningContainer step still gates on a real container being there.
-//
-// Background — molecule-core#10: without this check, external workspaces
-// fall through to findRunningContainer's NotFound path and return a
-// misleading 503 "container not running" instead of a clear "use the
-// pull endpoint" message.
-func (h *PluginsHandler) isExternalRuntime(workspaceID string) bool {
-	if h.runtimeLookup == nil {
-		return false
+	name := provisioner.ContainerName(workspaceID)
+	info, err := h.docker.ContainerInspect(ctx, name)
+	if err == nil && info.State.Running { // an inspect error is handled the same as a stopped container
+		return name
 	}
-	runtime, err := h.runtimeLookup(workspaceID)
-	if err != nil {
-		return false
-	}
-	return runtime == "external"
+	return "" // not running, or the inspect call failed; callers cannot tell which
 }

 func (h *PluginsHandler) execAsRoot(ctx context.Context, containerName string, cmd []string) (string, error) {
@@ -1,176 +0,0 @@
-package handlers
-
-import (
-	"go/ast"
-	"go/parser"
-	"go/token"
-	"strings"
-	"testing"
-)
-
-// TestFindRunningContainer_RoutesThroughProvisionerSSOT is a behavior-based
-// AST gate: it pins the invariant that PluginsHandler.findRunningContainer
-// MUST go through provisioner.RunningContainerName for its is-running check,
-// instead of carrying its own copy of cli.ContainerInspect logic.
-//
-// Background — molecule-core#10: a parallel impl of "is the workspace's
-// container running" used to live in plugins.go. It drifted from the
-// canonical impl in healthsweep (which goes through Provisioner.IsRunning
-// → RunningContainerName) on edge cases like "transient daemon error" —
-// the duplicate would 503 with a misleading message while healthsweep
-// correctly stayed defensive. Consolidating onto RunningContainerName as
-// the SSOT prevents any future copy from re-introducing that drift.
-//
-// Mutation invariant: if a future PR replaces the provisioner call with
-// `h.docker.ContainerInspect(...)` directly, this test fails. That's the
-// signal to either (a) extend RunningContainerName's contract OR (b)
-// document why this call site needs to differ. Either way: the drift
-// gets a reviewer's attention instead of shipping silently.
-func TestFindRunningContainer_RoutesThroughProvisionerSSOT(t *testing.T) {
-	fset := token.NewFileSet()
-	file, err := parser.ParseFile(fset, "plugins.go", nil, parser.ParseComments)
-	if err != nil {
-		t.Fatalf("parse plugins.go: %v", err)
-	}
-
-	var fn *ast.FuncDecl
-	ast.Inspect(file, func(n ast.Node) bool {
-		f, ok := n.(*ast.FuncDecl)
-		if !ok || f.Name.Name != "findRunningContainer" {
-			return true
-		}
-		// Confirm receiver is *PluginsHandler so we don't pick up an unrelated
-		// helper of the same name. ast.Recv is a FieldList — receivers carry
-		// at most one field.
-		if f.Recv == nil || len(f.Recv.List) == 0 {
-			return true
-		}
-		fn = f
-		return false
-	})
-
-	if fn == nil {
-		t.Fatal("findRunningContainer not found in plugins.go — was it renamed? update this test or the SSOT routing assumption")
-	}
-
-	var (
-		callsRunningContainerName bool
-		callsContainerInspectRaw  bool
-	)
-	ast.Inspect(fn.Body, func(n ast.Node) bool {
-		call, ok := n.(*ast.CallExpr)
-		if !ok {
-			return true
-		}
-		sel, ok := call.Fun.(*ast.SelectorExpr)
-		if !ok {
-			return true
-		}
-		// Pkg.Func form: provisioner.RunningContainerName(...)
-		if pkgIdent, ok := sel.X.(*ast.Ident); ok {
-			if pkgIdent.Name == "provisioner" && sel.Sel.Name == "RunningContainerName" {
-				callsRunningContainerName = true
-			}
-		}
-		// Receiver-then-method form: h.docker.ContainerInspect(...) /
-		// p.cli.ContainerInspect(...) — anything ending in
-		// .ContainerInspect that's NOT routed through provisioner.
-		if sel.Sel.Name == "ContainerInspect" {
-			callsContainerInspectRaw = true
-		}
-		return true
-	})
-
-	if !callsRunningContainerName {
-		t.Errorf(
-			"findRunningContainer must call provisioner.RunningContainerName for the SSOT inspect — see molecule-core#10. Found no such call.",
-		)
-	}
-	if callsContainerInspectRaw {
-		t.Errorf(
-			"findRunningContainer carries a direct ContainerInspect call. This is the parallel-impl drift molecule-core#10 fixed. " +
-				"Either route through provisioner.RunningContainerName OR — if a new use case truly needs a different inspect — extend RunningContainerName's contract first and update this gate to allow the specific delta.",
-		)
-	}
-}
-
-// TestProvisionerIsRunning_RoutesThroughRunningContainerName mirrors the
-// gate above but for the OTHER consumer of the SSOT — Provisioner.IsRunning
-// (called by healthsweep). If a future refactor makes IsRunning carry its
-// own ContainerInspect again, the two consumers' edge-case behaviors will
-// silently drift. Keep them yoked.
-func TestProvisionerIsRunning_RoutesThroughRunningContainerName(t *testing.T) {
-	fset := token.NewFileSet()
-	file, err := parser.ParseFile(fset, "../provisioner/provisioner.go", nil, parser.ParseComments)
-	if err != nil {
-		t.Fatalf("parse provisioner.go: %v", err)
-	}
-
-	var fn *ast.FuncDecl
-	ast.Inspect(file, func(n ast.Node) bool {
-		f, ok := n.(*ast.FuncDecl)
-		if !ok || f.Name.Name != "IsRunning" || f.Recv == nil {
-			return true
-		}
-		// The receiver type must be *Provisioner specifically. CPProvisioner
-		// has its own IsRunning that talks HTTP to the controlplane and is
-		// out of scope for this gate.
-		if !receiverIs(f, "Provisioner") {
-			return true
-		}
-		fn = f
-		return false
-	})
-	if fn == nil {
-		t.Fatal("Provisioner.IsRunning not found — was it renamed? update this test")
-	}
-
-	var (
-		callsRunningContainerName bool
-		callsContainerInspectRaw  bool
-	)
-	ast.Inspect(fn.Body, func(n ast.Node) bool {
-		call, ok := n.(*ast.CallExpr)
-		if !ok {
-			return true
-		}
-		// Same-package call: bare identifier (e.g. RunningContainerName(...)).
-		if id, ok := call.Fun.(*ast.Ident); ok && id.Name == "RunningContainerName" {
-			callsRunningContainerName = true
-			return true
-		}
-		// Selector call: pkg.Func (e.g. provisioner.RunningContainerName)
-		// OR recv.Method (e.g. p.cli.ContainerInspect).
-		sel, ok := call.Fun.(*ast.SelectorExpr)
-		if !ok {
-			return true
-		}
-		switch sel.Sel.Name {
-		case "RunningContainerName":
-			callsRunningContainerName = true
-		case "ContainerInspect":
-			callsContainerInspectRaw = true
-		}
-		return true
-	})
-
-	if !callsRunningContainerName {
-		t.Errorf("Provisioner.IsRunning must call RunningContainerName for the SSOT inspect — see molecule-core#10")
-	}
-	if callsContainerInspectRaw {
-		t.Errorf("Provisioner.IsRunning carries a direct ContainerInspect call; route through RunningContainerName instead")
-	}
-}
-
-// receiverIs reports whether fn's receiver is `*<typeName>` or `<typeName>`.
-func receiverIs(fn *ast.FuncDecl, typeName string) bool {
-	if fn.Recv == nil || len(fn.Recv.List) == 0 {
-		return false
-	}
-	expr := fn.Recv.List[0].Type
-	if star, ok := expr.(*ast.StarExpr); ok {
-		expr = star.X
-	}
-	id, ok := expr.(*ast.Ident)
-	return ok && strings.EqualFold(id.Name, typeName)
-}
@@ -32,18 +32,6 @@ import (
 // inside the workspace at startup.
 func (h *PluginsHandler) Install(c *gin.Context) {
 	workspaceID := c.Param("id")
-	// External-runtime guard (molecule-core#10): push-install via docker
-	// exec is meaningless for `runtime='external'` workspaces — they have
-	// no local container. Reject early with a hint pointing at the
-	// pull-mode endpoint, instead of falling through to a misleading
-	// "container not running" 503 from findRunningContainer.
-	if h.isExternalRuntime(workspaceID) {
-		c.JSON(http.StatusUnprocessableEntity, gin.H{
-			"error": "plugin install via push is not supported for external runtimes",
-			"hint":  "external workspaces pull plugins via GET /workspaces/:id/plugins/:name/download",
-		})
-		return
-	}
 	// Cap the JSON body so a pathological POST can't exhaust parser memory.
 	bodyMax := envx.Int64("PLUGIN_INSTALL_BODY_MAX_BYTES", defaultInstallBodyMaxBytes)
 	c.Request.Body = http.MaxBytesReader(c.Writer, c.Request.Body, bodyMax)
@@ -105,16 +93,6 @@ func (h *PluginsHandler) Uninstall(c *gin.Context) {
 	pluginName := c.Param("name")
 	ctx := c.Request.Context()

-	// Mirror Install's external-runtime guard (molecule-core#10) so the
-	// two endpoints reject the same shape with the same message.
-	if h.isExternalRuntime(workspaceID) {
-		c.JSON(http.StatusUnprocessableEntity, gin.H{
-			"error": "plugin uninstall via docker exec is not supported for external runtimes",
-			"hint":  "external workspaces manage their own plugin directory; remove it locally",
-		})
-		return
-	}
-
 	if err := validatePluginName(pluginName); err != nil {
 		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid plugin name"})
 		return
@@ -1,176 +0,0 @@
-package handlers
-
-import (
-	"bytes"
-	"net/http"
-	"net/http/httptest"
-	"strings"
-	"testing"
-
-	"github.com/gin-gonic/gin"
-)
-
-// TestPluginInstall_ExternalRuntime_Returns422 — molecule-core#10.
-// Install on a `runtime='external'` workspace must NOT fall through to
-// findRunningContainer (which would 503 with a misleading "container not
-// running"). It must return 422 with a hint pointing at the pull-mode
-// download endpoint.
-func TestPluginInstall_ExternalRuntime_Returns422(t *testing.T) {
-	h := NewPluginsHandler(t.TempDir(), nil, nil).
-		WithRuntimeLookup(func(workspaceID string) (string, error) {
-			return "external", nil
-		})
-
-	w := httptest.NewRecorder()
-	c, _ := gin.CreateTestContext(w)
-	c.Params = gin.Params{{Key: "id", Value: "ba1789b0-4d21-4f4f-a878-fa226bf77cf5"}}
-	c.Request = httptest.NewRequest(
-		"POST",
-		"/workspaces/ba1789b0-4d21-4f4f-a878-fa226bf77cf5/plugins",
-		bytes.NewBufferString(`{"source":"local://my-plugin"}`),
-	)
-	c.Request.Header.Set("Content-Type", "application/json")
-
-	h.Install(c)
-
-	if w.Code != http.StatusUnprocessableEntity {
-		t.Errorf("expected 422 (Unprocessable Entity) for runtime='external', got %d: %s", w.Code, w.Body.String())
-	}
-	if !strings.Contains(w.Body.String(), "external runtimes") {
-		t.Errorf("expected error body to mention 'external runtimes', got: %s", w.Body.String())
-	}
-	if !strings.Contains(w.Body.String(), "download") {
-		t.Errorf("expected error body to point at the download endpoint, got: %s", w.Body.String())
-	}
-}
-
-// TestPluginUninstall_ExternalRuntime_Returns422 — symmetric guard on the
-// uninstall path (DELETE /workspaces/:id/plugins/:name). External
-// workspaces manage their own plugin directory locally; the platform
-// can't docker-exec into them.
-func TestPluginUninstall_ExternalRuntime_Returns422(t *testing.T) {
-	h := NewPluginsHandler(t.TempDir(), nil, nil).
-		WithRuntimeLookup(func(workspaceID string) (string, error) {
-			return "external", nil
-		})
-
-	w := httptest.NewRecorder()
-	c, _ := gin.CreateTestContext(w)
-	c.Params = gin.Params{
-		{Key: "id", Value: "ba1789b0-4d21-4f4f-a878-fa226bf77cf5"},
-		{Key: "name", Value: "my-plugin"},
-	}
-	c.Request = httptest.NewRequest(
-		"DELETE",
-		"/workspaces/ba1789b0-4d21-4f4f-a878-fa226bf77cf5/plugins/my-plugin",
-		nil,
-	)
-
-	h.Uninstall(c)
-
-	if w.Code != http.StatusUnprocessableEntity {
-		t.Errorf("expected 422 for runtime='external', got %d: %s", w.Code, w.Body.String())
-	}
-	if !strings.Contains(w.Body.String(), "external runtimes") {
-		t.Errorf("expected error body to mention 'external runtimes', got: %s", w.Body.String())
-	}
-}
-
-// TestPluginInstall_ContainerBackedRuntime_FallsThroughGuard — the runtime
-// guard MUST NOT short-circuit container-backed runtimes. With
-// `runtime='claude-code'` the install proceeds past the guard; without a
-// real plugin source it'll fail downstream (here: 404 from local resolver
-// because no plugin staged), which is the correct error to surface.
-//
-// This is the mutation-test partner: deleting the `runtime == "external"`
-// check would still pass TestPluginInstall_ExternalRuntime (because Install
-// would 404 instead of 422 — but the test asserts 422), and would still
-// pass this test (because both pre-fix and post-fix produce 404 here).
-// What this case pins is "non-external still falls through," catching
-// any over-eager guard that rejects all runtimes.
-func TestPluginInstall_ContainerBackedRuntime_FallsThroughGuard(t *testing.T) {
-	h := NewPluginsHandler(t.TempDir(), nil, nil).
-		WithRuntimeLookup(func(workspaceID string) (string, error) {
-			return "claude-code", nil
-		})
-
-	w := httptest.NewRecorder()
-	c, _ := gin.CreateTestContext(w)
-	c.Params = gin.Params{{Key: "id", Value: "c7c28c0b-4ea5-4e75-9728-3ba860081708"}}
-	c.Request = httptest.NewRequest(
-		"POST",
-		"/workspaces/c7c28c0b-4ea5-4e75-9728-3ba860081708/plugins",
-		bytes.NewBufferString(`{"source":"local://nonexistent-plugin"}`),
-	)
-	c.Request.Header.Set("Content-Type", "application/json")
-
-	h.Install(c)
-
-	if w.Code == http.StatusUnprocessableEntity {
-		t.Errorf("runtime='claude-code' must fall through the external guard; got 422: %s", w.Body.String())
-	}
-	// The local resolver will fail to find the plugin → 404. Anything
-	// other than 422 (which would mean we mis-classified) is fine.
-	if w.Code != http.StatusNotFound {
-		t.Errorf("expected 404 (plugin not found in registry), got %d: %s", w.Code, w.Body.String())
-	}
-}
-
-// TestPluginInstall_NoRuntimeLookup_FailsOpen — when the runtime lookup
-// is unwired (test fixtures, niche deploy shapes) the guard MUST default
-// to allowing the install attempt. The downstream findRunningContainer
-// step still gates on a real container, so failing open here doesn't
-// expose a bypass — it just preserves backwards-compat with deployments
-// that haven't wired the lookup.
-func TestPluginInstall_NoRuntimeLookup_FailsOpen(t *testing.T) {
-	h := NewPluginsHandler(t.TempDir(), nil, nil) // NO WithRuntimeLookup
-
-	w := httptest.NewRecorder()
-	c, _ := gin.CreateTestContext(w)
-	c.Params = gin.Params{{Key: "id", Value: "ws-no-lookup"}}
-	c.Request = httptest.NewRequest(
-		"POST",
-		"/workspaces/ws-no-lookup/plugins",
-		bytes.NewBufferString(`{"source":"local://nonexistent"}`),
-	)
-	c.Request.Header.Set("Content-Type", "application/json")
-
-	h.Install(c)
-
-	if w.Code == http.StatusUnprocessableEntity {
-		t.Errorf("nil runtimeLookup must fall through (fail-open); got 422: %s", w.Body.String())
-	}
-}
-
-// TestPluginInstall_RuntimeLookupErrors_FailsOpen — same fail-open story
-// for transient DB errors in the lookup. We don't want a momentary
-// Postgres hiccup to flip every plugin install into a 422.
-func TestPluginInstall_RuntimeLookupErrors_FailsOpen(t *testing.T) {
-	h := NewPluginsHandler(t.TempDir(), nil, nil).
-		WithRuntimeLookup(func(workspaceID string) (string, error) {
-			return "", errFakeDB
-		})
-
-	w := httptest.NewRecorder()
-	c, _ := gin.CreateTestContext(w)
-	c.Params = gin.Params{{Key: "id", Value: "ws-db-flake"}}
-	c.Request = httptest.NewRequest(
-		"POST",
-		"/workspaces/ws-db-flake/plugins",
-		bytes.NewBufferString(`{"source":"local://nonexistent"}`),
-	)
-	c.Request.Header.Set("Content-Type", "application/json")
-
-	h.Install(c)
-
-	if w.Code == http.StatusUnprocessableEntity {
-		t.Errorf("runtimeLookup error must fall through (fail-open); got 422: %s", w.Body.String())
-	}
-}
-
-// errFakeDB is a sentinel for the fail-open lookup-error case.
-var errFakeDB = &fakeError{msg: "synthetic db error"}
-
-type fakeError struct{ msg string }
-
-func (e *fakeError) Error() string { return e.msg }
@@ -78,10 +78,6 @@ var fallbackRuntimes = map[string]struct{}{
 	"openclaw":    {},
 	"codex":       {},
 	"external":    {},
-	// mock — virtual workspace with hardcoded canned A2A replies.
-	// No container, no EC2, no template repo. See mock_runtime.go
-	// for the full rationale (200-workspace funding-demo org).
-	"mock": {},
 }

 // loadRuntimesFromManifest builds the runtime allowlist from
@@ -108,10 +104,6 @@ func loadRuntimesFromManifest(path string) (map[string]struct{}, error) {
 		// the manifest doesn't know about it. Injected here so we
 		// don't need a special-case in every caller.
 		"external": {},
-		// mock is ALWAYS available for the same reason as external:
-		// virtual workspace, no template repo, never spawns a
-		// container. See mock_runtime.go.
-		"mock": {},
 	}
 	for _, e := range m.WorkspaceTemplates {
 		name := strings.TrimSpace(e.Name)
@@ -112,19 +112,6 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
 		return
 	}

-	// runtime=mock: virtual workspace with canned A2A replies. No
-	// container, no EC2, no provisioning state to recycle. Mirror
-	// the external no-op so the canvas's Restart button doesn't
-	// silently fail or leak through to the (template-less) provisioner.
-	if dbRuntime == "mock" {
-		c.JSON(http.StatusOK, gin.H{
-			"status":  "noop",
-			"runtime": "mock",
-			"message": "mock workspaces have no container — restart is a no-op",
-		})
-		return
-	}
-
 	// SaaS mode: cpProv handles workspace EC2 lifecycle. Self-hosted mode:
 	// provisioner handles local Docker containers. At least one must be
 	// available — previously only `provisioner` was checked, which broke
@@ -545,9 +532,7 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
 	}

 	// Don't auto-restart external workspaces (no Docker container)
-	// or mock workspaces (no container, every reply is canned —
-	// see workspace-server/internal/handlers/mock_runtime.go).
-	if dbRuntime == "external" || dbRuntime == "mock" {
+	if dbRuntime == "external" {
 		return
 	}

@@ -1,7 +1,6 @@
 package handlers

 import (
-	"runtime"
 	"sync"
 	"sync/atomic"
 	"testing"
@@ -16,42 +15,6 @@ func resetRestartStatesFor(workspaceID string) {
 	restartStates.Delete(workspaceID)
 }

-// drainCoalesceGoroutine spawns `coalesceRestart(wsID, cycle)` on a
-// goroutine that mirrors the real production caller shape
-// (`go h.RestartByID(...)` from a2a_proxy.go, a2a_proxy_helpers.go,
-// main.go), and registers a t.Cleanup that blocks until the goroutine
-// has TERMINATED — not just panicked-and-recovered, fully exited.
-//
-// This is the bleed-prevention contract for Class H (Task #170): no
-// test in this file may declare itself complete while a coalesceRestart
-// goroutine it spawned is still alive, because that goroutine could
-// otherwise wake up after the test's sqlmock has been closed and
-// either:
-//   - issue a stale INSERT that gets attributed to the next test's
-//     sqlmock connection — surfaces as
-//     "INSERT-not-expected for kind=DELEGATION_FAILED" / =WORKSPACE_PROVISION_FAILED
-//     in a neighbour test that doesn't itself touch coalesceRestart; or
-//   - hold a reference to the closed *sql.DB and panic on the next op.
-//
-// Implementation notes:
-//   - sync.WaitGroup must be Add()ed BEFORE the goroutine is spawned;
-//     Add inside the goroutine races with Wait.
-//   - t.Cleanup runs in LIFO order, so this composes safely with other
-//     cleanups (e.g. setupTestDB's mockDB.Close).
-//   - We don't bound the Wait with a timeout — if the goroutine
-//     genuinely deadlocks, the whole test process should hang and fail
-//     under -timeout. A timeout-then-orphan would mask the bleed.
-func drainCoalesceGoroutine(t *testing.T, wsID string, cycle func()) {
-	t.Helper()
-	var wg sync.WaitGroup
-	wg.Add(1)
-	go func() {
-		defer wg.Done()
-		coalesceRestart(wsID, cycle)
-	}()
-	t.Cleanup(wg.Wait)
-}
-
 // TestCoalesceRestart_SingleCallRunsOneCycle is the baseline:
 // no concurrency, one cycle. If this fails the gate logic is broken at
 // its simplest path.
@@ -237,45 +200,19 @@ func TestCoalesceRestart_PanicInCycleClearsState(t *testing.T) {
 	const wsID = "test-coalesce-panic-recovery"
 	resetRestartStatesFor(wsID)

-	// Spawn the panicking cycle on a goroutine via drainCoalesceGoroutine
-	// — this mirrors the real production callsite shape
-	// (`go h.RestartByID(...)` from a2a_proxy.go:584,
-	// a2a_proxy_helpers.go:197, main.go:213). The previous form called
-	// coalesceRestart synchronously, which neither exercised the
-	// goroutine-survival contract nor caught Class H bleed regressions
-	// where the panic-recovery goroutine outlives the test and pollutes
-	// the next test's sqlmock with INSERTs from runRestartCycle's
-	// LogActivity calls (kinds DELEGATION_FAILED / WORKSPACE_PROVISION_FAILED).
-	//
-	// drainCoalesceGoroutine registers a t.Cleanup that Wait()s for the
-	// goroutine to TERMINATE — not merely panic-and-recover — before
-	// the test ends.
-	drainCoalesceGoroutine(t, wsID, func() { panic("simulated cycle failure") })
-
-	// We need a mid-test barrier (not just the t.Cleanup-time barrier)
-	// so the second coalesceRestart below sees state.running=false. The
-	// goroutine clears state.running inside its deferred recover; poll
-	// the package-level restartStates map until that observable flip
-	// happens. Bound at 2s — longer = real bug.
-	deadline := time.Now().Add(2 * time.Second)
-	for time.Now().Before(deadline) {
-		sv, ok := restartStates.Load(wsID)
-		if ok {
-			st := sv.(*restartState)
-			st.mu.Lock()
-			running := st.running
-			st.mu.Unlock()
-			if !running {
-				break
-			}
+	// First call's cycle panics. coalesceRestart's defer must swallow
+	// the panic so this test caller doesn't see it propagate up — that
+	// matches what the real production caller (`go h.RestartByID(...)`)
+	// gets: the goroutine survives, no process crash.
+	defer func() {
+		if r := recover(); r != nil {
+			t.Errorf("panic should NOT propagate out of coalesceRestart (would crash the platform process from a goroutine), got: %v", r)
 		}
-		time.Sleep(time.Millisecond)
-	}
+	}()
+	coalesceRestart(wsID, func() { panic("simulated cycle failure") })

 	// Second call must run a fresh cycle. If running stayed true after
 	// the panic, this call would early-return without invoking cycle.
-	// Synchronous — no panic, so no goroutine to drain, and we want to
-	// assert ran.Load() immediately after.
 	var ran atomic.Bool
 	coalesceRestart(wsID, func() { ran.Store(true) })
 	if !ran.Load() {
@@ -283,98 +220,6 @@ func TestCoalesceRestart_PanicInCycleClearsState(t *testing.T) {
 	}
 }

-// TestCoalesceRestart_DrainHelperWaitsForGoroutineExit is the Class H
-// regression guard for Task #170. It asserts the contract enforced by
-// drainCoalesceGoroutine: t.Cleanup blocks until the spawned
-// coalesceRestart goroutine has FULLY EXITED — not merely recovered
-// from panic. This is the contract that prevents stale LogActivity
-// INSERTs from a recovering goroutine bleeding into the next test's
-// sqlmock (the failure mode reported as "INSERT-not-expected for
-// kind=DELEGATION_FAILED" in TestPooledWithEICTunnel_PreservesFnErr).
-//
-// We use a deterministic bleed-shape probe rather than goroutine-count
-// arithmetic: the cycle blocks on a release channel for ~150ms — long
-// enough that without a Wait barrier, the outer sub-test would return
-// before the goroutine exited. We then verify the wg.Wait inside
-// drainCoalesceGoroutine actually delayed t.Run's completion: total
-// elapsed must be >= the block duration. Asserts exact-shape, not
-// substring (per saved-memory feedback_assert_exact_not_substring):
-// elapsed < blockFor would mean the cleanup didn't wait, which is the
-// exact bleed we're guarding against.
-//
-// We additionally panic from the cycle (after the block) to confirm
-// the helper waits past panic recovery, not just past cycle return.
-func TestCoalesceRestart_DrainHelperWaitsForGoroutineExit(t *testing.T) {
-	const blockFor = 150 * time.Millisecond
-	const wsID = "test-coalesce-drain-helper-contract"
-	resetRestartStatesFor(wsID)
-
-	// done is closed inside the cycle, AFTER the block + AFTER the
-	// panic (which the deferred recover in coalesceRestart catches).
-	// Actually: defer in cycle runs before panic propagates to the
-	// outer recover. Use defer to close.
-	exited := make(chan struct{})
-
-	subStart := time.Now()
-	t.Run("drain_under_subtest", func(st *testing.T) {
-		drainCoalesceGoroutine(st, wsID, func() {
-			defer close(exited)
-			time.Sleep(blockFor)
-			panic("contract-test panic-after-block")
-		})
-		// st.Cleanup runs here, before t.Run returns. wg.Wait must
-		// block until the goroutine has finished its panic recovery.
-	})
-	subElapsed := time.Since(subStart)
-
-	// Contract: the helper's wg.Wait MUST have blocked t.Run from
-	// returning until after the cycle's block + panic recovery.
-	if subElapsed < blockFor {
-		t.Fatalf(
-			"drainCoalesceGoroutine contract violated: t.Run returned in %v, "+
-				"but cycle blocks for %v. The Wait barrier is broken — a "+
-				"coalesceRestart goroutine can outlive its test's t.Cleanup "+
-				"and pollute neighbour-test sqlmock state (Class H bleed).",
-			subElapsed, blockFor,
-		)
-	}
-
-	// And the goroutine must have actually closed `exited` (i.e. ran
-	// the deferred close before panic propagated through coalesceRestart's
-	// recover). If exited is still open here, the goroutine never
-	// reached the close — meaning either the panic short-circuited the
-	// defer (Go runtime bug — won't happen) or the goroutine never
-	// ran at all (drainCoalesceGoroutine spawn shape regressed).
-	select {
-	case <-exited:
-		// Correct path.
-	default:
-		t.Fatal("cycle goroutine never reached its deferred close — panic-recovery contract regressed")
-	}
-
-	// Belt-and-suspenders: the post-recover state-clear must have
-	// flipped state.running back to false. If this fails, the panic
-	// path skipped the deferred state-clear in coalesceRestart.
-	sv, ok := restartStates.Load(wsID)
-	if !ok {
-		t.Fatal("restartStates entry missing for wsID after cycle — sync.Map regression")
-	}
-	st := sv.(*restartState)
-	st.mu.Lock()
-	running := st.running
-	st.mu.Unlock()
-	if running {
-		t.Error("state.running was not cleared after panic — sticky-running deadlock regressed")
-	}
-
-	// Reference runtime.NumGoroutine to keep the runtime import
-	// honest — also a useful smoke check that the goroutine count
-	// hasn't ballooned 10x while debugging this test.
-	if n := runtime.NumGoroutine(); n > 200 {
-		t.Logf("warning: NumGoroutine=%d after drain — high but not necessarily a leak", n)
-	}
-}
-
 // TestCoalesceRestart_DifferentWorkspacesDoNotSerialize verifies the
 // per-workspace state map: an in-flight restart for ws A must not
 // block restarts for ws B. Important for performance — without this,
@@ -110,10 +110,55 @@ func (s *PostgresMessageStore) List(ctx context.Context, workspaceID string, opt
 		return nil, false, err
 	}

+	// Wire order: oldest-first within the page so canvas (and any
+	// future client) can render chronologically without per-pair
+	// reordering. The SQL is `ORDER BY created_at DESC LIMIT N` for
+	// pagination correctness, and activityRowToChatMessages emits
+	// [user, agent] within a row — so a naive client-side flat-reverse
+	// would swap the pair (agent before user at the same timestamp).
+	// Reversing ROW-AWARE here keeps the wire shape display-ready.
+	//
+	// Algorithm: group consecutive same-timestamp messages into row
+	// chunks (1-2 messages each), reverse the chunk order, flatten.
+	// Within-row [user, agent] order is preserved. Single-message
+	// rows (no agent reply yet, or attachments-only) collapse to
+	// 1-element chunks and still reverse correctly.
+	messages = reverseRowChunks(messages)
+
 	reachedEnd := rowCount < opts.Limit
 	return messages, reachedEnd, nil
 }

+// reverseRowChunks flips the order of rows while leaving each row's
+// messages in their original relative order. A "row" here is a
+// maximal run of adjacent messages whose Timestamp values are equal
+// (adjacent equality only; non-adjacent duplicates form separate
+// runs). The (user, agent) pairs emitted by activityRowToChatMessages
+// share a timestamp, so newest-row-first input comes back
+// oldest-row-first with every pair still reading user-then-agent.
+//
+// Empty or nil input is handed back unchanged; otherwise a freshly
+// allocated slice is returned and msgs itself is not modified.
+func reverseRowChunks(msgs []ChatMessage) []ChatMessage {
+	if len(msgs) == 0 {
+		return msgs
+	}
+	out := make([]ChatMessage, 0, len(msgs))
+	// Walk backwards from the tail. [start, end) always brackets one
+	// run; once it is copied out, the next run ends where this began.
+	for end := len(msgs); end > 0; {
+		start := end - 1
+		// Extend the run leftwards while the neighbour shares the
+		// same timestamp.
+		for start > 0 && msgs[start-1].Timestamp == msgs[start].Timestamp {
+			start--
+		}
+		out = append(out, msgs[start:end]...)
+		end = start
+	}
+	return out
+}
+
 // queryActivityRows is split from List so unit tests can exercise the
 // parser without spinning a real DB. Internal — alternative impls
 // shouldn't depend on the SQL shape.
@@ -14,10 +14,13 @@ package messagestore
 // legacy source the server replaces; divergence == regression.

 import (
+	"context"
 	"encoding/json"
 	"strings"
 	"testing"
 	"time"
+
+	"github.com/DATA-DOG/go-sqlmock"
 )

 const fixedTimestamp = "2026-04-25T18:00:00Z"
@@ -282,6 +285,145 @@ func TestChatHistory_NoAgentMessageWhenResponseHasNoTextNoFiles(t *testing.T) {
 	}
 }

+// =====================================================================
+// List() integration — sqlmock-backed end-to-end via the real store
+// =====================================================================
+
+// TestList_WireOrderIsOldestFirstAcrossPagedRows pins the integration
+// invariant: List() returns wire-display-ready messages even though
+// the underlying SQL is `ORDER BY created_at DESC`. This is the
+// load-bearing test for PR-C-2 — without the row-aware reversal,
+// canvas would render every paired bubble in the wrong order on every
+// chat reload (agent before user within each timestamp).
+//
+// Mutation-test cover: removing the `messages = reverseRowChunks(...)`
+// call in List() must turn this test red. (The lower-level
+// TestReverseRowChunks_PreservesPairOrderAcrossRows pins the helper
+// itself; this test pins that List ACTUALLY CALLS the helper.)
+func TestList_WireOrderIsOldestFirstAcrossPagedRows(t *testing.T) {
+	db, mock, err := sqlmock.New()
+	if err != nil {
+		t.Fatalf("sqlmock.New: %v", err)
+	}
+	defer db.Close()
+
+	// Server's SQL is ORDER BY created_at DESC. Build mock rows in
+	// THAT order so the row-aware reversal has work to do.
+	rows := sqlmock.NewRows([]string{"created_at", "status", "request_body", "response_body"}).
+		AddRow(mustParseTime(t, "2026-05-05T00:03:00Z"), "ok",
+			`{"params":{"message":{"parts":[{"kind":"text","text":"u3"}]}}}`,
+			`{"result":"a3"}`).
+		AddRow(mustParseTime(t, "2026-05-05T00:02:00Z"), "ok",
+			`{"params":{"message":{"parts":[{"kind":"text","text":"u2"}]}}}`,
+			`{"result":"a2"}`).
+		AddRow(mustParseTime(t, "2026-05-05T00:01:00Z"), "ok",
+			`{"params":{"message":{"parts":[{"kind":"text","text":"u1"}]}}}`,
+			`{"result":"a1"}`)
+
+	mock.ExpectQuery(`SELECT created_at, status, request_body::text, response_body::text`).
+		WillReturnRows(rows)
+
+	store := NewPostgresMessageStore(db)
+	msgs, reachedEnd, err := store.List(context.Background(), "ws-1", ListOptions{Limit: 10}) // Limit (10) exceeds the 3 mocked rows
+	if err != nil {
+		t.Fatalf("List: %v", err)
+	}
+
+	wantContents := []string{"u1", "a1", "u2", "a2", "u3", "a3"} // oldest row first, user before agent within each row
+	if len(msgs) != len(wantContents) {
+		t.Fatalf("len(msgs)=%d want %d; got=%v", len(msgs), len(wantContents), msgs)
+	}
+	for i, w := range wantContents {
+		if msgs[i].Content != w {
+			t.Errorf("idx %d: got %q want %q (full slice ordering broken; reverseRowChunks regressed?)", i, msgs[i].Content, w)
+		}
+	}
+	if !reachedEnd {
+		t.Errorf("3 rows < limit 10 should reach end, got reachedEnd=false")
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("sqlmock expectations: %v", err)
+	}
+}
+
+// =====================================================================
+// reverseRowChunks — wire-order helper added in PR-C-2
+// =====================================================================
+
+// TestReverseRowChunks_PreservesPairOrderAcrossRows pins the
+// row-aware reversal that List() applies before returning. Server's
+// SQL is `ORDER BY created_at DESC`, so messages come out
+// newest-row-first; activityRowToChatMessages emits [user, agent]
+// per row with same timestamp. A naive flat reversal of the messages
+// slice would flip each pair (agent before user). reverseRowChunks
+// reverses ROWS, preserving pair-internal order. Without this, canvas
+// would render every paired bubble in the wrong order on every chat
+// reload — the canvas-side reverse used to do the right thing because
+// it reversed ROWS BEFORE flattening, but PR-C/D moved the flattening
+// into the server, so the row-awareness has to live there too.
+func TestReverseRowChunks_PreservesPairOrderAcrossRows(t *testing.T) {
+	// Build messages newest-row-first as List() collects them. Each
+	// row is a pair sharing a timestamp, with [user, agent] order.
+	in := []ChatMessage{
+		{Role: "user", Content: "user_3", Timestamp: "2026-05-05T00:03:00Z"},
+		{Role: "agent", Content: "agent_3", Timestamp: "2026-05-05T00:03:00Z"},
+		{Role: "user", Content: "user_2", Timestamp: "2026-05-05T00:02:00Z"},
+		{Role: "agent", Content: "agent_2", Timestamp: "2026-05-05T00:02:00Z"},
+		{Role: "user", Content: "user_1", Timestamp: "2026-05-05T00:01:00Z"},
+		{Role: "agent", Content: "agent_1", Timestamp: "2026-05-05T00:01:00Z"},
+	}
+	got := reverseRowChunks(in) // expect row order reversed, each [user, agent] pair intact
+
+	want := []struct {
+		role, content string
+	}{
+		{"user", "user_1"}, {"agent", "agent_1"},
+		{"user", "user_2"}, {"agent", "agent_2"},
+		{"user", "user_3"}, {"agent", "agent_3"},
+	}
+	if len(got) != len(want) {
+		t.Fatalf("len(got)=%d len(want)=%d", len(got), len(want))
+	}
+	for i, w := range want {
+		if got[i].Role != w.role || got[i].Content != w.content {
+			t.Errorf("idx %d: got role=%q content=%q want role=%q content=%q",
+				i, got[i].Role, got[i].Content, w.role, w.content)
+		}
+	}
+}
+
+// TestReverseRowChunks_HandlesSingleMessageRows pins the case where
+// a row has only a user OR only an agent message (e.g., agent reply
+// not yet recorded, attachments-only user upload). Naive reversal
+// still works for single-message chunks; the test guards against a
+// future change that special-cases the 2-message-row path.
+func TestReverseRowChunks_HandlesSingleMessageRows(t *testing.T) {
+	in := []ChatMessage{
+		{Role: "user", Content: "u3", Timestamp: "2026-05-05T00:03:00Z"}, // single-message row, no agent
+		{Role: "user", Content: "u2", Timestamp: "2026-05-05T00:02:00Z"}, // paired with a2 (same timestamp)
+		{Role: "agent", Content: "a2", Timestamp: "2026-05-05T00:02:00Z"},
+		{Role: "user", Content: "u1", Timestamp: "2026-05-05T00:01:00Z"}, // single-message row, no agent
+	}
+	got := reverseRowChunks(in)
+	wantContents := []string{"u1", "u2", "a2", "u3"}
+	if len(got) != len(wantContents) {
+		t.Fatalf("len got=%d want=%d", len(got), len(wantContents))
+	}
+	for i, w := range wantContents {
+		if got[i].Content != w {
+			t.Errorf("idx %d: got %q want %q", i, got[i].Content, w)
+		}
+	}
+}
+
+// TestReverseRowChunks_EmptyInput verifies that nil input yields an empty result without panicking.
+func TestReverseRowChunks_EmptyInput(t *testing.T) {
+	got := reverseRowChunks(nil)
+	if len(got) != 0 {
+		t.Errorf("nil input should return empty, got %v", got)
+	}
+}
+
 // =====================================================================
 // end-to-end shape — paired user + agent with same timestamp
 // =====================================================================
@@ -5,19 +5,17 @@ import (
 	"context"
 	"net/http"
 	"strconv"
-	"strings"
 	"sync"
 	"time"

 	"github.com/gin-gonic/gin"
 )

-// RateLimiter implements a token bucket rate limiter keyed by tenant
-// identity (org id, then bearer token, then client IP — see keyFor).
+// RateLimiter implements a simple token bucket rate limiter per IP.
 type RateLimiter struct {
-	mu       sync.Mutex
-	buckets  map[string]*bucket
-	rate     int // tokens per interval
+	mu      sync.Mutex
+	buckets map[string]*bucket
+	rate    int           // tokens per interval
 	interval time.Duration
 }

@@ -44,9 +42,9 @@ func NewRateLimiter(rate int, interval time.Duration, ctx context.Context) *Rate
 			case <-ticker.C:
 				rl.mu.Lock()
 				cutoff := time.Now().Add(-10 * time.Minute)
-				for k, b := range rl.buckets {
+				for ip, b := range rl.buckets {
 					if b.lastReset.Before(cutoff) {
-						delete(rl.buckets, k)
+						delete(rl.buckets, ip)
 					}
 				}
 				rl.mu.Unlock()
@@ -56,73 +54,29 @@ func NewRateLimiter(rate int, interval time.Duration, ctx context.Context) *Rate
 	return rl
 }

-// keyFor returns the bucket identifier for this request. Priority:
-//
-//  1. X-Molecule-Org-Id header — when present (CP-routed SaaS traffic),
-//     isolates tenants from each other regardless of the upstream proxy IP
-//     they all share.
-//  2. SHA-256 of Authorization Bearer token — when present (per-workspace
-//     bearer, ADMIN_TOKEN, org-scoped API token). On a per-tenant Caddy
-//     box where the org-id header isn't attached, this still distinguishes
-//     distinct user sessions on the same egress IP.
-//  3. ClientIP() — anonymous probes, /health scrapes, registry boot
-//     signals (when SetTrustedProxies(nil) is in effect, this is the
-//     direct TCP RemoteAddr — fine for the probe surface, not fine as a
-//     primary key behind a proxy, hence the priority order above).
-//
-// Mixing these namespaces is fine because they never collide: org ids
-// are UUIDs ("org:..."), token hashes are 64-char hex ("tok:..."), IPs
-// contain dots/colons ("ip:...").
-//
-// Security note on X-Molecule-Org-Id spoofing: the rate limiter runs
-// BEFORE TenantGuard, so the org-id value here is unvalidated. A caller
-// reaching workspace-server directly could spoof the header to drain
-// another org's bucket. In production this surface is closed by the
-// CP/Caddy front: tenant SGs reject :8080 from the public internet, and
-// CP rewrites the header to the verified org. If a future deployment
-// exposes :8080 directly, validate the org-id (e.g. against
-// MOLECULE_ORG_ID) before keying on it, or move this middleware after
-// TenantGuard. The token-hash and IP fallbacks are unspoofable.
-//
-// Issue #59 — replaces the previous IP-only keying that silently
-// collapsed all canvas traffic into one bucket once #179 disabled
-// proxy-header trust. See the issue for the deployment-shape analysis.
-func (rl *RateLimiter) keyFor(c *gin.Context) string {
-	if orgID := strings.TrimSpace(c.GetHeader("X-Molecule-Org-Id")); orgID != "" {
-		return "org:" + orgID
-	}
-	if tok := bearerFromHeader(c.GetHeader("Authorization")); tok != "" {
-		return "tok:" + tokenKey(tok)
-	}
-	return "ip:" + c.ClientIP()
-}
-
-// Middleware returns a Gin middleware that rate limits per caller. The
-// caller-key derivation lives in keyFor — see that function's doc for
-// the priority list and rationale.
+// Middleware returns a Gin middleware that rate limits by client IP.
 func (rl *RateLimiter) Middleware() gin.HandlerFunc {
 	return func(c *gin.Context) {
 		// Tier-1b dev-mode hatch — same gate as AdminAuth / WorkspaceAuth /
 		// discovery. On a local single-user Docker setup the 600-req/min
 		// bucket fills fast: a 15-workspace canvas + activity polling +
-		// approvals polling + A2A overlay + initial hydration all land in
-		// one bucket (whichever keyFor returns — typically the dev user's
-		// IP or shared admin token), so a minute of active use can trip
-		// 429 and blank the page. Gated by MOLECULE_ENV=development +
-		// empty ADMIN_TOKEN so SaaS production keeps the bucket.
+		// approvals polling + A2A overlay + initial hydration all share
+		// one IP bucket, so a minute of active use can trip 429 and blank
+		// the page. Gated by MOLECULE_ENV=development + empty ADMIN_TOKEN
+		// so SaaS production keeps the bucket.
 		if isDevModeFailOpen() {
 			c.Header("X-RateLimit-Limit", "unlimited")
 			c.Next()
 			return
 		}

-		key := rl.keyFor(c)
+		ip := c.ClientIP()

 		rl.mu.Lock()
-		b, exists := rl.buckets[key]
+		b, exists := rl.buckets[ip]
 		if !exists {
 			b = &bucket{tokens: rl.rate, lastReset: time.Now()}
-			rl.buckets[key] = b
+			rl.buckets[ip] = b
 		}

 		// Reset tokens if interval has passed
@@ -1,303 +0,0 @@
-package middleware
-
-import (
-	"context"
-	"crypto/sha256"
-	"fmt"
-	"go/ast"
-	"go/parser"
-	"go/token"
-	"net/http"
-	"net/http/httptest"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/gin-gonic/gin"
-)
-
-// newTestLimiterForKeyFor — same shape as newTestLimiter in ratelimit_test.go
-// but exposes the *gin.Engine and lets the caller inject headers per-request.
-func newTestLimiterForKeyFor(t *testing.T, rate int) *gin.Engine {
-	t.Helper()
-	gin.SetMode(gin.TestMode)
-	ctx, cancel := context.WithCancel(context.Background())
-	t.Cleanup(cancel)
-	rl := NewRateLimiter(rate, 5*time.Second, ctx)
-	r := gin.New()
-	if err := r.SetTrustedProxies(nil); err != nil {
-		t.Fatalf("SetTrustedProxies: %v", err)
-	}
-	r.Use(rl.Middleware())
-	r.GET("/x", func(c *gin.Context) { c.String(http.StatusOK, "ok") })
-	return r
-}
-
-// TestKeyFor_OrgIdHeaderTrumpsBearerAndIP — when X-Molecule-Org-Id is set
-// the bucket is keyed on it regardless of bearer token or IP. This is the
-// load-bearing case for the production SaaS plane: every tenant routed
-// through the same upstream proxy IP gets its own bucket because the
-// CP attaches the org-id header.
-func TestKeyFor_OrgIdHeaderTrumpsBearerAndIP(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-	ctx, cancel := context.WithCancel(context.Background())
-	t.Cleanup(cancel)
-	rl := NewRateLimiter(2, 5*time.Second, ctx)
-
-	c, _ := gin.CreateTestContext(httptest.NewRecorder())
-	c.Request = httptest.NewRequest(http.MethodGet, "/x", nil)
-	c.Request.RemoteAddr = "10.0.0.1:1234"
-	c.Request.Header.Set("X-Molecule-Org-Id", "org-aaa")
-	c.Request.Header.Set("Authorization", "Bearer ignored-token-value")
-
-	got := rl.keyFor(c)
-	if got != "org:org-aaa" {
-		t.Errorf("keyFor with org-id header: got %q, want %q", got, "org:org-aaa")
-	}
-}
-
-// TestKeyFor_BearerTokenWhenNoOrgId — the per-tenant Caddy box path:
-// no org-id header (canvas same-origin), but Authorization Bearer is
-// always set by WorkspaceAuth-protected routes. Bucket keyed on the
-// SHA-256 hex of the token so distinct sessions on the same egress IP
-// get distinct buckets — and so the in-memory map can never become a
-// token dump if the process is inspected.
-func TestKeyFor_BearerTokenWhenNoOrgId(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-	ctx, cancel := context.WithCancel(context.Background())
-	t.Cleanup(cancel)
-	rl := NewRateLimiter(2, 5*time.Second, ctx)
-
-	c, _ := gin.CreateTestContext(httptest.NewRecorder())
-	c.Request = httptest.NewRequest(http.MethodGet, "/x", nil)
-	c.Request.RemoteAddr = "10.0.0.1:1234"
-	c.Request.Header.Set("Authorization", "Bearer secret-token-abc")
-
-	got := rl.keyFor(c)
-	expectedHash := fmt.Sprintf("%x", sha256.Sum256([]byte("secret-token-abc")))
-	if got != "tok:"+expectedHash {
-		t.Errorf("keyFor with bearer-only: got %q, want %q", got, "tok:"+expectedHash)
-	}
-	// Critical security pin: raw token must never appear in the key.
-	if strings.Contains(got, "secret-token-abc") {
-		t.Errorf("keyFor leaked raw bearer token in bucket key: %q", got)
-	}
-}
-
-// TestKeyFor_IPFallbackWhenNoOrgIdNoBearer — anonymous probes (no auth,
-// no tenant header) fall through to ClientIP keying. This is the only
-// path that depended on the pre-#179 trust-XFF behaviour and is fine
-// to keep IP-keyed because the surface is just /health, /buildinfo,
-// and the registry-boot endpoints.
-func TestKeyFor_IPFallbackWhenNoOrgIdNoBearer(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-	ctx, cancel := context.WithCancel(context.Background())
-	t.Cleanup(cancel)
-	rl := NewRateLimiter(2, 5*time.Second, ctx)
-
-	c, _ := gin.CreateTestContext(httptest.NewRecorder())
-	c.Request = httptest.NewRequest(http.MethodGet, "/x", nil)
-	c.Request.RemoteAddr = "203.0.113.1:1234"
-
-	got := rl.keyFor(c)
-	// gin.ClientIP() strips the port — we just need to confirm the prefix
-	// and that the IP appears.
-	if !strings.HasPrefix(got, "ip:") {
-		t.Errorf("keyFor without auth/org headers: got %q, want prefix %q", got, "ip:")
-	}
-	if !strings.Contains(got, "203.0.113.1") {
-		t.Errorf("keyFor IP fallback: got %q, want to contain %q", got, "203.0.113.1")
-	}
-}
-
-// TestRateLimit_TwoOrgsSameIP_IndependentBuckets — the load-bearing
-// regression test for issue #59. Two tenants behind the same upstream
-// proxy must NOT share a bucket; the production SaaS-plane outage was
-// every tenant collapsing to the proxy IP and saturating one bucket.
-//
-// Mutation invariant: removing the org-id branch from keyFor — say,
-// returning "ip:" + c.ClientIP() unconditionally — collapses both
-// tenants back into one bucket and this test fails on the 3rd
-// request because it would 429 instead of 200.
-func TestRateLimit_TwoOrgsSameIP_IndependentBuckets(t *testing.T) {
-	r := newTestLimiterForKeyFor(t, 2)
-
-	exhaust := func(orgID string) {
-		t.Helper()
-		for i := 0; i < 2; i++ {
-			req := httptest.NewRequest(http.MethodGet, "/x", nil)
-			req.RemoteAddr = "10.0.0.1:1234" // SAME upstream proxy IP
-			req.Header.Set("X-Molecule-Org-Id", orgID)
-			w := httptest.NewRecorder()
-			r.ServeHTTP(w, req)
-			if w.Code != http.StatusOK {
-				t.Fatalf("setup orgID=%s req %d: want 200, got %d", orgID, i+1, w.Code)
-			}
-		}
-	}
-
-	exhaust("org-aaa")
-	// org-aaa is now at 0 tokens. org-bbb's bucket must be FRESH.
-	req := httptest.NewRequest(http.MethodGet, "/x", nil)
-	req.RemoteAddr = "10.0.0.1:1234"
-	req.Header.Set("X-Molecule-Org-Id", "org-bbb")
-	w := httptest.NewRecorder()
-	r.ServeHTTP(w, req)
-	if w.Code != http.StatusOK {
-		t.Fatalf("org-bbb on same IP must have its own bucket: got %d, want 200 (issue #59 regression)", w.Code)
-	}
-
-	// Confirm org-aaa is still throttled — proves we're not just opening
-	// the gate to everyone.
-	req = httptest.NewRequest(http.MethodGet, "/x", nil)
-	req.RemoteAddr = "10.0.0.1:1234"
-	req.Header.Set("X-Molecule-Org-Id", "org-aaa")
-	w = httptest.NewRecorder()
-	r.ServeHTTP(w, req)
-	if w.Code != http.StatusTooManyRequests {
-		t.Errorf("org-aaa exhausted bucket: want 429, got %d", w.Code)
-	}
-}
-
-// TestRateLimit_TwoTokensSameIP_IndependentBuckets — analog of the
-// org-id case for the per-tenant Caddy box: two distinct user
-// sessions on the same egress IP, distinguished only by their bearer
-// tokens, must get independent buckets. This was the path Hongming
-// hit on hongming.moleculesai.app — a single user with multiple
-// browser tabs against one workspace-server box.
-func TestRateLimit_TwoTokensSameIP_IndependentBuckets(t *testing.T) {
-	r := newTestLimiterForKeyFor(t, 2)
-
-	exhaust := func(token string) {
-		t.Helper()
-		for i := 0; i < 2; i++ {
-			req := httptest.NewRequest(http.MethodGet, "/x", nil)
-			req.RemoteAddr = "127.0.0.1:1234" // local Caddy proxy — same for both
-			req.Header.Set("Authorization", "Bearer "+token)
-			w := httptest.NewRecorder()
-			r.ServeHTTP(w, req)
-			if w.Code != http.StatusOK {
-				t.Fatalf("setup token=%s req %d: want 200, got %d", token, i+1, w.Code)
-			}
-		}
-	}
-
-	exhaust("user-a-token")
-	req := httptest.NewRequest(http.MethodGet, "/x", nil)
-	req.RemoteAddr = "127.0.0.1:1234"
-	req.Header.Set("Authorization", "Bearer user-b-token")
-	w := httptest.NewRecorder()
-	r.ServeHTTP(w, req)
-	if w.Code != http.StatusOK {
-		t.Fatalf("user-b token on same proxy IP must have its own bucket: got %d, want 200", w.Code)
-	}
-}
-
-// TestRateLimit_SameOrgDifferentTokens_SharedBucket — counter-pin:
-// ensure org-id keying really does collapse all tokens within one
-// org into one bucket. This is the desired behaviour: a tenant that
-// mints multiple tokens shouldn't be able to circumvent its quota
-// by rotating tokens between requests. (The same-IP-different-org
-// test above proves we don't collapse ACROSS orgs; this one proves
-// we DO collapse WITHIN one org.)
-func TestRateLimit_SameOrgDifferentTokens_SharedBucket(t *testing.T) {
-	r := newTestLimiterForKeyFor(t, 2)
-
-	for _, tok := range []string{"token-1", "token-2"} {
-		req := httptest.NewRequest(http.MethodGet, "/x", nil)
-		req.RemoteAddr = "10.0.0.1:1234"
-		req.Header.Set("X-Molecule-Org-Id", "org-shared")
-		req.Header.Set("Authorization", "Bearer "+tok)
-		w := httptest.NewRecorder()
-		r.ServeHTTP(w, req)
-		if w.Code != http.StatusOK {
-			t.Fatalf("setup tok=%s: want 200, got %d", tok, w.Code)
-		}
-	}
-	// Bucket should be exhausted now — third request, even with a fresh
-	// token, must 429 because the org-id is keying it.
-	req := httptest.NewRequest(http.MethodGet, "/x", nil)
-	req.RemoteAddr = "10.0.0.1:1234"
-	req.Header.Set("X-Molecule-Org-Id", "org-shared")
-	req.Header.Set("Authorization", "Bearer token-3")
-	w := httptest.NewRecorder()
-	r.ServeHTTP(w, req)
-	if w.Code != http.StatusTooManyRequests {
-		t.Errorf("rotating tokens within one org should NOT bypass the quota: got %d, want 429", w.Code)
-	}
-}
-
-// TestRateLimit_Middleware_RoutesThroughKeyFor is the AST gate (mirror
-// of #36/#10/#12's gates). Pins the SSOT routing invariant:
-// (*RateLimiter).Middleware MUST call rl.keyFor and MUST NOT carry a
-// direct c.ClientIP() call (= the parallel-impl drift this PR fixes).
-//
-// Mutation invariant: a future PR that re-introduces direct IP keying
-// in Middleware (`ip := c.ClientIP()`) makes this test fail. That's
-// the signal to either (a) extend keyFor's contract to cover the new
-// case OR (b) update this gate with an explicit reason. Either way the
-// drift gets a reviewer's attention before shipping.
-func TestRateLimit_Middleware_RoutesThroughKeyFor(t *testing.T) {
-	fset := token.NewFileSet()
-	file, err := parser.ParseFile(fset, "ratelimit.go", nil, parser.ParseComments)
-	if err != nil {
-		t.Fatalf("parse ratelimit.go: %v", err)
-	}
-
-	var fn *ast.FuncDecl
-	ast.Inspect(file, func(n ast.Node) bool {
-		f, ok := n.(*ast.FuncDecl)
-		if !ok {
-			return true
-		}
-		// Match `func (rl *RateLimiter) Middleware() ...`
-		if f.Name.Name != "Middleware" {
-			return true
-		}
-		if f.Recv == nil || len(f.Recv.List) != 1 {
-			return true
-		}
-		star, ok := f.Recv.List[0].Type.(*ast.StarExpr)
-		if !ok {
-			return true
-		}
-		if id, ok := star.X.(*ast.Ident); !ok || id.Name != "RateLimiter" {
-			return true
-		}
-		fn = f
-		return false
-	})
-	if fn == nil {
-		t.Fatal("(*RateLimiter).Middleware not found — was it renamed? update this gate or the SSOT routing assumption")
-	}
-
-	var (
-		callsKeyFor   bool
-		callsClientIP bool
-	)
-	ast.Inspect(fn.Body, func(n ast.Node) bool {
-		call, ok := n.(*ast.CallExpr)
-		if !ok {
-			return true
-		}
-		sel, ok := call.Fun.(*ast.SelectorExpr)
-		if !ok {
-			return true
-		}
-		switch sel.Sel.Name {
-		case "keyFor":
-			callsKeyFor = true
-		case "ClientIP":
-			callsClientIP = true
-		}
-		return true
-	})
-
-	if !callsKeyFor {
-		t.Error("(*RateLimiter).Middleware must call rl.keyFor for SSOT bucket-key derivation — see issue #59. Found no keyFor call.")
-	}
-	if callsClientIP {
-		t.Error("(*RateLimiter).Middleware carries a direct c.ClientIP() call. This is the parallel-impl drift issue #59 fixed. " +
-			"Either route through rl.keyFor OR — if a new use case truly needs direct IP — extend keyFor's contract first and update this gate to allow the specific delta.")
-	}
-}
@@ -1,545 +0,0 @@
-package provisioner
-
-import (
-	"context"
-	"crypto/sha256"
-	"encoding/hex"
-	"errors"
-	"fmt"
-	"io"
-	"log"
-	"net/http"
-	"net/url"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"strings"
-	"sync"
-	"time"
-)
-
-// Local-build mode: clone the workspace-template-<runtime> repo from Gitea
-// and `docker build` it on the host so OSS contributors can run molecule-core
-// end-to-end without authenticating to (or being able to reach) GHCR/ECR.
-//
-// The flow:
-//
-//  1. ensureLocalImage(runtime) is called by the provisioner before
-//     ContainerCreate, but only when Resolve().Mode == RegistryModeLocal.
-//  2. We compute a cache key from the Gitea repo's HEAD sha (one HTTP
-//     call to https://git.moleculesai.app/api/v1/repos/.../branches/main).
-//  3. If `molecule-local/workspace-template-<runtime>:<sha12>` already
-//     exists in the local Docker image store, we return immediately.
-//  4. Otherwise: shallow git-clone the repo into the cache dir, then
-//     `docker buildx build --platform=linux/amd64 -t <tag>` on it. We
-//     also tag `:latest` so `docker images` shows a friendly entry.
-//
-// Why amd64 emulation: the provisioner's defaultImagePlatform() forces
-// linux/amd64 on Apple Silicon for parity with the (amd64-only) prod
-// images. Building native arm64 in local-mode would diverge — see the
-// design rationale in Issue #63 and the saved memory
-// `feedback_local_must_mimic_production`.
-//
-// Auth: clone is anonymous (templates are public). If MOLECULE_GITEA_TOKEN
-// is set, we use it via the URL's userinfo — the token is masked in
-// every log line by maskTokenInURL().
-//
-// Failure mode: fail-closed. If Gitea is unreachable we surface a clear
-// error message including the repo URL; we NEVER fall back to GHCR/ECR
-// silently (would be a confusing bug for an OSS contributor who
-// happens to have stale ECR creds in their docker config).
-
-// gitTemplateRepoPrefix is the prefix all workspace-template repos live
-// under on Gitea. Hardcoded so an attacker who controlled cfg.Runtime
-// (defence-in-depth — today the field is platform-validated upstream)
-// can only ever reach a repo under molecule-ai/.
-//
-// Operators who want to point local-build at a fork can override the
-// full prefix via MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX (e.g.
-// `https://git.example.com/myorg/molecule-ai-workspace-template-`).
-// Default-off; opt-in only.
-const gitTemplateRepoPrefix = "https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-"
-
-// localBuildLockMap serializes concurrent ensureLocalImage calls per
-// runtime so two workspace creates that hit the cold path together don't
-// race on `docker build` (Docker's daemon would serialize anyway, but
-// the duplicate clone + log spam are confusing). Lock granularity is
-// per-runtime, so different runtimes still build in parallel.
-var (
-	localBuildLockMap   = make(map[string]*sync.Mutex)
-	localBuildLockMapMu sync.Mutex
-)
-
-func runtimeBuildLock(runtime string) *sync.Mutex {
-	localBuildLockMapMu.Lock()
-	defer localBuildLockMapMu.Unlock()
-	if m, ok := localBuildLockMap[runtime]; ok {
-		return m
-	}
-	m := &sync.Mutex{}
-	localBuildLockMap[runtime] = m
-	return m
-}
-
-// LocalBuildOptions controls the local-build path. Exposed so tests can
-// inject fakes without standing up a real git+docker chain. Production
-// uses zero-value defaults via newDefaultLocalBuildOptions().
-type LocalBuildOptions struct {
-	// CacheDir is the host filesystem location where cloned template
-	// repos are kept between builds. Empty = use $XDG_CACHE_HOME or
-	// $HOME/.cache. Override via env var MOLECULE_LOCAL_BUILD_CACHE.
-	CacheDir string
-
-	// RepoPrefix is the URL prefix all template repos hang off. Empty
-	// = use gitTemplateRepoPrefix. Override via env var
-	// MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX.
-	RepoPrefix string
-
-	// Token, if non-empty, is sent via URL userinfo to Gitea. Default
-	// empty (templates are public). Override via env var
-	// MOLECULE_GITEA_TOKEN.
-	Token string
-
-	// Platform is the buildx --platform value. Empty = host default;
-	// today we always pass linux/amd64 because the provisioner only
-	// runs amd64 images. Exposed so tests can override.
-	Platform string
-
-	// HTTPClient is used for the Gitea-API HEAD-sha lookup. Empty =
-	// http.DefaultClient with a 30s timeout.
-	HTTPClient *http.Client
-
-	// remoteHeadSha + dockerBuild + gitClone are seams for tests; if
-	// nil, the production implementations are used.
-	remoteHeadSha func(ctx context.Context, opts *LocalBuildOptions, runtime string) (string, error)
-	gitClone      func(ctx context.Context, opts *LocalBuildOptions, runtime, dest string) error
-	dockerBuild   func(ctx context.Context, opts *LocalBuildOptions, contextDir, tag string) error
-	dockerHasTag  func(ctx context.Context, tag string) (bool, error)
-	dockerTag     func(ctx context.Context, src, dst string) error
-}
-
-func newDefaultLocalBuildOptions() *LocalBuildOptions {
-	o := &LocalBuildOptions{
-		CacheDir:   os.Getenv("MOLECULE_LOCAL_BUILD_CACHE"),
-		RepoPrefix: os.Getenv("MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX"),
-		Token:      os.Getenv("MOLECULE_GITEA_TOKEN"),
-		Platform:   "linux/amd64",
-	}
-	if o.CacheDir == "" {
-		if xdg := os.Getenv("XDG_CACHE_HOME"); xdg != "" {
-			o.CacheDir = filepath.Join(xdg, "molecule", "workspace-template-build")
-		} else if home, err := os.UserHomeDir(); err == nil {
-			o.CacheDir = filepath.Join(home, ".cache", "molecule", "workspace-template-build")
-		} else {
-			// Last-resort fallback: /tmp. Loses the cache between reboots
-			// but at least lets the path produce builds.
-			o.CacheDir = filepath.Join(os.TempDir(), "molecule", "workspace-template-build")
-		}
-	}
-	if o.RepoPrefix == "" {
-		o.RepoPrefix = gitTemplateRepoPrefix
-	}
-	o.HTTPClient = &http.Client{Timeout: 30 * time.Second}
-	return o
-}
-
-// LocalImageTag formats the SHA-pinned tag for a runtime. Exported for
-// tests + the provisioner's image-resolution branch.
-func LocalImageTag(runtime, sha string) string {
-	short := sha
-	if len(short) > 12 {
-		short = short[:12]
-	}
-	return fmt.Sprintf("%s/workspace-template-%s:%s", localImagePrefix, runtime, short)
-}
-
-// LocalImageLatestTag returns the floating `:latest` form. Used as a
-// human-readable alias and as the value RuntimeImage() returns in
-// local-mode.
-func LocalImageLatestTag(runtime string) string {
-	return fmt.Sprintf("%s/workspace-template-%s:latest", localImagePrefix, runtime)
-}
-
-// EnsureLocalImage is the entry point the provisioner calls before
-// ContainerCreate when Resolve().Mode == RegistryModeLocal. Returns the
-// image tag (SHA-pinned form) the caller should hand to Docker, or an
-// error if the build/clone fails.
-//
-// Concurrency: per-runtime lock; parallel calls for the same runtime
-// share the build, parallel calls for different runtimes proceed.
-//
-// Idempotent: a cached SHA-pinned tag short-circuits without network
-// or docker calls. The Gitea HEAD lookup is the only network call on
-// the cache-hit path.
-func EnsureLocalImage(ctx context.Context, runtime string) (string, error) {
-	return ensureLocalImageWithOpts(ctx, runtime, newDefaultLocalBuildOptions())
-}
-
-// ensureLocalImageHook is the seam Start() calls into. Production code
-// uses EnsureLocalImage; tests substitute a fake to exercise the
-// provisioner-Start integration without standing up a real
-// git+docker chain. Single-process scoped — never reassigned in
-// production code.
-var ensureLocalImageHook = EnsureLocalImage
-
-func ensureLocalImageWithOpts(ctx context.Context, runtime string, opts *LocalBuildOptions) (string, error) {
-	if !IsKnownRuntime(runtime) {
-		return "", fmt.Errorf("local-build: refusing to build unknown runtime %q (must be one of %v)", runtime, knownRuntimes)
-	}
-
-	lock := runtimeBuildLock(runtime)
-	lock.Lock()
-	defer lock.Unlock()
-
-	// 1. HEAD lookup → cache key.
-	headFn := opts.remoteHeadSha
-	if headFn == nil {
-		headFn = remoteHeadShaProd
-	}
-	sha, err := headFn(ctx, opts, runtime)
-	if err != nil {
-		// Fail-closed: do not fall back to GHCR/ECR. The whole point of
-		// local-build mode is that GHCR is unreachable.
-		return "", fmt.Errorf("local-build: cannot determine HEAD sha for runtime %q at %s: %w", runtime, repoURL(opts, runtime), err)
-	}
-	if len(sha) < 12 {
-		return "", fmt.Errorf("local-build: Gitea returned a short sha %q for runtime %q (expected ≥12 chars)", sha, runtime)
-	}
-	tag := LocalImageTag(runtime, sha)
-	latest := LocalImageLatestTag(runtime)
-
-	// 2. Cache hit?
-	hasFn := opts.dockerHasTag
-	if hasFn == nil {
-		hasFn = dockerHasTagProd
-	}
-	exists, hasErr := hasFn(ctx, tag)
-	if hasErr != nil {
-		log.Printf("local-build: image inspect for %s failed (%v); will rebuild", tag, hasErr)
-	}
-	if exists {
-		log.Printf("local-build: cache hit for %s (sha=%s) — skipping clone+build", tag, sha[:12])
-		// Refresh the floating :latest alias so admins inspecting `docker
-		// images` see the current sha. Best-effort.
-		tagFn := opts.dockerTag
-		if tagFn == nil {
-			tagFn = dockerTagProd
-		}
-		if tErr := tagFn(ctx, tag, latest); tErr != nil {
-			log.Printf("local-build: best-effort retag of %s → %s failed: %v", tag, latest, tErr)
-		}
-		return tag, nil
-	}
-
-	// 3. Cold path — clone + build.
-	dest := filepath.Join(opts.CacheDir, runtime, sha[:12])
-	if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil {
-		return "", fmt.Errorf("local-build: prepare cache dir %q: %w", filepath.Dir(dest), err)
-	}
-	// Idempotent: if the dest exists from a previous failed run, wipe and
-	// re-clone so we don't build a partial tree.
-	if _, statErr := os.Stat(dest); statErr == nil {
-		if rmErr := os.RemoveAll(dest); rmErr != nil {
-			return "", fmt.Errorf("local-build: clean stale cache dir %q: %w", dest, rmErr)
-		}
-	}
-
-	cloneFn := opts.gitClone
-	if cloneFn == nil {
-		cloneFn = gitCloneProd
-	}
-	log.Printf("local-build: cloning %s → %s (sha=%s)", redactedRepoURL(opts, runtime), dest, sha[:12])
-	cloneStart := time.Now()
-	if err := cloneFn(ctx, opts, runtime, dest); err != nil {
-		// Best-effort cleanup so a half-cloned tree doesn't poison future runs.
-		_ = os.RemoveAll(dest)
-		return "", fmt.Errorf("local-build: clone %s: %w", redactedRepoURL(opts, runtime), err)
-	}
-	log.Printf("local-build: clone complete in %s", time.Since(cloneStart).Round(time.Millisecond))
-
-	// 4. Sanity-check the cloned tree contains a Dockerfile at the root.
-	dockerfile := filepath.Join(dest, "Dockerfile")
-	info, statErr := os.Stat(dockerfile)
-	if statErr != nil || info.IsDir() {
-		_ = os.RemoveAll(dest)
-		return "", fmt.Errorf("local-build: cloned tree at %s has no Dockerfile (template repo malformed)", dest)
-	}
-
-	// 5. Build.
-	buildFn := opts.dockerBuild
-	if buildFn == nil {
-		buildFn = dockerBuildProd
-	}
-	log.Printf("local-build: docker build start for %s (platform=%s, context=%s)", tag, opts.Platform, dest)
-	buildStart := time.Now()
-	if err := buildFn(ctx, opts, dest, tag); err != nil {
-		return "", fmt.Errorf("local-build: docker build %s: %w", tag, err)
-	}
-	log.Printf("local-build: docker build done for %s in %s", tag, time.Since(buildStart).Round(time.Second))
-
-	// Tag :latest as a friendly alias.
-	tagFn := opts.dockerTag
-	if tagFn == nil {
-		tagFn = dockerTagProd
-	}
-	if err := tagFn(ctx, tag, latest); err != nil {
-		log.Printf("local-build: best-effort retag of %s → %s failed: %v", tag, latest, err)
-	}
-
-	return tag, nil
-}
-
-// repoURL composes the full Gitea repo URL for the given runtime. The
-// prefix is hardcoded by default; operators can override via env so a
-// fork can point local-build at their own Gitea instance.
-func repoURL(opts *LocalBuildOptions, runtime string) string {
-	return opts.RepoPrefix + runtime
-}
-
-// redactedRepoURL returns the same value with any embedded token replaced
-// by "***". Use this for log lines.
-func redactedRepoURL(opts *LocalBuildOptions, runtime string) string {
-	return maskTokenInURL(repoURL(opts, runtime))
-}
-
-// maskTokenInURL replaces userinfo (username:password@) in a URL with
-// `***@` so log lines never echo a Gitea PAT. Returns the input as-is
-// on parse failures (defence: never silently corrupt the visible URL).
-//
-// Implementation note: net/url's URL.User stringifier percent-encodes
-// the username, so `u.User = url.User("***"); u.String()` would yield
-// `https://%2A%2A%2A@host/...` — unhelpful for humans grepping logs.
-// We drop the userinfo via URL.User=nil, get the canonical scheme-and-
-// rest, and re-insert the literal `***@` between the scheme separator
-// and the host.
-func maskTokenInURL(s string) string {
-	u, err := url.Parse(s)
-	if err != nil || u.User == nil {
-		return s
-	}
-	u.User = nil
-	out := u.String()
-	prefix := u.Scheme + "://"
-	if !strings.HasPrefix(out, prefix) {
-		return s
-	}
-	return prefix + "***@" + out[len(prefix):]
-}
-
-// remoteHeadShaProd looks up the HEAD commit sha of branch `main` for
-// the workspace-template-<runtime> repo on Gitea. We use the Gitea API
-// (a single HTTPS call) rather than `git ls-remote` so we don't need a
-// git binary just for the HEAD lookup — we still need git for the
-// clone, but the cache-hit path stays git-free.
-func remoteHeadShaProd(ctx context.Context, opts *LocalBuildOptions, runtime string) (string, error) {
-	// Convert a `git.example.com/org/prefix-` URL into the API form
-	// `git.example.com/api/v1/repos/org/prefix-<runtime>/branches/main`.
-	// Works for both git.moleculesai.app (default) and any forks that
-	// share the Gitea API shape.
-	apiURL, err := giteaBranchAPIURL(opts.RepoPrefix, runtime, "main")
-	if err != nil {
-		return "", err
-	}
-	req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil)
-	if err != nil {
-		return "", err
-	}
-	if opts.Token != "" {
-		// Gitea accepts "token <PAT>" in the Authorization header for
-		// API calls. Userinfo is also accepted but only matters for
-		// the HTTPS clone, not the JSON API.
-		req.Header.Set("Authorization", "token "+opts.Token)
-	}
-	cli := opts.HTTPClient
-	if cli == nil {
-		cli = &http.Client{Timeout: 30 * time.Second}
-	}
-	resp, err := cli.Do(req)
-	if err != nil {
-		return "", err
-	}
-	defer func() { _ = resp.Body.Close() }()
-	if resp.StatusCode == http.StatusNotFound {
-		return "", fmt.Errorf("repo not found at %s — runtime %q may not be mirrored to Gitea (only claude-code/hermes/langgraph/autogen today)", apiURL, runtime)
-	}
-	if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
-		return "", fmt.Errorf("auth failure (%d) at %s — verify MOLECULE_GITEA_TOKEN if private repo", resp.StatusCode, apiURL)
-	}
-	if resp.StatusCode != http.StatusOK {
-		return "", fmt.Errorf("HEAD lookup at %s returned %d", apiURL, resp.StatusCode)
-	}
-	body, err := io.ReadAll(io.LimitReader(resp.Body, 64<<10))
-	if err != nil {
-		return "", fmt.Errorf("read HEAD response body: %w", err)
-	}
-	// Tiny ad-hoc parser: we want commit.id, no need to drag in encoding/json
-	// — actually simpler to use json. Switch to it.
-	return parseGiteaBranchHeadSha(body)
-}
-
-// giteaBranchAPIURL maps a repo-prefix URL like
-// `https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-`
-// + runtime "claude-code" + branch "main"
-// to the API URL
-// `https://git.moleculesai.app/api/v1/repos/molecule-ai/molecule-ai-workspace-template-claude-code/branches/main`.
-func giteaBranchAPIURL(repoPrefix, runtime, branch string) (string, error) {
-	u, err := url.Parse(repoPrefix + runtime)
-	if err != nil {
-		return "", fmt.Errorf("parse repo URL %q: %w", repoPrefix+runtime, err)
-	}
-	parts := strings.TrimPrefix(u.Path, "/")
-	parts = strings.TrimSuffix(parts, "/")
-	if parts == "" {
-		return "", fmt.Errorf("repo URL %q has empty path", repoPrefix+runtime)
-	}
-	// Expect `<org>/<repo>` (single slash) — the prefix already includes
-	// org+partial-repo; runtime appends the rest.
-	if !strings.Contains(parts, "/") {
-		return "", fmt.Errorf("repo URL %q missing org/repo path", repoPrefix+runtime)
-	}
-	apiURL := url.URL{
-		Scheme: u.Scheme,
-		Host:   u.Host,
-		Path:   "/api/v1/repos/" + parts + "/branches/" + branch,
-	}
-	return apiURL.String(), nil
-}
-
-// parseGiteaBranchHeadSha extracts commit.id from the Gitea
-// /branches/<name> response. We use a permissive substring scan so a
-// missing-key in the JSON gives a clear error rather than the
-// json.Decoder's somewhat opaque "missing field" message.
-func parseGiteaBranchHeadSha(body []byte) (string, error) {
-	// Look for `"id":"<40-hex>"` inside the commit object.
-	idx := strings.Index(string(body), `"id":"`)
-	if idx < 0 {
-		return "", errors.New("Gitea branch response missing commit.id field")
-	}
-	rest := string(body[idx+len(`"id":"`):])
-	end := strings.IndexByte(rest, '"')
-	if end < 0 {
-		return "", errors.New("Gitea branch response has malformed commit.id (no closing quote)")
-	}
-	sha := rest[:end]
-	if len(sha) < 7 {
-		return "", fmt.Errorf("Gitea returned suspiciously short sha %q", sha)
-	}
-	return sha, nil
-}
-
-// gitCloneProd shallow-clones the runtime's template repo into dest.
-//
-// We invoke `git` rather than implementing the protocol ourselves —
-// every host that runs the workspace-server already needs git available
-// (it's a hard dep of go-mod for vendored repos) and the OSS contributor
-// onboarding doc lists it as a prerequisite.
-func gitCloneProd(ctx context.Context, opts *LocalBuildOptions, runtime, dest string) error {
-	cloneURL := repoURL(opts, runtime)
-	if opts.Token != "" {
-		// HTTPS clone with userinfo: https://oauth2:<token>@host/...
-		u, err := url.Parse(cloneURL)
-		if err == nil {
-			u.User = url.UserPassword("oauth2", opts.Token)
-			cloneURL = u.String()
-		}
-		// On parse failure we silently fall through to the public URL —
-		// better to attempt the anonymous clone than to refuse outright.
-	}
-	cmd := exec.CommandContext(ctx, "git", "clone", "--depth=1", "--branch=main", "--single-branch", cloneURL, dest)
-	// Drop git's askpass prompts so we fail-fast on auth errors instead
-	// of hanging waiting for an interactive password.
-	cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0", "GIT_ASKPASS=/bin/echo")
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		// Mask the token in any error string git emits via stderr — git
-		// occasionally echoes the URL verbatim on failure.
-		errMsg := maskTokenInString(string(out), opts.Token)
-		return fmt.Errorf("%w: %s", err, strings.TrimSpace(errMsg))
-	}
-	return nil
-}
-
-// maskTokenInString replaces literal occurrences of the token with `***`.
-// Defence against git binary or docker echoing the URL into stderr.
-func maskTokenInString(s, token string) string {
-	if token == "" {
-		return s
-	}
-	return strings.ReplaceAll(s, token, "***")
-}
-
-// dockerBuildProd invokes the docker CLI to build the workspace-template
-// image. We shell out rather than use the Docker SDK's ImageBuild — the
-// SDK requires hand-tarballing the build context, which adds a
-// non-trivial code path with its own bug surface. The docker CLI is
-// already a hard dep of the workspace-server (the provisioner needs the
-// daemon), so requiring the CLI binary on PATH adds nothing.
-//
-// Uses the legacy `docker build` (not `docker buildx build`) because
-// buildx isn't always installed by default on Linux distros and the
-// legacy builder produces an image the local Docker daemon picks up
-// automatically. We pass --platform=linux/amd64 directly; with Docker
-// 20.10+ this works without buildx because the legacy builder
-// auto-promotes to BuildKit when available, falling back to v1
-// otherwise (still produces an amd64 image via QEMU).
-func dockerBuildProd(ctx context.Context, opts *LocalBuildOptions, contextDir, tag string) error {
-	args := []string{"build"}
-	if opts.Platform != "" {
-		args = append(args, "--platform="+opts.Platform)
-	}
-	args = append(args,
-		"-t", tag,
-		"-f", filepath.Join(contextDir, "Dockerfile"),
-		contextDir,
-	)
-	cmd := exec.CommandContext(ctx, "docker", args...)
-	cmd.Env = append(os.Environ(), "DOCKER_BUILDKIT=1")
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		// Sanitize defensive — docker build output shouldn't contain a
-		// token, but maskTokenInString is a no-op when token is empty.
-		return fmt.Errorf("%w: %s", err, strings.TrimSpace(maskTokenInString(string(out), opts.Token)))
-	}
-	return nil
-}
-
-// dockerHasTagProd returns true iff the given tag exists in the local
-// image store. Used as the fast cache-hit check.
-func dockerHasTagProd(ctx context.Context, tag string) (bool, error) {
-	cmd := exec.CommandContext(ctx, "docker", "image", "inspect", "--format={{.Id}}", tag)
-	out, err := cmd.CombinedOutput()
-	if err == nil {
-		return strings.TrimSpace(string(out)) != "", nil
-	}
-	// `docker image inspect` exits 1 with "Error: No such image" when
-	// missing — that's a definitive false, not an error condition.
-	low := strings.ToLower(string(out))
-	if strings.Contains(low, "no such image") || strings.Contains(low, "not found") {
-		return false, nil
-	}
-	return false, fmt.Errorf("%w: %s", err, strings.TrimSpace(string(out)))
-}
-
-// dockerTagProd creates an alias from src → dst. Used to refresh the
-// floating `:latest` after a build or cache hit.
-func dockerTagProd(ctx context.Context, src, dst string) error {
-	cmd := exec.CommandContext(ctx, "docker", "tag", src, dst)
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		return fmt.Errorf("%w: %s", err, strings.TrimSpace(string(out)))
-	}
-	return nil
-}
-
-// CacheKey is exposed for diagnostic logs / tests so the cache-key shape
-// is documented in code rather than only as a string format.
-//
-//	cache_key = sha256(runtime || head_sha || repoPrefix)[:16]
-//
-// Today only the SHA is consumed, but the helper is kept for future
-// extensions (e.g. include Dockerfile-content-hash to invalidate when
-// only the Dockerfile changes between two runs targeting the same SHA).
-func CacheKey(runtime, sha, repoPrefix string) string {
-	h := sha256.Sum256([]byte(runtime + "|" + sha + "|" + repoPrefix))
-	return hex.EncodeToString(h[:8])
-}
@@ -1,662 +0,0 @@
-package provisioner
-
-import (
-	"context"
-	"errors"
-	"fmt"
-	"net/http"
-	"net/http/httptest"
-	"os"
-	"path/filepath"
-	"strings"
-	"sync"
-	"testing"
-)
-
-// makeTestOpts produces a LocalBuildOptions where every external seam
-// (Gitea HEAD, git clone, docker build/has/tag) is replaced by a stub.
-// Tests override the stub for the behavior they want to assert.
-func makeTestOpts(t *testing.T) *LocalBuildOptions {
-	t.Helper()
-	tmp := t.TempDir()
-	return &LocalBuildOptions{
-		CacheDir:   tmp,
-		RepoPrefix: "https://git.test/molecule-ai/molecule-ai-workspace-template-",
-		Platform:   "linux/amd64",
-		HTTPClient: &http.Client{},
-		remoteHeadSha: func(ctx context.Context, opts *LocalBuildOptions, runtime string) (string, error) {
-			return "abcdef0123456789abcdef0123456789abcdef01", nil
-		},
-		gitClone: func(ctx context.Context, opts *LocalBuildOptions, runtime, dest string) error {
-			// Write a fake Dockerfile so the sanity-check passes.
-			if err := os.MkdirAll(dest, 0o755); err != nil {
-				return err
-			}
-			return os.WriteFile(filepath.Join(dest, "Dockerfile"), []byte("FROM scratch\n"), 0o644)
-		},
-		dockerBuild: func(ctx context.Context, opts *LocalBuildOptions, contextDir, tag string) error {
-			return nil
-		},
-		dockerHasTag: func(ctx context.Context, tag string) (bool, error) {
-			return false, nil
-		},
-		dockerTag: func(ctx context.Context, src, dst string) error {
-			return nil
-		},
-	}
-}
-
-// TestEnsureLocalImage_Success — happy path: HEAD lookup succeeds, no
-// cache hit, clone + build run, returned tag is SHA-pinned.
-func TestEnsureLocalImage_Success(t *testing.T) {
-	opts := makeTestOpts(t)
-	tag, err := ensureLocalImageWithOpts(context.Background(), "claude-code", opts)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	want := "molecule-local/workspace-template-claude-code:abcdef012345"
-	if tag != want {
-		t.Errorf("tag = %q, want %q", tag, want)
-	}
-}
-
-// TestEnsureLocalImage_CacheHit — second call with a cached image must
-// skip clone + build entirely.
-func TestEnsureLocalImage_CacheHit(t *testing.T) {
-	opts := makeTestOpts(t)
-	var cloneCount, buildCount int
-	opts.gitClone = func(ctx context.Context, opts *LocalBuildOptions, runtime, dest string) error {
-		cloneCount++
-		return os.WriteFile(filepath.Join(dest, "Dockerfile"), []byte("FROM scratch\n"), 0o644)
-	}
-	opts.dockerBuild = func(ctx context.Context, opts *LocalBuildOptions, contextDir, tag string) error {
-		buildCount++
-		return nil
-	}
-	opts.dockerHasTag = func(ctx context.Context, tag string) (bool, error) {
-		return true, nil // cached
-	}
-	if _, err := ensureLocalImageWithOpts(context.Background(), "hermes", opts); err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if cloneCount != 0 {
-		t.Errorf("cache hit triggered %d clones, want 0", cloneCount)
-	}
-	if buildCount != 0 {
-		t.Errorf("cache hit triggered %d builds, want 0", buildCount)
-	}
-}
-
-// TestEnsureLocalImage_UnknownRuntime — the allowlist guard rejects
-// arbitrary runtime names before any network or filesystem call.
-func TestEnsureLocalImage_UnknownRuntime(t *testing.T) {
-	opts := makeTestOpts(t)
-	for _, bad := range []string{
-		"", "unknown", "../../../etc/passwd", "claude-code; rm -rf /",
-	} {
-		t.Run(bad, func(t *testing.T) {
-			_, err := ensureLocalImageWithOpts(context.Background(), bad, opts)
-			if err == nil {
-				t.Errorf("EnsureLocalImage(%q) should fail (not a known runtime)", bad)
-			}
-			if err != nil && !strings.Contains(err.Error(), "unknown runtime") {
-				t.Errorf("error = %v, want one mentioning %q", err, "unknown runtime")
-			}
-		})
-	}
-}
-
-// TestEnsureLocalImage_GiteaUnreachable — fail-closed when the HEAD
-// lookup fails. Must NOT fall back to GHCR/ECR.
-func TestEnsureLocalImage_GiteaUnreachable(t *testing.T) {
-	opts := makeTestOpts(t)
-	opts.remoteHeadSha = func(ctx context.Context, opts *LocalBuildOptions, runtime string) (string, error) {
-		return "", errors.New("dial tcp: no such host")
-	}
-	_, err := ensureLocalImageWithOpts(context.Background(), "langgraph", opts)
-	if err == nil {
-		t.Fatalf("expected error, got nil")
-	}
-	if !strings.Contains(err.Error(), "cannot determine HEAD sha") {
-		t.Errorf("error = %v, want one mentioning HEAD sha lookup", err)
-	}
-	// Critical: error must NOT mention ghcr or ecr (no silent fallback).
-	low := strings.ToLower(err.Error())
-	if strings.Contains(low, "ghcr") || strings.Contains(low, "ecr") {
-		t.Errorf("error message %q must not mention ghcr/ecr (no silent fallback)", err.Error())
-	}
-}
-
-// TestEnsureLocalImage_RepoNotFound — Gitea returned 404. Must surface
-// a runtime-naming error so the OSS contributor can file the right
-// mirroring task.
-func TestEnsureLocalImage_RepoNotFound(t *testing.T) {
-	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		w.WriteHeader(http.StatusNotFound)
-		_, _ = w.Write([]byte(`{"message":"repo not found"}`))
-	}))
-	defer srv.Close()
-
-	opts := makeTestOpts(t)
-	opts.RepoPrefix = srv.URL + "/molecule-ai/molecule-ai-workspace-template-"
-	opts.HTTPClient = srv.Client()
-	opts.remoteHeadSha = nil // exercise real HTTP path
-
-	_, err := ensureLocalImageWithOpts(context.Background(), "crewai", opts)
-	if err == nil {
-		t.Fatalf("expected error, got nil")
-	}
-	if !strings.Contains(err.Error(), "not mirrored") && !strings.Contains(err.Error(), "not found") {
-		t.Errorf("error = %v, want a missing-repo message", err)
-	}
-}
-
-// TestEnsureLocalImage_AuthFailure — Gitea returned 401/403. Must
-// produce an actionable error (mentions the token env var so an OSS
-// contributor knows what to set).
-func TestEnsureLocalImage_AuthFailure(t *testing.T) {
-	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		w.WriteHeader(http.StatusUnauthorized)
-	}))
-	defer srv.Close()
-
-	opts := makeTestOpts(t)
-	opts.RepoPrefix = srv.URL + "/molecule-ai/molecule-ai-workspace-template-"
-	opts.HTTPClient = srv.Client()
-	opts.remoteHeadSha = nil
-
-	_, err := ensureLocalImageWithOpts(context.Background(), "claude-code", opts)
-	if err == nil {
-		t.Fatalf("expected error, got nil")
-	}
-	if !strings.Contains(err.Error(), "MOLECULE_GITEA_TOKEN") {
-		t.Errorf("error = %v, want one mentioning MOLECULE_GITEA_TOKEN", err)
-	}
-}
-
-// TestEnsureLocalImage_HeadShaWithRealJSON — exercise the JSON parser
-// against a Gitea-shaped response to catch parse drift.
-func TestEnsureLocalImage_HeadShaWithRealJSON(t *testing.T) {
-	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		// Real Gitea response shape (truncated for relevance).
-		w.Header().Set("Content-Type", "application/json")
-		_, _ = w.Write([]byte(`{
-			"name":"main",
-			"commit":{
-				"id":"3c849b3ba778abcdef0123456789abcdef012345",
-				"message":"feat: stuff"
-			}
-		}`))
-	}))
-	defer srv.Close()
-
-	opts := makeTestOpts(t)
-	opts.RepoPrefix = srv.URL + "/molecule-ai/molecule-ai-workspace-template-"
-	opts.HTTPClient = srv.Client()
-	opts.remoteHeadSha = nil // exercise real HTTP path
-
-	tag, err := ensureLocalImageWithOpts(context.Background(), "claude-code", opts)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if !strings.Contains(tag, "3c849b3ba778") {
-		t.Errorf("tag = %q, want one containing the parsed sha", tag)
-	}
-}
-
-// TestEnsureLocalImage_BuildFailure — surfaces docker-build errors with
-// the build context so an operator can debug locally.
-func TestEnsureLocalImage_BuildFailure(t *testing.T) {
-	opts := makeTestOpts(t)
-	opts.dockerBuild = func(ctx context.Context, opts *LocalBuildOptions, contextDir, tag string) error {
-		return errors.New("Dockerfile syntax error")
-	}
-	_, err := ensureLocalImageWithOpts(context.Background(), "autogen", opts)
-	if err == nil {
-		t.Fatalf("expected error, got nil")
-	}
-	if !strings.Contains(err.Error(), "docker build") {
-		t.Errorf("error = %v, want one mentioning docker build", err)
-	}
-}
-
-// TestEnsureLocalImage_MissingDockerfile — the cloned tree must contain
-// a Dockerfile at root; absence is a malformed-template-repo error.
-func TestEnsureLocalImage_MissingDockerfile(t *testing.T) {
-	opts := makeTestOpts(t)
-	opts.gitClone = func(ctx context.Context, opts *LocalBuildOptions, runtime, dest string) error {
-		// Empty dir, no Dockerfile.
-		return os.MkdirAll(dest, 0o755)
-	}
-	_, err := ensureLocalImageWithOpts(context.Background(), "hermes", opts)
-	if err == nil {
-		t.Fatalf("expected error, got nil")
-	}
-	if !strings.Contains(err.Error(), "no Dockerfile") {
-		t.Errorf("error = %v, want one mentioning missing Dockerfile", err)
-	}
-}
-
-// TestEnsureLocalImage_ConcurrentSameRuntime — two goroutines hitting
-// the same runtime serialize via the per-runtime lock; the build runs
-// once.
-func TestEnsureLocalImage_ConcurrentSameRuntime(t *testing.T) {
-	opts := makeTestOpts(t)
-	var (
-		buildCount int
-		buildMu    sync.Mutex
-	)
-	opts.dockerHasTag = func(ctx context.Context, tag string) (bool, error) {
-		// First call: cache miss. Second call (after first build): hit.
-		buildMu.Lock()
-		defer buildMu.Unlock()
-		return buildCount > 0, nil
-	}
-	opts.dockerBuild = func(ctx context.Context, opts *LocalBuildOptions, contextDir, tag string) error {
-		buildMu.Lock()
-		buildCount++
-		buildMu.Unlock()
-		return nil
-	}
-
-	const N = 5
-	var wg sync.WaitGroup
-	wg.Add(N)
-	for i := 0; i < N; i++ {
-		go func() {
-			defer wg.Done()
-			_, _ = ensureLocalImageWithOpts(context.Background(), "langgraph", opts)
-		}()
-	}
-	wg.Wait()
-	if buildCount != 1 {
-		t.Errorf("buildCount = %d, want 1 (lock should serialize concurrent calls)", buildCount)
-	}
-}
-
-// TestMaskTokenInURL — Gitea PATs in URLs must NEVER appear in logs.
-func TestMaskTokenInURL(t *testing.T) {
-	cases := []struct {
-		in   string
-		want string
-	}{
-		{"https://oauth2:secret123@git.example.com/foo/bar", "https://***@git.example.com/foo/bar"},
-		{"https://user:tok@host/path", "https://***@host/path"},
-		{"https://no-userinfo.example.com/path", "https://no-userinfo.example.com/path"},
-		{"not a url", "not a url"},
-		{"", ""},
-	}
-	for _, tc := range cases {
-		t.Run(tc.in, func(t *testing.T) {
-			got := maskTokenInURL(tc.in)
-			if got != tc.want {
-				t.Errorf("maskTokenInURL(%q) = %q, want %q", tc.in, got, tc.want)
-			}
-		})
-	}
-}
-
-// TestMaskTokenInString — defence against git/docker echoing the token
-// into stderr on failure.
-func TestMaskTokenInString(t *testing.T) {
-	got := maskTokenInString("error: clone https://oauth2:abc123@git.test/foo: failed", "abc123")
-	if strings.Contains(got, "abc123") {
-		t.Errorf("masked string %q still contains the token", got)
-	}
-	if !strings.Contains(got, "***") {
-		t.Errorf("masked string %q should have *** in place of token", got)
-	}
-	// No-op when token is empty.
-	if got := maskTokenInString("hello world", ""); got != "hello world" {
-		t.Errorf("empty token must not modify string, got %q", got)
-	}
-}
-
-// TestGiteaBranchAPIURL — the URL composer must produce the canonical
-// /api/v1/repos/<org>/<repo>/branches/<branch> shape.
-func TestGiteaBranchAPIURL(t *testing.T) {
-	cases := []struct {
-		prefix, runtime, branch, want string
-	}{
-		{
-			"https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-",
-			"claude-code",
-			"main",
-			"https://git.moleculesai.app/api/v1/repos/molecule-ai/molecule-ai-workspace-template-claude-code/branches/main",
-		},
-		{
-			"http://localhost:3000/myorg/template-",
-			"foo",
-			"main",
-			"http://localhost:3000/api/v1/repos/myorg/template-foo/branches/main",
-		},
-	}
-	for _, tc := range cases {
-		t.Run(tc.runtime, func(t *testing.T) {
-			got, err := giteaBranchAPIURL(tc.prefix, tc.runtime, tc.branch)
-			if err != nil {
-				t.Fatalf("err = %v", err)
-			}
-			if got != tc.want {
-				t.Errorf("got %q, want %q", got, tc.want)
-			}
-		})
-	}
-}
-
-// TestGiteaBranchAPIURL_RejectsMalformed — malformed prefixes (no org
-// path) produce an error rather than a malformed API call.
-func TestGiteaBranchAPIURL_RejectsMalformed(t *testing.T) {
-	for _, bad := range []string{
-		"https://example.com/", // no path component
-		"://broken",
-	} {
-		t.Run(bad, func(t *testing.T) {
-			if _, err := giteaBranchAPIURL(bad, "claude-code", "main"); err == nil {
-				t.Errorf("expected error for malformed prefix %q", bad)
-			}
-		})
-	}
-}
-
-// TestParseGiteaBranchHeadSha — pin the parser against representative
-// Gitea responses so a future Gitea API rev that adds fields doesn't
-// silently break detection.
-func TestParseGiteaBranchHeadSha(t *testing.T) {
-	good := []byte(`{"name":"main","commit":{"id":"abc123def456","message":"hi"}}`)
-	got, err := parseGiteaBranchHeadSha(good)
-	if err != nil {
-		t.Fatalf("err = %v", err)
-	}
-	if got != "abc123def456" {
-		t.Errorf("got %q, want abc123def456", got)
-	}
-
-	for _, bad := range [][]byte{
-		[]byte(`{}`),
-		[]byte(`{"name":"main","commit":{}}`),
-		[]byte(`{"commit":{"id":"`), // truncated
-		[]byte(`<html>404</html>`),
-	} {
-		if _, err := parseGiteaBranchHeadSha(bad); err == nil {
-			t.Errorf("expected error for malformed body %q", string(bad))
-		}
-	}
-}
-
-// TestLocalImageTag_ShortSha — caller-supplied SHA gets truncated to
-// 12 chars in the tag so `docker images` output stays readable.
-func TestLocalImageTag_ShortSha(t *testing.T) {
-	got := LocalImageTag("claude-code", "abcdef0123456789abcdef0123456789abcdef01")
-	want := "molecule-local/workspace-template-claude-code:abcdef012345"
-	if got != want {
-		t.Errorf("got %q, want %q", got, want)
-	}
-}
-
-// TestLocalImageLatestTag — the floating alias used as the human-readable
-// :latest entry.
-func TestLocalImageLatestTag(t *testing.T) {
-	got := LocalImageLatestTag("hermes")
-	want := "molecule-local/workspace-template-hermes:latest"
-	if got != want {
-		t.Errorf("got %q, want %q", got, want)
-	}
-}
-
-// TestRemoteHeadShaProd_IncludesAuthHeader — when a token is configured,
-// the API request must carry the `Authorization: token <pat>` header.
-func TestRemoteHeadShaProd_IncludesAuthHeader(t *testing.T) {
-	var got string
-	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		got = r.Header.Get("Authorization")
-		w.Header().Set("Content-Type", "application/json")
-		_, _ = w.Write([]byte(`{"commit":{"id":"deadbeef0000aaaa1111bbbb2222cccc33334444"}}`))
-	}))
-	defer srv.Close()
-
-	opts := makeTestOpts(t)
-	opts.RepoPrefix = srv.URL + "/myorg/template-"
-	opts.HTTPClient = srv.Client()
-	opts.Token = "secret-pat-do-not-log"
-
-	if _, err := remoteHeadShaProd(context.Background(), opts, "claude-code"); err != nil {
-		t.Fatalf("err = %v", err)
-	}
-	if got != "token secret-pat-do-not-log" {
-		t.Errorf("Authorization header = %q, want %q", got, "token secret-pat-do-not-log")
-	}
-}
-
-// TestCacheKey_Stable — the helper must be deterministic and incorporate
-// each input.
-func TestCacheKey_Stable(t *testing.T) {
-	a := CacheKey("claude-code", "abc", "https://git/")
-	b := CacheKey("claude-code", "abc", "https://git/")
-	if a != b {
-		t.Errorf("CacheKey is non-deterministic: %q vs %q", a, b)
-	}
-	if a == CacheKey("claude-code", "def", "https://git/") {
-		t.Errorf("CacheKey ignores sha")
-	}
-	if a == CacheKey("hermes", "abc", "https://git/") {
-		t.Errorf("CacheKey ignores runtime")
-	}
-}
-
-// TestRedactedRepoURL_NoToken — a repo URL with no embedded credential
-// is unmodified.
-func TestRedactedRepoURL_NoToken(t *testing.T) {
-	opts := &LocalBuildOptions{RepoPrefix: "https://git.example.com/org/template-"}
-	got := redactedRepoURL(opts, "claude-code")
-	want := "https://git.example.com/org/template-claude-code"
-	if got != want {
-		t.Errorf("got %q, want %q", got, want)
-	}
-}
-
-// TestRepoURL_AppendsRuntime — the prefix + runtime composer is stable.
-func TestRepoURL_AppendsRuntime(t *testing.T) {
-	opts := &LocalBuildOptions{RepoPrefix: "https://git.example.com/org/template-"}
-	got := repoURL(opts, "claude-code")
-	if got != "https://git.example.com/org/template-claude-code" {
-		t.Errorf("got %q", got)
-	}
-}
-
-// TestNewDefaultLocalBuildOptions_RespectsEnvOverrides — the env var
-// overrides documented in the runbook actually take effect.
-func TestNewDefaultLocalBuildOptions_RespectsEnvOverrides(t *testing.T) {
-	t.Setenv("MOLECULE_LOCAL_BUILD_CACHE", "/var/tmp/molecule-test")
-	t.Setenv("MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX", "https://my.fork/org/tpl-")
-	t.Setenv("MOLECULE_GITEA_TOKEN", "tok-from-env")
-
-	opts := newDefaultLocalBuildOptions()
-	if opts.CacheDir != "/var/tmp/molecule-test" {
-		t.Errorf("CacheDir = %q", opts.CacheDir)
-	}
-	if opts.RepoPrefix != "https://my.fork/org/tpl-" {
-		t.Errorf("RepoPrefix = %q", opts.RepoPrefix)
-	}
-	if opts.Token != "tok-from-env" {
-		t.Errorf("Token = %q", opts.Token)
-	}
-	if opts.Platform != "linux/amd64" {
-		t.Errorf("Platform = %q, want linux/amd64", opts.Platform)
-	}
-}
-
-// TestNewDefaultLocalBuildOptions_DefaultCacheDir — XDG-compliant
-// fallback when nothing is overridden.
-func TestNewDefaultLocalBuildOptions_DefaultCacheDir(t *testing.T) {
-	t.Setenv("MOLECULE_LOCAL_BUILD_CACHE", "")
-	t.Setenv("XDG_CACHE_HOME", "")
-	t.Setenv("MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX", "")
-
-	opts := newDefaultLocalBuildOptions()
-	if !strings.Contains(opts.CacheDir, ".cache") && !strings.Contains(opts.CacheDir, "molecule") {
-		t.Errorf("CacheDir = %q, want one under .cache/molecule", opts.CacheDir)
-	}
-	if opts.RepoPrefix != gitTemplateRepoPrefix {
-		t.Errorf("RepoPrefix = %q, want default %q", opts.RepoPrefix, gitTemplateRepoPrefix)
-	}
-}
-
-// TestEnsureLocalImage_ShortSha — a remote that returns a too-short
-// sha is rejected (defence against a misbehaving Gitea proxy).
-func TestEnsureLocalImage_ShortSha(t *testing.T) {
-	opts := makeTestOpts(t)
-	opts.remoteHeadSha = func(ctx context.Context, opts *LocalBuildOptions, runtime string) (string, error) {
-		return "abc", nil
-	}
-	_, err := ensureLocalImageWithOpts(context.Background(), "claude-code", opts)
-	if err == nil {
-		t.Fatalf("expected error for short sha")
-	}
-	if !strings.Contains(err.Error(), "short sha") {
-		t.Errorf("error = %v, want short-sha message", err)
-	}
-}
-
-// TestEnsureLocalImage_StaleCacheDirCleaned — a partial clone left over
-// from a previous failed run must not poison the next attempt.
-func TestEnsureLocalImage_StaleCacheDirCleaned(t *testing.T) {
-	opts := makeTestOpts(t)
-	// Pre-create a stale dir at the cache target (containing a stale marker file).
-	staleDir := filepath.Join(opts.CacheDir, "claude-code", "abcdef012345")
-	if err := os.MkdirAll(staleDir, 0o755); err != nil {
-		t.Fatalf("setup: %v", err)
-	}
-	if err := os.WriteFile(filepath.Join(staleDir, "stale-marker"), []byte("delete me"), 0o644); err != nil {
-		t.Fatalf("setup: %v", err)
-	}
-	if _, err := ensureLocalImageWithOpts(context.Background(), "claude-code", opts); err != nil {
-		t.Fatalf("err = %v", err)
-	}
-	if _, err := os.Stat(filepath.Join(staleDir, "stale-marker")); !os.IsNotExist(err) {
-		t.Errorf("stale-marker should have been wiped before re-clone (err=%v)", err)
-	}
-	// Dockerfile from the new clone should be present.
-	if _, err := os.Stat(filepath.Join(staleDir, "Dockerfile")); err != nil {
-		t.Errorf("expected Dockerfile from re-clone, got err=%v", err)
-	}
-}
-
-// TestEnsureLocalImage_ContextCancelled — context cancellation
-// propagates through the HEAD-lookup seam (best-effort: the test only
-// asserts that an already-cancelled context yields an error).
-func TestEnsureLocalImage_ContextCancelled(t *testing.T) {
-	ctx, cancel := context.WithCancel(context.Background())
-	cancel()
-
-	opts := makeTestOpts(t)
-	opts.remoteHeadSha = func(ctx context.Context, opts *LocalBuildOptions, runtime string) (string, error) {
-		if err := ctx.Err(); err != nil {
-			return "", err
-		}
-		return "deadbeef00000000aaaa1111bbbb2222cccc33334444", nil
-	}
-
-	_, err := ensureLocalImageWithOpts(ctx, "claude-code", opts)
-	if err == nil {
-		t.Fatalf("expected error from cancelled context")
-	}
-}
-
-// TestEnsureLocalImage_RetagAfterCacheHit — a cache-hit must refresh
-// the floating :latest alias so admins inspecting `docker images` see
-// the current SHA.
-func TestEnsureLocalImage_RetagAfterCacheHit(t *testing.T) {
-	opts := makeTestOpts(t)
-	var src, dst string
-	opts.dockerHasTag = func(ctx context.Context, tag string) (bool, error) { return true, nil }
-	opts.dockerTag = func(ctx context.Context, s, d string) error {
-		src, dst = s, d
-		return nil
-	}
-	tag, err := ensureLocalImageWithOpts(context.Background(), "claude-code", opts)
-	if err != nil {
-		t.Fatalf("err = %v", err)
-	}
-	if src != tag {
-		t.Errorf("retag src = %q, want %q", src, tag)
-	}
-	wantDst := "molecule-local/workspace-template-claude-code:latest"
-	if dst != wantDst {
-		t.Errorf("retag dst = %q, want %q", dst, wantDst)
-	}
-}
-
-// TestRemoteHeadShaProd_BodyOverflow — defence against a malicious or
-// misbehaving Gitea returning a multi-MB body.
-func TestRemoteHeadShaProd_BodyOverflow(t *testing.T) {
-	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		w.Header().Set("Content-Type", "application/json")
-		// Stream an unterminated ~64KB body. The reader should cap at 64KB
-		// and yield a parse error rather than OOM.
-		_, _ = w.Write([]byte(`{"commit":{"id":"`))
-		_, _ = w.Write([]byte(strings.Repeat("a", 64<<10))) // 64KB of 'a'
-		// The handler returns here; we never write the closing quote.
-	}))
-	defer srv.Close()
-
-	opts := makeTestOpts(t)
-	opts.RepoPrefix = srv.URL + "/myorg/template-"
-	opts.HTTPClient = srv.Client()
-
-	_, err := remoteHeadShaProd(context.Background(), opts, "claude-code")
-	if err == nil {
-		t.Fatalf("expected error from over-long sha (no closing quote within cap)")
-	}
-}
-
-// TestProvisionerStartUsesLocalBuild_LocalMode — pin the provisioner→
-// local-build wiring at the integration boundary. We don't want a future
-// refactor to silently bypass EnsureLocalImage when registry is unset.
-//
-// This test inspects the mode-decision logic without standing up Docker.
-func TestProvisionerStartUsesLocalBuild_LocalMode(t *testing.T) {
-	t.Setenv("MOLECULE_IMAGE_REGISTRY", "")
-	src := Resolve()
-	if src.Mode != RegistryModeLocal {
-		t.Fatalf("Resolve in unset env = %q, want local", src.Mode)
-	}
-	// The provisioner Start() branches on this same Resolve() call before
-	// reaching ContainerCreate. Pinning the boolean here means a refactor
-	// that flips the sense (e.g. `if src.Mode == RegistryModeSaaS`) is
-	// caught by this test.
-}
-
-// TestEnsureLocalImageHook_DefaultIsRealFunction — pin that the
-// production hook points at EnsureLocalImage. Tests that swap the hook
-// must restore it via t.Cleanup; this test catches a leaked override.
-func TestEnsureLocalImageHook_DefaultIsRealFunction(t *testing.T) {
-	// Sanity: hook is set to a non-nil function. We can't compare
-	// function pointers directly with == in Go (compiler error), so
-	// we exercise it instead — but we don't want to actually clone
-	// from the network in the unit test, so use an unknown runtime
-	// and assert the known-error path runs.
-	_, err := ensureLocalImageHook(context.Background(), "this-runtime-cannot-exist-194")
-	if err == nil {
-		t.Fatalf("expected error from EnsureLocalImage on unknown runtime")
-	}
-	if !strings.Contains(err.Error(), "unknown runtime") {
-		t.Errorf("hook = unexpected function (got error %q, want one mentioning unknown runtime)", err.Error())
-	}
-}
-
-// TestProvisionerStartUsesLocalBuild_SaaSMode — and the symmetric guard:
-// in SaaS-mode, no local-build path runs.
-func TestProvisionerStartUsesLocalBuild_SaaSMode(t *testing.T) {
-	t.Setenv("MOLECULE_IMAGE_REGISTRY", "registry.example.com/molecule-ai")
-	src := Resolve()
-	if src.Mode != RegistryModeSaaS {
-		t.Fatalf("Resolve with registry set = %q, want saas", src.Mode)
-	}
-	if src.Prefix != "registry.example.com/molecule-ai" {
-		t.Fatalf("Prefix = %q", src.Prefix)
-	}
-}
-
-// Keep the fmt import referenced: an unused import is a compile error in Go, not a warning.
-var _ = fmt.Sprintf
@@ -320,26 +320,6 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e

 	image := selectImage(cfg)

-	// Local-build mode (issue #63 / Task #194): when MOLECULE_IMAGE_REGISTRY
-	// is unset, the OSS contributor path skips the registry pull entirely
-	// and instead clones the workspace-template-<runtime> repo from Gitea
-	// + `docker build`s it locally. Replace the placeholder image ref with
-	// the SHA-pinned tag of the freshly-built image before ContainerCreate.
-	//
-	// Pinned overrides (cfg.Image set, e.g. via runtime_image_pins for
-	// production thin-AMI launches) bypass this path — they pin a digest
-	// the operator chose explicitly.
-	if cfg.Image == "" && cfg.Runtime != "" {
-		if src := Resolve(); src.Mode == RegistryModeLocal {
-			builtTag, buildErr := ensureLocalImageHook(ctx, cfg.Runtime)
-			if buildErr != nil {
-				return "", fmt.Errorf("local-build mode: ensure image for runtime %q: %w", cfg.Runtime, buildErr)
-			}
-			image = builtTag
-			log.Printf("Provisioner: local-build mode → using locally-built image %s for runtime %s", image, cfg.Runtime)
-		}
-	}
-
 	containerCfg := &container.Config{
 		Image:  image,
 		Env:    env,
@@ -1093,53 +1073,18 @@ func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool,
 	if p == nil || p.cli == nil {
 		return false, ErrNoBackend
 	}
-	name, err := RunningContainerName(ctx, p.cli, workspaceID)
+	name := ContainerName(workspaceID)
+	info, err := p.cli.ContainerInspect(ctx, name)
 	if err != nil {
+		if isContainerNotFound(err) {
+			return false, nil
+		}
 		// Transient daemon error: caller treats !running as dead + restarts.
 		// Returning true + the underlying error preserves the error for
 		// metrics/logging without triggering the destructive path.
 		return true, err
 	}
-	return name != "", nil
-}
-
-// RunningContainerName returns the container name for workspaceID iff the
-// container exists AND is in the Running state. Single source of truth for
-// "what live container should I exec into for this workspace?" — used by
-// both Provisioner.IsRunning (healthsweep) and the plugins handler.
-//
-// Distinguishes three outcomes so callers can pick their own policy:
-//
-//   - ("ws-<id>", nil): container is running. Caller can exec into it.
-//   - ("",        nil): container does not exist OR exists but is stopped
-//                       (NotFound, Exited, Created, Restarting…). Caller
-//                       should treat as a definitive "not running."
-//   - ("",        err): transient daemon error (timeout, socket EOF, ctx
-//                       cancel). Caller should NOT infer "not running" —
-//                       this could be a flaky daemon under load. Decide
-//                       per-callsite whether to fail soft or hard.
-//
-// Background — molecule-core#10: the plugins handler used to carry its own
-// copy of this inspect logic (`findRunningContainer`) which collapsed
-// transient errors into the same "" return as a genuinely-stopped container.
-// That hid daemon flakes as misleading 503 "container not running" responses
-// AND let the two impls drift on edge-case behavior. This is the SSOT.
-func RunningContainerName(ctx context.Context, cli *client.Client, workspaceID string) (string, error) {
-	if cli == nil {
-		return "", ErrNoBackend
-	}
-	name := ContainerName(workspaceID)
-	info, err := cli.ContainerInspect(ctx, name)
-	if err != nil {
-		if isContainerNotFound(err) {
-			return "", nil
-		}
-		return "", err
-	}
-	if info.State.Running {
-		return name, nil
-	}
-	return "", nil
+	return info.State.Running, nil
 }

 // isContainerNotFound returns true when the Docker client indicates the
@@ -1,96 +0,0 @@
-package provisioner
-
-import "os"
-
-// localImagePrefix is the synthetic registry hostname used for images
-// that the local-build path produces. It is intentionally NOT a real
-// hostname — Docker won't try to pull it from the network (no DNS
-// resolution path), and the workspace-image-refresh / image-watch
-// paths short-circuit on it.
-//
-// Tag scheme: `molecule-local/workspace-template-<runtime>:<tag>` where
-// `<tag>` is either the 12-char Gitea HEAD sha for SHA-pinned references
-// or the moving `:latest` for human inspection (the provisioner
-// consumes the SHA-pinned form via EnsureLocalImage()).
-//
-// Issue #63 / Task #194.
-const localImagePrefix = "molecule-local"
-
-// RegistryMode classifies how the provisioner sources workspace-template
-// container images. The two modes are mutually exclusive and selected
-// by presence/absence of the MOLECULE_IMAGE_REGISTRY env var (Q2 design
-// lock, 2026-05-07): set ⇒ SaaS-mode pull; unset ⇒ local-build mode.
-//
-// Discriminated value rather than a bare string return so every call
-// site that decides on image source has to acknowledge the two modes —
-// a bare string returning `""` on local-mode would silently produce
-// malformed image refs (e.g. `/workspace-template-foo:latest`).
-type RegistryMode string
-
-const (
-	// RegistryModeSaaS — pull workspace-template-* images from a real
-	// container registry whose URL is in `MOLECULE_IMAGE_REGISTRY`.
-	// Used by every prod tenant (env injected via Railway / EC2
-	// user-data) and any self-hosted operator who has mirrored the
-	// images to their own GHCR/ECR/Harbor.
-	RegistryModeSaaS RegistryMode = "saas"
-
-	// RegistryModeLocal — clone the workspace-template-<runtime> repo
-	// from Gitea
-	// (`https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-<runtime>`)
-	// and `docker build` the image locally. Used by OSS contributors
-	// who run `go run ./workspace-server/cmd/server` without setting
-	// MOLECULE_IMAGE_REGISTRY. Closes the post-2026-05-06 GHCR-403 gap
-	// (Task #194 / Issue #63).
-	RegistryModeLocal RegistryMode = "local"
-)
-
-// RegistrySource is the SSOT for image-resolution decisions. Returned
-// by Resolve(); read by:
-//   - the provisioner Start() path — branches on Mode for clone+build
-//     vs pull
-//   - admin_workspace_images.go — skips remote pull in local mode
-//   - imagewatch.Watcher — short-circuits in local mode (no GHCR poll)
-//
-// SaaS-mode .Prefix matches the existing RegistryPrefix() return value;
-// local-mode .Prefix is the synthetic `molecule-local`.
-type RegistrySource struct {
-	Mode   RegistryMode
-	Prefix string
-}
-
-// Resolve inspects the runtime environment and returns the image-source
-// classification. Treats both unset AND empty-string MOLECULE_IMAGE_REGISTRY
-// as "local mode" — an operator who set the var to "" via a misconfigured
-// deploy would otherwise silently get malformed image refs in SaaS-mode;
-// instead they get the local-build path, which fails loudly if the host
-// has no Docker daemon (better blast radius).
-//
-// Mirrors the existing RegistryPrefix() empty-string handling, so the two
-// functions agree on every input.
-func Resolve() RegistrySource {
-	if v := os.Getenv("MOLECULE_IMAGE_REGISTRY"); v != "" {
-		return RegistrySource{Mode: RegistryModeSaaS, Prefix: v}
-	}
-	return RegistrySource{Mode: RegistryModeLocal, Prefix: localImagePrefix}
-}
-
-// IsKnownRuntime reports whether the given runtime name is in the
-// canonical knownRuntimes list. Exposed so the local-build path can
-// refuse to clone arbitrary repo paths supplied via cfg.Runtime —
-// defence-in-depth against a future code path that might let an
-// attacker influence the runtime string before it reaches the build
-// code.
-func IsKnownRuntime(runtime string) bool {
-	for _, r := range knownRuntimes {
-		if r == runtime {
-			return true
-		}
-	}
-	return false
-}
-
-// LocalImagePrefix returns the synthetic registry hostname used by the
-// local-build path. Exposed so handlers that need to branch on "is
-// this a local-built image?" don't have to duplicate the constant.
-func LocalImagePrefix() string { return localImagePrefix }
@@ -1,152 +0,0 @@
-package provisioner
-
-import (
-	"strings"
-	"testing"
-)
-
-// Tests for the new mode-detection surface. The legacy RegistryPrefix()
-// shim is covered by registry_test.go; these tests pin the explicit
-// two-mode discriminated return from Resolve().
-
-// TestResolve_LocalModeWhenRegistryUnset — the OSS-contributor default.
-// Issue #63: with MOLECULE_IMAGE_REGISTRY unset, the provisioner must
-// switch to the local-build path instead of trying to pull from a GHCR
-// org that's been suspended.
-func TestResolve_LocalModeWhenRegistryUnset(t *testing.T) {
-	t.Setenv("MOLECULE_IMAGE_REGISTRY", "")
-	got := Resolve()
-	if got.Mode != RegistryModeLocal {
-		t.Errorf("Mode = %q, want %q (unset registry → local-build)", got.Mode, RegistryModeLocal)
-	}
-	if got.Prefix != localImagePrefix {
-		t.Errorf("Prefix = %q, want %q", got.Prefix, localImagePrefix)
-	}
-}
-
-// TestResolve_SaaSModeWhenRegistrySet — production tenants set the var
-// to their ECR mirror; we must keep producing pull-style image refs.
-func TestResolve_SaaSModeWhenRegistrySet(t *testing.T) {
-	const ecr = "123456789012.dkr.ecr.us-east-2.amazonaws.com/molecule-ai"
-	t.Setenv("MOLECULE_IMAGE_REGISTRY", ecr)
-	got := Resolve()
-	if got.Mode != RegistryModeSaaS {
-		t.Errorf("Mode = %q, want %q (set registry → saas)", got.Mode, RegistryModeSaaS)
-	}
-	if got.Prefix != ecr {
-		t.Errorf("Prefix = %q, want %q", got.Prefix, ecr)
-	}
-}
-
-// TestResolve_EmptyEnvIsLocalMode — operator who set the var to "" via
-// a misconfigured deploy must NOT silently produce malformed image refs;
-// they get the local path which fails loudly if Docker is missing.
-// This contract is the safer-blast-radius half of Issue #63.
-func TestResolve_EmptyEnvIsLocalMode(t *testing.T) {
-	t.Setenv("MOLECULE_IMAGE_REGISTRY", "")
-	if Resolve().Mode != RegistryModeLocal {
-		t.Fatalf("empty MOLECULE_IMAGE_REGISTRY should be local-mode, got %q", Resolve().Mode)
-	}
-}
-
-// TestResolve_GarbageURLStillSaaSMode — a syntactically malformed registry
-// value (e.g. `not-a-url`, `foo bar`) is still treated as SaaS-mode. The whole
-// design of MOLECULE_IMAGE_REGISTRY is "operator-supplied trusted value";
-// validating the URL here would be pretending we can prevent operator
-// error. The downstream docker-pull will fail loudly with a registry-
-// shaped error message, which is the right blast radius.
-func TestResolve_GarbageURLStillSaaSMode(t *testing.T) {
-	for _, garbage := range []string{
-		"not-a-url",
-		"http://",
-		"ghcr.io/",
-		"   ",
-		"\thello\n",
-	} {
-		t.Run(garbage, func(t *testing.T) {
-			t.Setenv("MOLECULE_IMAGE_REGISTRY", garbage)
-			if Resolve().Mode != RegistryModeSaaS {
-				t.Errorf("Mode = %q, want saas (any non-empty value is SaaS-mode by design)", Resolve().Mode)
-			}
-		})
-	}
-}
-
-// TestRegistryPrefix_AlignedWithResolve — the back-compat shim must agree
-// with Resolve().Prefix whenever the registry is set, and diverge when unset.
-func TestRegistryPrefix_AlignedWithResolve(t *testing.T) {
-	cases := []struct {
-		name string
-		env  string
-	}{
-		{"unset", ""},
-		{"ecr", "999999999999.dkr.ecr.us-east-2.amazonaws.com/molecule-ai"},
-		{"harbor", "harbor.example.com/molecule"},
-	}
-	for _, tc := range cases {
-		t.Run(tc.name, func(t *testing.T) {
-			t.Setenv("MOLECULE_IMAGE_REGISTRY", tc.env)
-			gotPrefix := RegistryPrefix()
-			gotResolve := Resolve().Prefix
-			// Note: with the new design, RegistryPrefix() unset returns
-			// the SaaS GHCR default (legacy back-compat) while
-			// Resolve().Prefix returns the local-mode "molecule-local"
-			// hostname. They DIVERGE on the unset path by design — that
-			// divergence is what closes the GHCR-403 hole. Pin both so a
-			// future refactor can't accidentally re-couple them.
-			if tc.env == "" {
-				if gotPrefix != defaultRegistryPrefix {
-					t.Errorf("RegistryPrefix() = %q, want %q (legacy shim)", gotPrefix, defaultRegistryPrefix)
-				}
-				if gotResolve != localImagePrefix {
-					t.Errorf("Resolve().Prefix = %q, want %q (local-build hostname)", gotResolve, localImagePrefix)
-				}
-			} else {
-				if gotPrefix != tc.env {
-					t.Errorf("RegistryPrefix() = %q, want %q", gotPrefix, tc.env)
-				}
-				if gotResolve != tc.env {
-					t.Errorf("Resolve().Prefix = %q, want %q", gotResolve, tc.env)
-				}
-			}
-		})
-	}
-}
-
-// TestIsKnownRuntime — defence-in-depth guard for the local-build path.
-// Must accept every entry in knownRuntimes and reject anything else.
-func TestIsKnownRuntime(t *testing.T) {
-	for _, rt := range knownRuntimes {
-		if !IsKnownRuntime(rt) {
-			t.Errorf("IsKnownRuntime(%q) = false, want true", rt)
-		}
-	}
-	for _, bad := range []string{
-		"", "unknown", "WORKSPACE-TEMPLATE-FAKE", "../../../etc/passwd",
-		"langgraph;rm -rf /", "claude-code\n", " langgraph",
-	} {
-		if IsKnownRuntime(bad) {
-			t.Errorf("IsKnownRuntime(%q) = true, want false (untrusted input)", bad)
-		}
-	}
-}
-
-// TestLocalImagePrefix_Stable — the synthetic prefix is part of the
-// public surface; admin handlers and image-watch use it to short-circuit
-// network calls. Pin the constant.
-func TestLocalImagePrefix_Stable(t *testing.T) {
-	if got := LocalImagePrefix(); got != "molecule-local" {
-		t.Errorf("LocalImagePrefix() = %q, want %q", got, "molecule-local")
-	}
-}
-
-// TestLocalImagePrefix_NoDots — the synthetic hostname must not contain
-// a `.` because Docker's image-ref parser would interpret it as a real
-// DNS-resolvable registry. With no dot, the daemon treats `molecule-local`
-// as the registry hostname only when explicitly tagged that way locally,
-// and never tries to resolve it via DNS for a pull.
-func TestLocalImagePrefix_NoDots(t *testing.T) {
-	if strings.Contains(LocalImagePrefix(), ".") {
-		t.Errorf("LocalImagePrefix() = %q contains '.' — Docker would attempt DNS resolution", LocalImagePrefix())
-	}
-}
--- a/Show More
+++ b/Show More