Merge main into fix/add-missing-provisioner-unit-tests + resolve conflict (keep both TestMigrateVolumeIfNeeded_ExistingTruncatedVolume and TestInternalURL/TestApplyTierResources)

Merge pull request 'fix(chat): client timeout is not "unreachable" — keep thinking state for long agent turns' (#2515 ) from fix/chat-timeout-not-unreachable into main
Merge pull request 'fix(memories): upsert namespace before HTTP commit — fleet-wide memory-write outage' (#2517 ) from fix/memories-http-upsert-namespace into main
2026-06-10 11:32:32 +00:00 · 2026-06-10 08:38:32 +00:00 · 2026-06-10 08:38:15 +00:00 · 2026-06-10 07:59:31 +00:00 · 2026-06-10 00:43:13 -07:00 · 2026-06-10 07:42:23 +00:00
48 changed files with 3117 additions and 195 deletions
@@ -24,6 +24,17 @@ Three failure classes:
  F3  (B) and (C) are not set-equal. Audit env wider than protection
      → audit flags non-force-merges as force; narrower → real
      force-merges are missed.
+  F4  Context in (B) is emitted by NO workflow in .gitea/workflows/ at
+      all (repo-wide, case-correct generalization of F2, which only
+      covers `ci / `-prefixed names). This is the inverse-of-F2 hole and
+      the one that makes the `CI / all-required` aggregator's
+      name-vs-coverage gap safe: `all-required` is fail-closed over CI's
+      OWN jobs but CANNOT cover sibling required workflows
+      (`E2E API Smoke Test`, `Handlers Postgres Integration` — Gitea has
+      no cross-workflow `needs:`). F4 verifies each cross-workflow
+      required context still has a live emitter, so a renamed/deleted
+      sibling workflow that BP still requires is caught instead of
+      degrading to a silent absent-as-pending advisory gate.

 Idempotency:
  Searches OPEN issues by exact title prefix
@@ -380,6 +391,68 @@ def expected_context(job_key: str, workflow_name: str = "ci") -> str:
    return f"{workflow_name} / {job_key} (pull_request)"


+def workflow_emitted_contexts(wf_doc: dict) -> set[str]:
+    """The set of `pull_request` status-check contexts a SINGLE workflow
+    emits, computed from its real `name:` + each job's `name or key`.
+
+    Gitea reports a context as `{workflow.name} / {job.name|job.key}
+    (pull_request)`. Unlike `expected_context()` (which hard-codes the
+    lowercase literal `ci` and the bare job-KEY — a shape that does NOT
+    match this repo, whose workflow is `name: CI` and whose CI jobs DO
+    set per-job `name:`), this reads the authoritative names straight
+    from the parsed YAML, so the contexts it produces are byte-equal to
+    what BP records. Used by F4 (cross-workflow emitter existence).
+
+    Jobs whose `if:` gates on `github.event_name`/`github.ref` are still
+    emitters on the events they DO run — they remain in the set; F4 only
+    asserts *existence of an emitter*, never that it ran on a given
+    trigger."""
+    name = wf_doc.get("name")
+    if not isinstance(name, str) or not name:
+        return set()
+    jobs = wf_doc.get("jobs")
+    if not isinstance(jobs, dict):
+        return set()
+    out: set[str] = set()
+    for key, spec in jobs.items():
+        job_name = key
+        if isinstance(spec, dict) and isinstance(spec.get("name"), str) and spec["name"]:
+            job_name = spec["name"]
+        out.add(f"{name} / {job_name} (pull_request)")
+    return out
+
+
+def all_emitted_contexts(workflows_dir: str = ".gitea/workflows") -> set[str]:
+    """Union of `pull_request` contexts emitted by EVERY workflow in the
+    repo. F4 uses this to assert that each BP-required
+    `status_check_contexts` entry corresponds to a real emitting
+    workflow+job — closing the inverse-of-F2 hole where BP requires a
+    context that NO workflow produces (e.g. a sibling workflow like
+    `E2E API Smoke Test` or `Handlers Postgres Integration` was renamed
+    or deleted while still required, leaving BP demanding a green it can
+    never receive; Gitea treats absent-as-pending → silent advisory
+    gate). This is what makes the misleadingly-named `CI / all-required`
+    aggregator safe at the repo level: it only covers CI's own jobs, but
+    F4 guarantees the cross-workflow required contexts it CANNOT cover
+    are real and present."""
+    import glob as _glob
+
+    emitted: set[str] = set()
+    for path in sorted(_glob.glob(os.path.join(workflows_dir, "*.yml"))):
+        try:
+            with open(path, encoding="utf-8") as f:
+                doc = yaml.safe_load(f)
+        except (OSError, yaml.YAMLError):
+            # A single unparseable sibling workflow must not blind F4 to
+            # the rest. Skip it loudly; lint-workflow-yaml gates parse
+            # validity separately.
+            sys.stderr.write(f"::warning::F4: could not parse {path}, skipping\n")
+            continue
+        if isinstance(doc, dict):
+            emitted |= workflow_emitted_contexts(doc)
+    return emitted
+
+
 # --------------------------------------------------------------------------
 # Drift detection
 # --------------------------------------------------------------------------
@@ -531,6 +604,36 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
            + "\n".join(f"  - {c}" for c in stale_protection)
        )

+    # ----- F4: cross-workflow required context has no emitting workflow -----
+    # F2 (above) is scoped to `ci / `-prefixed contexts ONLY, and built
+    # from the hard-coded lowercase literal `ci` + bare job-keys — a shape
+    # that does NOT match this repo (workflow is `name: CI`, jobs set their
+    # own `name:`), so F2 is effectively dormant here. F4 is the
+    # case-correct, REPO-WIDE generalization: it parses every workflow's
+    # real `name:` + job `name|key` and asserts that EVERY BP-required
+    # context is actually emitted by some workflow.
+    #
+    # This is the gate that makes the `CI / all-required` aggregator's
+    # name-vs-coverage gap safe. `all-required` is fail-closed over CI's
+    # OWN jobs but — by Gitea's design (no cross-workflow `needs:`) — it
+    # CANNOT and does not cover sibling required workflows
+    # (`E2E API Smoke Test`, `Handlers Postgres Integration`). Those MUST
+    # be listed in BP independently. F4 verifies each such BP context
+    # still has a live emitter, so the inverse-of-F2 hole — BP requires a
+    # context that no workflow produces (rename/delete a sibling workflow
+    # while still required → Gitea treats absent-as-pending → silent
+    # advisory gate, and a red PR can look mergeable) — is caught.
+    repo_emitted = all_emitted_contexts(os.path.dirname(CI_WORKFLOW_PATH))
+    unemitted = sorted(c for c in contexts if c not in repo_emitted)
+    if unemitted:
+        findings.append(
+            "F4 — branch_protections/{br}.status_check_contexts entries that "
+            "NO workflow in .gitea/workflows/ emits "
+            "(stale required name → silent advisory gate; a red PR can look "
+            "mergeable):\n".format(br=branch)
+            + "\n".join(f"  - {c}" for c in unemitted)
+        )
+
    # ----- F3: audit env vs protection contexts (set-equal) -----
    only_in_env = sorted(env_set - contexts)
    only_in_protection = sorted(contexts - env_set)
@@ -556,6 +659,7 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
        "protection_contexts": sorted(contexts),
        "audit_env_checks": sorted(env_set),
        "expected_contexts": sorted(emitted_contexts),
+        "repo_emitted_contexts": sorted(repo_emitted),
    }
    return findings, debug

@@ -210,6 +210,60 @@ REQUIRED_APPROVALS_DEFAULT = int(_env("REQUIRED_APPROVALS", default="2") or "2")
 OWNER, NAME = (REPO.split("/", 1) + [""])[:2] if REPO else ("", "")
 API = f"https://{GITEA_HOST}/api/v1" if GITEA_HOST else ""

+# --------------------------------------------------------------------------
+# Conductor snapshot (operator-config#158)
+# --------------------------------------------------------------------------
+# When the conductor tick writes a state snapshot before running the passes,
+# both scripts see the SAME observed state instead of re-fetching independently
+# and potentially disagreeing within the same tick.
+# --------------------------------------------------------------------------
+
+
+def load_conductor_snapshot() -> dict | None:
+    """Load the conductor snapshot if present and fresh.
+
+    The snapshot is written by the conductor wrapper
+    (bin/molecule-core-cron-bot.sh conductor) to a path exported as
+    CONDUCTOR_SNAPSHOT_FILE. It contains open PRs + per-head combined
+    statuses + reviews captured in a single state-read.
+
+    Returns the parsed snapshot dict, or None if absent, unreadable,
+    or older than the freshness threshold (10 minutes — twice the */5
+    conductor cadence, so a single skipped tick does not invalidate it).
+    """
+    path = os.environ.get("CONDUCTOR_SNAPSHOT_FILE", "")
+    if not path:
+        return None
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            snapshot = json.load(f)
+    except (OSError, json.JSONDecodeError) as exc:
+        print(f"::notice::conductor snapshot unreadable ({exc}); self-fetching")
+        return None
+
+    if not isinstance(snapshot, dict):
+        return None
+
+    ts_str = snapshot.get("ts", "")
+    if ts_str:
+        try:
+            from datetime import datetime, timezone
+
+            ts = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%SZ").replace(
+                tzinfo=timezone.utc
+            )
+            age_sec = (datetime.now(timezone.utc) - ts).total_seconds()
+            if age_sec > 600:  # 10 minutes
+                print(
+                    f"::notice::conductor snapshot stale ({int(age_sec)}s); "
+                    "self-fetching"
+                )
+                return None
+        except ValueError:
+            pass  # malformed ts, treat as fresh (conservative)
+
+    return snapshot
+

 class ApiError(RuntimeError):
    pass
@@ -711,19 +765,36 @@ def get_branch_head(branch: str) -> str:
    return sha


+def _snapshot_status_for_sha(sha: str) -> dict | None:
+    """Return a Gitea-shaped combined-status dict from the conductor snapshot
+    if the SHA matches an open PR head, else None."""
+    snapshot = load_conductor_snapshot()
+    if snapshot is None:
+        return None
+    for pr in (snapshot.get("prs") or []):
+        if pr.get("head_sha") == sha:
+            statuses = pr.get("statuses") or []
+            return {
+                "state": pr.get("combined_state", "unknown"),
+                "statuses": [
+                    {"context": s.get("context"), "status": s.get("status")}
+                    for s in statuses
+                    if isinstance(s, dict)
+                ],
+            }
+    return None
+
+
 def get_combined_status(sha: str) -> dict:
    """Combined status + all individual statuses for `sha`.

-    The /status endpoint caps the `statuses` array at 30 entries (Gitea
-    default page size), so we fetch the full list via /statuses. The combined
-    `state` still comes from /status.
-
-    Fail-closed: BOTH the PRIMARY /status fetch AND the SECONDARY /statuses
-    enrichment must succeed. If either raises, the error propagates so the
-    caller skips this PR this tick (we never treat a failed status fetch as
-    green — dev-sop "no fail-open"). A paginated /statuses error must NOT
-    silently degrade to an incomplete status set.
+    Uses the conductor snapshot when available (same tick, same observed
+    state as the merge-queue pass), otherwise self-fetches via API.
    """
+    snapshot_status = _snapshot_status_for_sha(sha)
+    if snapshot_status is not None:
+        return snapshot_status
+
    _, combined = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
    if not isinstance(combined, dict):
        raise ApiError(f"status for {sha} response not object")
@@ -747,7 +818,27 @@ def get_combined_status(sha: str) -> dict:
    return combined


+def _snapshot_pr_to_issue(pr_entry: dict) -> dict:
+    """Normalise a conductor-snapshot PR entry into the shape the queue
+    expects from /issues (number, title, labels, pull_request sub-dict)."""
+    return {
+        "number": pr_entry.get("number"),
+        "title": pr_entry.get("title"),
+        "labels": [{"name": n} for n in (pr_entry.get("labels") or [])],
+        "pull_request": {"draft": False},
+        "created_at": "",
+    }
+
+
 def list_queued_issues() -> list[dict]:
+    snapshot = load_conductor_snapshot()
+    if snapshot is not None:
+        prs = snapshot.get("prs") or []
+        return [
+            _snapshot_pr_to_issue(p)
+            for p in prs
+            if QUEUE_LABEL in (p.get("labels") or [])
+        ]
    return api_paginated(
        "GET",
        f"/repos/{OWNER}/{NAME}/issues",
@@ -768,6 +859,10 @@ def list_candidate_issues(*, auto_discover: bool) -> list[dict]:
    back to the legacy label-filtered listing (opt-IN). Opt-out filtering and
    draft-skipping happen in choose_next_candidate_issue, not here.
    """
+    snapshot = load_conductor_snapshot()
+    if snapshot is not None:
+        prs = snapshot.get("prs") or []
+        return [_snapshot_pr_to_issue(p) for p in prs]
    if not auto_discover:
        return list_queued_issues()
    return api_paginated(
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Drift-prevention guard: SEV #2499 class (KI-013 container/volume naming).
+#
+# KI-013 removed 12-char UUID truncation from container/volume names.
+# E2E scripts must use FULL workspace IDs when referencing containers
+# and volumes. Any :0:12 substring-match truncation is a regression risk.
+#
+# Scans ALL .sh files under tests/e2e/ (including lib/ and subdirs).
+# Run: bash .gitea/scripts/lint-e2e-ki013-container-names.sh
+set -euo pipefail
+
+PAT=':0:12([^0-9]|$)'
+ERR=0
+
+# Use find to recurse into tests/e2e subdirs (lib/, cron/, etc.)
+while IFS= read -r -d '' f; do
+  MATCHES=$(grep -nE "$PAT" "$f" 2>/dev/null || true)
+  if [ -n "$MATCHES" ]; then
+    echo "::error::SEV-2499 drift guard: truncated workspace ID (:0:12) in E2E script"
+    echo "::error::file=$f"
+    echo "$MATCHES" | while read -r line; do
+      echo "::error::  $line"
+    done
+    ERR=1
+  fi
+done < <(find tests/e2e -type f -name '*.sh' -print0)
+
+if [ "$ERR" -ne 0 ]; then
+  echo ""
+  echo "FAIL: E2E scripts use 12-char truncated IDs (:0:12)."
+  echo "      KI-013 requires FULL workspace IDs. Update the flagged lines."
+  exit 1
+fi
+
+echo "PASS: No truncated workspace IDs in E2E scripts."
@@ -66,6 +66,14 @@ def build_plan(env: dict[str, str]) -> dict:
        "target_tag": target_tag,
        "soak_seconds": _int_env(env, "PROD_AUTO_DEPLOY_SOAK_SECONDS", 60, minimum=0),
        "batch_size": _int_env(env, "PROD_AUTO_DEPLOY_BATCH_SIZE", 3),
+        # Tolerate a small minority of individually-stuck tenants (e.g. a wedged
+        # data volume that won't recreate). They are QUARANTINED — shipped past
+        # so the healthy majority still lands the build — and reported for
+        # separate recovery, instead of one stuck tenant blocking the whole
+        # fleet deploy. The canary still must pass, the CP halts a batch the
+        # moment failures exceed this, and the cross-batch coverage gate below
+        # enforces the same tolerance globally. Default 1.
+        "max_stragglers": _int_env(env, "PROD_AUTO_DEPLOY_MAX_STRAGGLERS", 1, minimum=0),
        "dry_run": truthy_flag(env.get("PROD_AUTO_DEPLOY_DRY_RUN", "")),
        # confirm:true ack required by CP /cp/admin/tenants/redeploy-fleet
        # contract (cp#228 / task #308) for fleet-wide intent. Empty body
@@ -251,26 +259,41 @@ def rollout_stragglers(enumerated: list[str], results: list[dict]) -> list[str]:
    return sorted(s for s in dict.fromkeys(enumerated) if s not in verified)


-def assert_full_coverage(enumerated: list[str], aggregate: dict, dry_run: bool) -> None:
-    """Fail the rollout if any enumerated tenant is not on the target build.
+def assert_full_coverage(
+    enumerated: list[str], aggregate: dict, dry_run: bool, max_stragglers: int = 0
+) -> None:
+    """Gate the rollout on coverage, tolerating a quarantined straggler minority.

-    This is the no-silent-skip gate (internal#724). A dry run proves
-    nothing landed, so coverage is not asserted for it.
+    This is the no-silent-skip gate (internal#724) made resilient: every
+    enumerated tenant must be PROVEN on the target build, EXCEPT up to
+    ``max_stragglers`` individually-stuck tenants which are quarantined (shipped
+    past) and reported for separate recovery instead of blocking the whole
+    fleet deploy. Exceeding the tolerance is a systemic failure → RolloutFailed.
+    A dry run proves nothing landed, so coverage is not asserted for it.
    """

    if dry_run:
        return
    stragglers = rollout_stragglers(enumerated, aggregate.get("results") or [])
-    if stragglers:
+    if not stragglers:
+        return
+    # Surface the stragglers (for the step summary + recovery), gate or not.
+    aggregate["stragglers"] = stragglers
+    if len(stragglers) > max_stragglers:
        msg = (
            f"incomplete rollout: {len(stragglers)} tenant(s) not verified on target "
-            f"after redeploy-fleet: {', '.join(stragglers)} "
+            f"after redeploy-fleet (max tolerated {max_stragglers}): {', '.join(stragglers)} "
            f"(enumerated {len(set(enumerated))})"
        )
        aggregate["ok"] = False
        aggregate["error"] = msg
-        aggregate["stragglers"] = stragglers
        raise RolloutFailed(msg, aggregate)
+    # Within tolerance: shipped to the healthy majority; quarantine is loud,
+    # not fatal. The deploy succeeds; the stragglers need individual recovery.
+    print(
+        f"::warning::quarantined {len(stragglers)} straggler(s) (<= max {max_stragglers}); "
+        f"shipped to the rest of the fleet — these need recovery: {', '.join(stragglers)}"
+    )


 def execute_scoped_rollout(
@@ -325,7 +348,8 @@ def execute_scoped_rollout(
    # or one enumerated but never batched, is a straggler. Surfacing it as
    # a RolloutFailed makes the deploy step exit non-zero instead of
    # silently reporting success (the exact agents-team failure mode).
-    assert_full_coverage(all_slugs, aggregate, dry_run)
+    max_stragglers = int(base_body.get("max_stragglers") or 0)
+    assert_full_coverage(all_slugs, aggregate, dry_run, max_stragglers)

    return aggregate

@@ -351,7 +351,8 @@ def compute_ack_state(
            latest_directive[(user, slug)] = kind

    # Step 2: build candidate ackers per slug.
-    # Filter out self-acks and unknown slugs.
+    # Filter out self-acks and unknown slugs. Author self-ack is forbidden
+    # per .gitea/sop-checklist-config.yaml — a non-author peer must ack.
    ackers_per_slug: dict[str, list[str]] = {s: [] for s in items_by_slug}
    rejected_self: dict[str, list[str]] = {s: [] for s in items_by_slug}
    pending_team_check: dict[str, list[str]] = {s: [] for s in items_by_slug}
@@ -154,6 +154,54 @@ CANCELLED_DESCRIPTION = "Has been cancelled"
 PUSH_SUFFIX = " (push)"
 PULL_REQUEST_SUFFIX = " (pull_request)"

+# --------------------------------------------------------------------------
+# Conductor snapshot (operator-config#158)
+# --------------------------------------------------------------------------
+# When the conductor tick writes a state snapshot before running the passes,
+# both scripts see the SAME observed state instead of re-fetching independently
+# and potentially disagreeing within the same tick.
+# --------------------------------------------------------------------------
+
+
+def load_conductor_snapshot() -> dict | None:
+    """Load the conductor snapshot if present and fresh.
+
+    Returns the parsed snapshot dict, or None if absent, unreadable,
+    or older than the freshness threshold (10 minutes).
+    """
+    path = os.environ.get("CONDUCTOR_SNAPSHOT_FILE", "")
+    if not path:
+        return None
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            snapshot = json.load(f)
+    except (OSError, json.JSONDecodeError) as exc:
+        print(f"::notice::conductor snapshot unreadable ({exc}); self-fetching")
+        return None
+
+    if not isinstance(snapshot, dict):
+        return None
+
+    ts_str = snapshot.get("ts", "")
+    if ts_str:
+        try:
+            from datetime import datetime, timezone
+
+            ts = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%SZ").replace(
+                tzinfo=timezone.utc
+            )
+            age_sec = (datetime.now(timezone.utc) - ts).total_seconds()
+            if age_sec > 600:  # 10 minutes
+                print(
+                    f"::notice::conductor snapshot stale ({int(age_sec)}s); "
+                    "self-fetching"
+                )
+                return None
+        except ValueError:
+            pass
+
+    return snapshot
+

 def _require_runtime_env() -> None:
    """Enforce env contract — called from `main()` only.
@@ -382,8 +430,23 @@ def get_combined_status(sha: str) -> dict:
          ],
          ...
        }
+    Uses the conductor snapshot when the SHA matches an open PR head,
+    otherwise self-fetches via API.
    Raises ApiError on non-2xx.
    """
+    snapshot = load_conductor_snapshot()
+    if snapshot is not None:
+        for pr in (snapshot.get("prs") or []):
+            if pr.get("head_sha") == sha:
+                statuses = pr.get("statuses") or []
+                return {
+                    "state": pr.get("combined_state", "unknown"),
+                    "statuses": [
+                        {"context": s.get("context"), "state": s.get("status")}
+                        for s in statuses
+                        if isinstance(s, dict)
+                    ],
+                }
    _, body = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
    if not isinstance(body, dict):
        raise ApiError(f"status for {sha} response not a JSON object")
@@ -275,3 +275,125 @@ def test_detect_drift_no_f1_when_needs_empty_even_with_jobs():
            findings, _ = drift.detect_drift("main")

    assert not any("F1 —" in f for f in findings)
+
+
+# ---------------------------------------------------------------------------
+# F4 — cross-workflow required-context emitter existence
+# (closes the `CI / all-required` name-vs-coverage hole: the sentinel is
+#  fail-closed over CI's own jobs but CANNOT cover sibling required
+#  workflows — Gitea has no cross-workflow `needs:` — so F4 guarantees each
+#  BP-required context still has a live emitting workflow.)
+# ---------------------------------------------------------------------------
+
+def test_workflow_emitted_contexts_uses_job_name_over_key():
+    """Job `name:` wins over key; missing name falls back to key."""
+    doc = {
+        "name": "E2E API Smoke Test",
+        "jobs": {
+            "detect-changes": {},  # no name -> key
+            "e2e-api": {"name": "E2E API Smoke Test"},
+        },
+    }
+    got = drift.workflow_emitted_contexts(doc)
+    assert got == {
+        "E2E API Smoke Test / detect-changes (pull_request)",
+        "E2E API Smoke Test / E2E API Smoke Test (pull_request)",
+    }
+
+
+def test_workflow_emitted_contexts_empty_when_no_name():
+    """A workflow with no top-level `name:` emits nothing F4 can match."""
+    assert drift.workflow_emitted_contexts({"jobs": {"x": {}}}) == set()
+
+
+def test_all_emitted_contexts_unions_workflow_dir(tmp_path):
+    """all_emitted_contexts globs *.yml and unions their emitter sets."""
+    wf = tmp_path / "wf"
+    wf.mkdir()
+    (wf / "a.yml").write_text(
+        "name: CI\njobs:\n  all-required:\n    runs-on: x\n", encoding="utf-8"
+    )
+    (wf / "b.yml").write_text(
+        "name: Handlers Postgres Integration\n"
+        "jobs:\n  integration:\n    name: Handlers Postgres Integration\n"
+        "    runs-on: x\n",
+        encoding="utf-8",
+    )
+    got = drift.all_emitted_contexts(str(wf))
+    assert "CI / all-required (pull_request)" in got
+    assert "Handlers Postgres Integration / Handlers Postgres Integration (pull_request)" in got
+
+
+def test_all_emitted_contexts_skips_unparseable(tmp_path):
+    """A single broken sibling workflow must not blind F4 to the rest."""
+    wf = tmp_path / "wf"
+    wf.mkdir()
+    (wf / "good.yml").write_text("name: CI\njobs:\n  j:\n    runs-on: x\n", encoding="utf-8")
+    (wf / "bad.yml").write_text("name: [unterminated\n  : : :\n", encoding="utf-8")
+    got = drift.all_emitted_contexts(str(wf))
+    assert "CI / j (pull_request)" in got
+
+
+# A BP fixture that includes the two cross-workflow required contexts.
+_BP_WITH_SIBLINGS = {
+    "status_check_contexts": [
+        "CI / all-required (pull_request)",
+        "E2E API Smoke Test / E2E API Smoke Test (pull_request)",
+        "Handlers Postgres Integration / Handlers Postgres Integration (pull_request)",
+    ]
+}
+
+# The matching set of repo-wide emitted contexts (what a correct repo produces).
+_EMITTED_OK = {
+    "CI / all-required (pull_request)",
+    "E2E API Smoke Test / E2E API Smoke Test (pull_request)",
+    "Handlers Postgres Integration / Handlers Postgres Integration (pull_request)",
+}
+
+
+def test_detect_drift_f4_silent_when_all_contexts_emitted():
+    """No F4 when every BP context has a live emitting workflow."""
+    ci = _make_ci_doc({"all-required": {}})
+    audit = _make_audit_doc(sorted(_BP_WITH_SIBLINGS["status_check_contexts"]))
+    with patch.object(drift, "load_yaml", side_effect=[ci, audit]):
+        with patch.object(drift, "api", return_value=(200, _BP_WITH_SIBLINGS)):
+            with patch.object(drift, "all_emitted_contexts", return_value=set(_EMITTED_OK)):
+                findings, debug = drift.detect_drift("main")
+    assert not any("F4 —" in f for f in findings)
+    assert debug["repo_emitted_contexts"] == sorted(_EMITTED_OK)
+
+
+def test_detect_drift_f4_fires_on_stale_cross_workflow_context():
+    """The core gate-hole regression: BP requires a cross-workflow context
+    (e.g. a renamed/deleted sibling workflow) that NO workflow emits.
+    F4 must fire — this is the inverse-of-F2 hole that makes a red PR look
+    mergeable if BP is ever trimmed/renamed around `CI / all-required`."""
+    ci = _make_ci_doc({"all-required": {}})
+    audit = _make_audit_doc(sorted(_BP_WITH_SIBLINGS["status_check_contexts"]))
+    # Handlers workflow got renamed -> its OLD BP context now has no emitter.
+    emitted_after_rename = {
+        "CI / all-required (pull_request)",
+        "E2E API Smoke Test / E2E API Smoke Test (pull_request)",
+        # Handlers context absent (renamed away)
+    }
+    with patch.object(drift, "load_yaml", side_effect=[ci, audit]):
+        with patch.object(drift, "api", return_value=(200, _BP_WITH_SIBLINGS)):
+            with patch.object(drift, "all_emitted_contexts", return_value=emitted_after_rename):
+                findings, _ = drift.detect_drift("main")
+    assert any("F4 —" in f for f in findings)
+    assert any("Handlers Postgres Integration" in f for f in findings)
+
+
+def test_detect_drift_f4_catches_all_required_only_trim():
+    """If BP is trimmed to JUST `CI / all-required` but E2E/Handlers are
+    still real workflows, F4 does NOT fire (no stale context) — but F3b
+    (env vs BP) / operator policy must keep them required. This asserts F4
+    does not false-positive on a correctly-emitted lone context."""
+    bp = {"status_check_contexts": ["CI / all-required (pull_request)"]}
+    ci = _make_ci_doc({"all-required": {}})
+    audit = _make_audit_doc(["CI / all-required (pull_request)"])
+    with patch.object(drift, "load_yaml", side_effect=[ci, audit]):
+        with patch.object(drift, "api", return_value=(200, bp)):
+            with patch.object(drift, "all_emitted_contexts", return_value=set(_EMITTED_OK)):
+                findings, _ = drift.detect_drift("main")
+    assert not any("F4 —" in f for f in findings)
@@ -1,4 +1,4 @@
-import importlib.util
+import importlib
 import sys
 from pathlib import Path

@@ -1778,3 +1778,129 @@ def test_print_post_batch_summary_counts_correctly(capsys):
    assert "PR #1: state=ready" in out
    assert "PR #2: state=waiting" in out
    assert "PR #3: state=ineligible" in out
+
+
+# ---------------------------------------------------------------------------
+# Conductor snapshot consumption (operator-config#158 / molecule-core#2502)
+# ---------------------------------------------------------------------------
+
+import json
+import os
+import tempfile
+
+
+def _make_snapshot(prs, ts="2026-06-10T12:00:00Z"):
+    return {"ts": ts, "repo": "molecule-ai/molecule-core", "prs": prs}
+
+
+def test_list_candidate_issues_uses_snapshot_when_present(monkeypatch):
+    """When CONDUCTOR_SNAPSHOT_FILE is present and fresh, list_candidate_issues
+    returns the snapshot PRs instead of hitting the API."""
+    snapshot = _make_snapshot([
+        {"number": 10, "title": "PR 10", "head_sha": "a" * 40,
+         "labels": ["merge-queue"],
+         "combined_state": "success", "statuses": []},
+        {"number": 20, "title": "PR 20", "head_sha": "b" * 40,
+         "labels": [],
+         "combined_state": "success", "statuses": []},
+    ])
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(snapshot, f)
+        path = f.name
+    try:
+        monkeypatch.setenv("CONDUCTOR_SNAPSHOT_FILE", path)
+        # reload so load_conductor_snapshot sees the env var
+        candidates = mq.list_candidate_issues(auto_discover=True)
+        assert len(candidates) == 2
+        assert [c["number"] for c in candidates] == [10, 20]
+    finally:
+        os.unlink(path)
+
+
+def test_list_queued_issues_uses_snapshot_label_filter(monkeypatch):
+    """list_queued_issues (opt-IN mode) filters the snapshot by QUEUE_LABEL."""
+    snapshot = _make_snapshot([
+        {"number": 11, "title": "Labeled", "head_sha": "a" * 40,
+         "labels": ["merge-queue"], "combined_state": "success", "statuses": []},
+        {"number": 22, "title": "Unlabeled", "head_sha": "b" * 40,
+         "labels": [], "combined_state": "success", "statuses": []},
+    ])
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(snapshot, f)
+        path = f.name
+    try:
+        monkeypatch.setenv("CONDUCTOR_SNAPSHOT_FILE", path)
+        monkeypatch.setattr(mq, "QUEUE_LABEL", "merge-queue")
+        queued = mq.list_queued_issues()
+        assert len(queued) == 1
+        assert queued[0]["number"] == 11
+    finally:
+        os.unlink(path)
+
+
+def test_get_combined_status_uses_snapshot_when_sha_matches(monkeypatch):
+    """get_combined_status returns snapshot data when the SHA is an open PR head."""
+    head_sha = "c" * 40
+    snapshot = _make_snapshot([
+        {"number": 30, "title": "PR 30", "head_sha": head_sha,
+         "labels": [],
+         "combined_state": "failure",
+         "statuses": [
+             {"context": "CI / all-required (pull_request)", "status": "failure"},
+         ]},
+    ])
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(snapshot, f)
+        path = f.name
+    try:
+        monkeypatch.setenv("CONDUCTOR_SNAPSHOT_FILE", path)
+        combined = mq.get_combined_status(head_sha)
+        assert combined["state"] == "failure"
+        assert len(combined["statuses"]) == 1
+        assert combined["statuses"][0]["context"] == "CI / all-required (pull_request)"
+        assert combined["statuses"][0]["status"] == "failure"
+    finally:
+        os.unlink(path)
+
+
+def test_get_combined_status_self_fetches_when_sha_not_in_snapshot(monkeypatch):
+    """If the SHA is not in the snapshot, get_combined_status falls back to API."""
+    snapshot = _make_snapshot([
+        {"number": 40, "head_sha": "d" * 40, "labels": [],
+         "combined_state": "success", "statuses": []},
+    ])
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(snapshot, f)
+        path = f.name
+    try:
+        monkeypatch.setenv("CONDUCTOR_SNAPSHOT_FILE", path)
+        monkeypatch.setattr(mq, "OWNER", "o")
+        monkeypatch.setattr(mq, "NAME", "r")
+
+        def fake_api(method, path, **kw):
+            if path.endswith("/status"):
+                return 200, {"state": "success", "statuses": [{"context": "c1", "status": "success"}]}
+            if path.endswith("/statuses"):
+                return 200, []
+            raise mq.ApiError("unexpected")
+
+        monkeypatch.setattr(mq, "api", fake_api)
+        combined = mq.get_combined_status("e" * 40)
+        assert combined["state"] == "success"
+    finally:
+        os.unlink(path)
+
+
+def test_load_conductor_snapshot_ignores_stale_snapshot(monkeypatch):
+    """A snapshot older than 10 minutes is treated as absent (self-fetch)."""
+    from datetime import datetime, timezone, timedelta
+    old_ts = (datetime.now(timezone.utc) - timedelta(minutes=15)).strftime("%Y-%m-%dT%H:%M:%SZ")
+    snapshot = _make_snapshot([{"number": 50, "head_sha": "f" * 40, "labels": []}], ts=old_ts)
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(snapshot, f)
+        path = f.name
+    try:
+        monkeypatch.setenv("CONDUCTOR_SNAPSHOT_FILE", path)
+        assert mq.load_conductor_snapshot() is None
+    finally:
+        os.unlink(path)
@@ -35,6 +35,9 @@ def test_build_plan_defaults_to_staging_sha_target_and_prod_cp():
        "canary_slug": "hongming",
        "soak_seconds": 60,
        "batch_size": 3,
+        # quarantine up to 1 individually-stuck tenant rather than blocking the
+        # whole fleet deploy (default).
+        "max_stragglers": 1,
        "dry_run": False,
        # cp#228 / task #308: fleet-wide intent must carry confirm:true.
        "confirm": True,
@@ -470,6 +473,72 @@ def test_scoped_rollout_passes_when_all_tenants_verified_on_target():
    assert "stragglers" not in aggregate


+def test_scoped_rollout_quarantines_straggler_within_tolerance():
+    # reno-stars never verifies on target; max_stragglers=1 tolerates it — the
+    # rollout still succeeds (ships to the healthy majority) and reports the
+    # quarantined straggler instead of failing the whole deploy.
+    def fake_redeploy(_cp_url, _token, body):
+        return 200, {
+            "ok": True,
+            "results": [
+                {"slug": s, "verified_on_target": (s != "reno-stars")}
+                for s in body["only_slugs"]
+            ],
+        }
+
+    aggregate = prod.execute_scoped_rollout(
+        {
+            "cp_url": "https://api.moleculesai.app",
+            "body": {
+                "target_tag": "staging-new",
+                "batch_size": 5,
+                "dry_run": False,
+                "confirm": True,
+                "max_stragglers": 1,
+            },
+        },
+        token="secret",
+        list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
+        redeploy=fake_redeploy,
+        sleep=lambda _s: None,
+    )
+    assert aggregate["ok"] is True
+    assert aggregate["stragglers"] == ["reno-stars"]
+
+
+def test_scoped_rollout_fails_when_stragglers_exceed_tolerance():
+    # Two tenants never verify; with max_stragglers=1 that is systemic → fail.
+    def fake_redeploy(_cp_url, _token, body):
+        return 200, {
+            "ok": True,
+            "results": [
+                {"slug": s, "verified_on_target": (s == "hongming")}
+                for s in body["only_slugs"]
+            ],
+        }
+
+    try:
+        prod.execute_scoped_rollout(
+            {
+                "cp_url": "https://api.moleculesai.app",
+                "body": {
+                    "target_tag": "staging-new",
+                    "batch_size": 5,
+                    "dry_run": False,
+                    "confirm": True,
+                    "max_stragglers": 1,
+                },
+            },
+            token="secret",
+            list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
+            redeploy=fake_redeploy,
+            sleep=lambda _s: None,
+        )
+        raise AssertionError("expected RolloutFailed when stragglers exceed tolerance")
+    except prod.RolloutFailed as exc:
+        assert "max tolerated 1" in str(exc)
+
+
 def test_scoped_rollout_dry_run_does_not_assert_coverage():
    # A dry run proves nothing landed; coverage must NOT be asserted or
    # every plan would fail.
@@ -291,7 +291,8 @@ class TestComputeAckState(unittest.TestCase):
        )
        self.assertEqual(state["comprehensive-testing"]["ackers"], ["bob"])

-    def test_self_ack_rejected(self):
+    def test_self_ack_rejected_when_author_in_team(self):
+        # Author self-acks are forbidden — a non-author peer must ack.
        comments = [_comment("alice", "/sop-ack comprehensive-testing")]
        state = sop.compute_ack_state(
            comments, "alice", self.items, self.aliases, self._approve_all
@@ -722,16 +723,16 @@ class TestRootCauseAckEligibilityWidened(unittest.TestCase):
        )
        self.assertEqual(state["root-cause"]["ackers"], ["hongming"])

-    def test_self_ack_still_forbidden_even_with_widened_eligibility(self):
-        # Author cannot self-ack — widening teams must NOT weaken
-        # the non-author rule.
+    def test_self_ack_rejected_with_widened_eligibility(self):
+        # Author self-acks are forbidden even when the author is in the
+        # required team — a non-author peer must ack.
        comments = [_comment("alice", "/sop-ack root-cause")]
        probe = self._approve_only({"alice"})
        state = sop.compute_ack_state(
            comments, "alice", self.items, self.aliases, probe, high_risk=False
        )
        self.assertEqual(state["root-cause"]["ackers"], [])
-        self.assertIn("alice", state["root-cause"]["rejected"]["self_ack"])
+        self.assertEqual(state["root-cause"]["rejected"]["self_ack"], ["alice"])


 class TestHighRiskClassUsesElevatedListInConfig(unittest.TestCase):
@@ -167,3 +167,78 @@ def test_reap_preserves_failed_pr_context_without_push_success(monkeypatch):

    assert counters["preserved_pr_without_push_success"] == 2
    assert posted == []
+
+
+# ---------------------------------------------------------------------------
+# Conductor snapshot consumption (operator-config#158 / molecule-core#2502)
+# ---------------------------------------------------------------------------
+
+import os
+import tempfile
+
+
+def test_get_combined_status_uses_snapshot_when_sha_matches(monkeypatch):
+    """When the SHA is an open PR head in the conductor snapshot, get_combined_status
+    returns the snapshot data instead of calling the API."""
+    mod = load_reaper()
+    head_sha = "a" * 40
+    snapshot = {
+        "ts": "2026-06-10T12:00:00Z",
+        "repo": "molecule-ai/molecule-core",
+        "prs": [
+            {
+                "number": 99,
+                "title": "PR 99",
+                "head_sha": head_sha,
+                "labels": [],
+                "combined_state": "failure",
+                "statuses": [
+                    {"context": "CI / Platform (Go) (push)", "status": "failure"},
+                ],
+            }
+        ],
+    }
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(snapshot, f)
+        path = f.name
+    try:
+        monkeypatch.setenv("CONDUCTOR_SNAPSHOT_FILE", path)
+        import importlib
+        mod = load_reaper()  # reload to pick up env var
+        combined = mod.get_combined_status(head_sha)
+        assert combined["state"] == "failure"
+        assert len(combined["statuses"]) == 1
+        assert combined["statuses"][0]["context"] == "CI / Platform (Go) (push)"
+    finally:
+        os.unlink(path)
+
+
+def test_get_combined_status_self_fetches_when_sha_not_in_snapshot(monkeypatch):
+    """If the SHA is not in the snapshot, get_combined_status falls back to API."""
+    mod = load_reaper()
+    snapshot = {
+        "ts": "2026-06-10T12:00:00Z",
+        "repo": "molecule-ai/molecule-core",
+        "prs": [
+            {"number": 1, "head_sha": "b" * 40, "labels": [],
+             "combined_state": "success", "statuses": []},
+        ],
+    }
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(snapshot, f)
+        path = f.name
+    try:
+        monkeypatch.setenv("CONDUCTOR_SNAPSHOT_FILE", path)
+        import importlib
+        mod = load_reaper()
+
+        def fake_api(method, path, **kw):
+            if path.endswith("/status"):
+                return 200, {"state": "success", "statuses": []}
+            raise mod.ApiError("unexpected")
+
+        monkeypatch.setattr(mod, "api", fake_api)
+        combined = mod.get_combined_status("c" * 40)
+        assert combined["state"] == "success"
+    finally:
+        os.unlink(path)
@@ -394,6 +394,14 @@ jobs:
          # a revert of the zero-validated→RED logic goes red on every PR.
          bash tests/e2e/test_require_live_priority_gate_unit.sh

+      - if: ${{ needs.changes.outputs.scripts == 'true' }}
+        name: Drift guard — KI-013 container/volume naming (SEV #2499)
+        # KI-013 removed 12-char UUID truncation from container/volume names.
+        # E2E scripts must use FULL workspace IDs. This fail-closed guard
+        # prevents regressions where a new/modified script reintroduces the
+        # old truncated-name pattern (the root cause of SEV #2499).
+        run: bash .gitea/scripts/lint-e2e-ki013-container-names.sh
+
      - if: ${{ needs.changes.outputs.scripts == 'true' }}
        name: Test ECR promote-tenant-image script (mock-driven, no live infra)
        # Covers scripts/promote-tenant-image.sh — the codified
@@ -404,6 +412,14 @@ jobs:
        run: |
          bash scripts/test-promote-tenant-image.sh

+      - if: ${{ needs.changes.outputs.scripts == 'true' }}
+        name: Drift guard — KI-013 container/volume naming (SEV #2499)
+        # KI-013 removed 12-char UUID truncation from container/volume names.
+        # E2E scripts must use FULL workspace IDs. This fail-closed guard
+        # prevents regressions where a new/modified script reintroduces the
+        # old truncated-name pattern (the root cause of SEV #2499).
+        run: bash .gitea/scripts/lint-e2e-ki013-container-names.sh
+
      - if: ${{ needs.changes.outputs.scripts == 'true' }}
        name: Shellcheck promote-tenant-image script
        # scripts/ is excluded from the bulk shellcheck pass above (legacy
@@ -500,6 +516,27 @@ jobs:
  all-required:
    # Aggregator sentinel — RFC internal#219 §2 (Phase 4 — closes internal#286).
    #
+    # ── SCOPE (read before trusting the name) ─────────────────────────
+    # "all-required" means "all of THIS workflow's (CI's) required jobs"
+    # — NOT "all of branch-protection's required checks". It is fail-
+    # closed over its `needs:` (the CI jobs below), but Gitea Actions has
+    # NO cross-workflow `needs:`, so this sentinel STRUCTURALLY CANNOT and
+    # does not cover sibling required workflows that live in their own
+    # files — notably:
+    #     • `E2E API Smoke Test`           (.gitea/workflows/e2e-api.yml)
+    #     • `Handlers Postgres Integration`(.gitea/workflows/handlers-postgres-integration.yml)
+    # Those emit their OWN status contexts and MUST be listed in branch
+    # protection `status_check_contexts` INDEPENDENTLY of this sentinel.
+    # They are today; do NOT trim BP down to just `CI / all-required` on
+    # the assumption that it covers them — it does not, and a red E2E /
+    # Handlers run would then look mergeable (observed: core PR #1086 @
+    # 9136d05a — `CI / all-required` green while E2E (id 48) + Handlers
+    # (id 47) were red; not exploitable only because BP still requires
+    # all three). The cross-workflow coverage is enforced separately by
+    # ci-required-drift.py's F4 check (every BP context must have a live
+    # emitting workflow), which is what keeps this name honest.
+    # ──────────────────────────────────────────────────────────────────
+    #
    # Emits `CI / all-required (<event>)` where <event> is the workflow trigger
    # (e.g. `CI / all-required (pull_request)`, `CI / all-required (push)`).
    # Branch protection requires the event-suffixed name —
@@ -165,6 +165,28 @@ jobs:
          cache: 'npm'
          cache-dependency-path: canvas/package-lock.json

+      - name: Sweep stale e2e-chat testcontainers (self-heal prior leaks)
+        if: needs.detect-changes.outputs.chat == 'true'
+        run: |
+          # Prior e2e-chat runs that were cancelled/killed — or whose always()
+          # cleanup hit a wedged docker daemon — leak their pg-/redis-e2e-chat-*
+          # containers, which then pile up on the shared runner host (observed: 13
+          # such containers, up to 2 weeks old, on the operator daemon). Reap any
+          # e2e-chat container older than the job window so leaks self-heal every
+          # run instead of relying on each run's own cleanup succeeding. Age-based
+          # (>2h, well beyond the 15m job) so a CONCURRENT e2e-chat job's fresh
+          # containers are never touched. See controlplane#646.
+          now=$(date -u +%s)
+          docker ps -a --filter name=e2e-chat --format '{{.Names}}' | while read -r c; do
+            [ -n "$c" ] || continue
+            created=$(docker inspect -f '{{.Created}}' "$c" 2>/dev/null) || continue
+            cts=$(date -u -d "$created" +%s 2>/dev/null) || continue
+            if [ $(( now - cts )) -gt 7200 ]; then
+              echo "sweeping stale e2e-chat container $c (created $created)"
+              timeout 30 docker rm -f "$c" >/dev/null 2>&1 || true
+            fi
+          done
+
      - name: Start Postgres (docker)
        if: needs.detect-changes.outputs.chat == 'true'
        run: |
@@ -430,5 +452,7 @@ jobs:
      - name: Stop service containers
        if: always() && needs.detect-changes.outputs.chat == 'true'
        run: |
-          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
-          docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
+          # timeout-wrap so a wedged docker daemon can't hang this always() step
+          # (a hung rm here is one way containers leak in the first place).
+          timeout 30 docker rm -f "$PG_CONTAINER" 2>/dev/null || true
+          timeout 30 docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
@@ -78,6 +78,12 @@ jobs:
      # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
      MOLECULE_ENV: development
      SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
+      # act_runner runs the job inside a Docker container, so /.dockerenv exists
+      # and the platform auto-detects platformInDocker=true. But the job container
+      # is NOT on molecule-core-net, so it cannot resolve workspace container
+      # hostnames (ws-<id>:8000). Force false so the proxy keeps using the
+      # host-mapped 127.0.0.1:<ephemeral_port> URL, which IS reachable.
+      MOLECULE_IN_DOCKER: false
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
@@ -132,7 +138,29 @@ jobs:
          # jobs or stale processes from prior cancelled runs (see #2450).
          PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
          echo "PORT=${PORT}" >> "$GITHUB_ENV"
-          echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
+          echo "BASE=http://127.0.0.1:${PORT}" >> "$GITHUB_ENV"
+          # Discover an IP that Docker containers can use to reach the host platform.
+          # host.docker.internal is not reliably available on Linux (act_runner), so
+          # workspace containers cannot resolve it and fail to register/heartbeat.
+          # Workspace containers join molecule-core-net; the host is reachable via that
+          # network's gateway. Ensure the network exists first (the provisioner creates
+          # it lazily, but we need the gateway BEFORE starting the platform).
+          docker network inspect molecule-core-net >/dev/null 2>&1 || docker network create molecule-core-net >/dev/null
+          # Parse Gateway from raw JSON because --format '{{.IPAM.Config}}' is
+          # inconsistent across Docker versions (sometimes omits Gateway field).
+          PLATFORM_HOST_IP=$(docker network inspect molecule-core-net 2>/dev/null | sed -n 's/.*"Gateway": "\([^"]*\)".*/\1/p' | head -1)
+          if [ -z "$PLATFORM_HOST_IP" ]; then
+            PLATFORM_HOST_IP=$(docker network inspect bridge 2>/dev/null | sed -n 's/.*"Gateway": "\([^"]*\)".*/\1/p' | head -1)
+          fi
+          if [ -z "$PLATFORM_HOST_IP" ]; then
+            PLATFORM_HOST_IP=$(ip route | awk '/default/ {print $3}' | head -1 || true)
+          fi
+          if [ -z "$PLATFORM_HOST_IP" ]; then
+            echo "::error::Could not determine PLATFORM_HOST_IP for Docker containers to reach the platform"
+            exit 1
+          fi
+          echo "PLATFORM_HOST_IP=${PLATFORM_HOST_IP}"
+          echo "PLATFORM_URL=http://${PLATFORM_HOST_IP}:${PORT}" >> "$GITHUB_ENV"
          # Deterministic admin token: the script sends MOLECULE_ADMIN_TOKEN as the
          # bearer; the platform checks ADMIN_TOKEN. Set both to the same value.
          T="lpe2e-admin-${{ github.run_id }}-${{ github.run_attempt }}"
@@ -173,8 +201,10 @@ jobs:
        run: |
          # Bind to the dynamically allocated port (see #2450).
          # DATABASE_URL/REDIS_URL/ADMIN_TOKEN/MOLECULE_ENV are inherited from
-          # $GITHUB_ENV.
-          PORT=$PORT ./platform-server > platform.log 2>&1 &
+          # $GITHUB_ENV. PLATFORM_URL is also passed explicitly because
+          # $GITHUB_ENV propagation can be flaky on act_runner (#2468 RCA).
+          echo "starting platform with PLATFORM_URL=${PLATFORM_URL:-<fallback>} PORT=$PORT BIND_ADDR=0.0.0.0"
+          PORT=$PORT BIND_ADDR=0.0.0.0 PLATFORM_URL="${PLATFORM_URL:-http://host.docker.internal:$PORT}" ./platform-server > platform.log 2>&1 &
          echo $! > platform.pid

      - name: Wait for /health (+ migrations applied)
@@ -198,6 +228,11 @@ jobs:
            sleep 1
          done

+      - name: Verify platform reachable from molecule-core-net
+        run: |
+          echo "Testing platform reachability from molecule-core-net container..."
+          docker run --rm --network molecule-core-net alpine:latest sh -c "wget -qO- http://${PLATFORM_URL#http://}/health" || echo "WARN: platform not reachable from molecule-core-net"
+
      - name: Run local-provision lifecycle E2E (stub — REQUIRED)
        run: bash tests/e2e/test_local_provision_lifecycle_e2e.sh

@@ -205,6 +240,15 @@ jobs:
        if: failure()
        run: cat workspace-server/platform.log || true

+      - name: Dump workspace container logs on failure
+        if: failure()
+        run: |
+          WS_NAME=$(docker ps --filter "name=ws-" --format '{{.Names}}' | head -1 || true)
+          if [ -n "$WS_NAME" ]; then
+            echo "=== Workspace container logs for $WS_NAME ==="
+            docker logs "$WS_NAME" 2>&1 | tail -n 80 || true
+          fi
+
      - name: Stop platform
        if: always()
        run: |
@@ -248,6 +292,12 @@ jobs:
      # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
      MOLECULE_ENV: development
      SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
+      # act_runner runs the job inside a Docker container, so /.dockerenv exists
+      # and the platform auto-detects platformInDocker=true. But the job container
+      # is NOT on molecule-core-net, so it cannot resolve workspace container
+      # hostnames (ws-<id>:8000). Force false so the proxy keeps using the
+      # host-mapped 127.0.0.1:<ephemeral_port> URL, which IS reachable.
+      MOLECULE_IN_DOCKER: false
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
@@ -297,7 +347,29 @@ jobs:
          # jobs or stale processes from prior cancelled runs (see #2450).
          PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
          echo "PORT=${PORT}" >> "$GITHUB_ENV"
-          echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
+          echo "BASE=http://127.0.0.1:${PORT}" >> "$GITHUB_ENV"
+          # Discover an IP that Docker containers can use to reach the host platform.
+          # host.docker.internal is not reliably available on Linux (act_runner), so
+          # workspace containers cannot resolve it and fail to register/heartbeat.
+          # Workspace containers join molecule-core-net; the host is reachable via that
+          # network's gateway. Ensure the network exists first (the provisioner creates
+          # it lazily, but we need the gateway BEFORE starting the platform).
+          docker network inspect molecule-core-net >/dev/null 2>&1 || docker network create molecule-core-net >/dev/null
+          # Parse Gateway from raw JSON because --format '{{.IPAM.Config}}' is
+          # inconsistent across Docker versions (sometimes omits Gateway field).
+          PLATFORM_HOST_IP=$(docker network inspect molecule-core-net 2>/dev/null | sed -n 's/.*"Gateway": "\([^"]*\)".*/\1/p' | head -1)
+          if [ -z "$PLATFORM_HOST_IP" ]; then
+            PLATFORM_HOST_IP=$(docker network inspect bridge 2>/dev/null | sed -n 's/.*"Gateway": "\([^"]*\)".*/\1/p' | head -1)
+          fi
+          if [ -z "$PLATFORM_HOST_IP" ]; then
+            PLATFORM_HOST_IP=$(ip route | awk '/default/ {print $3}' | head -1 || true)
+          fi
+          if [ -z "$PLATFORM_HOST_IP" ]; then
+            echo "::error::Could not determine PLATFORM_HOST_IP for Docker containers to reach the platform"
+            exit 1
+          fi
+          echo "PLATFORM_HOST_IP=${PLATFORM_HOST_IP}"
+          echo "PLATFORM_URL=http://${PLATFORM_HOST_IP}:${PORT}" >> "$GITHUB_ENV"
          T="lpe2e-real-admin-${{ github.run_id }}-${{ github.run_attempt }}"
          echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
          echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
@@ -329,7 +401,8 @@ jobs:
      - name: Start platform (background)
        working-directory: workspace-server
        run: |
-          PORT=$PORT ./platform-server > platform.log 2>&1 &
+          echo "starting platform with PLATFORM_URL=${PLATFORM_URL:-<fallback>} PORT=$PORT BIND_ADDR=0.0.0.0"
+          PORT=$PORT BIND_ADDR=0.0.0.0 PLATFORM_URL="${PLATFORM_URL:-http://host.docker.internal:$PORT}" ./platform-server > platform.log 2>&1 &
          echo $! > platform.pid

      - name: Wait for /health (+ migrations applied)
@@ -351,6 +424,11 @@ jobs:
            sleep 1
          done

+      - name: Verify platform reachable from molecule-core-net
+        run: |
+          echo "Testing platform reachability from molecule-core-net container..."
+          docker run --rm --network molecule-core-net alpine:latest sh -c "wget -qO- http://${PLATFORM_URL#http://}/health" || echo "WARN: platform not reachable from molecule-core-net"
+
      - name: Run local-provision lifecycle E2E (real image + MiniMax LLM — ADVISORY)
        env:
          # LIFECYCLE_LLM=minimax: provision the REAL claude-code template image
@@ -375,6 +453,15 @@ jobs:
        if: failure()
        run: cat workspace-server/platform.log || true

+      - name: Dump workspace container logs on failure
+        if: failure()
+        run: |
+          WS_NAME=$(docker ps --filter "name=ws-" --format '{{.Names}}' | head -1 || true)
+          if [ -n "$WS_NAME" ]; then
+            echo "=== Workspace container logs for $WS_NAME ==="
+            docker logs "$WS_NAME" 2>&1 | tail -n 80 || true
+          fi
+
      - name: Stop platform
        if: always()
        run: |
@@ -190,6 +190,26 @@ jobs:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd  # v4.0.0

+      # Docker layer cache (registry-backed). Each builder created by
+      # setup-buildx-action is an EPHEMERAL docker-container builder (fresh
+      # buildkit state every run), so without an external cache every main
+      # push rebuilds `go mod download` / npm layers from scratch — this job
+      # class is the slowest in CI (p50 228s, ~175 runs/wk). We export the
+      # build cache to a dedicated moving ECR tag (`:buildcache`, never a
+      # deploy tag) and import it on the next run, regardless of which
+      # runner/builder picks the job.
+      #   - mode=max: caches intermediate (builder-stage) layers too — the
+      #     final stage is a tiny alpine/distroless copy, so min mode would
+      #     cache nothing useful.
+      #   - image-manifest=true,oci-mediatypes=true: required for ECR, which
+      #     rejects the raw buildkit cache-manifest mediatype. Verified by a
+      #     real export+import round-trip against ECR on the publish host
+      #     (2026-06-09) before this change.
+      #   - ignore-error=true on cache-to: a cache EXPORT failure must never
+      #     fail the publish lane; worst case the next run is cold.
+      #   - cache-from on a missing tag (first run) is a warning, not an error.
+      #   - Concurrent publishes overwrite :buildcache last-writer-wins —
+      #     same best-effort semantics as :staging-latest.
      - name: Build & push platform image to ECR (staging-<sha> + staging-latest)
        env:
          IMAGE_NAME: ${{ env.IMAGE_NAME }}
@@ -212,6 +232,8 @@ jobs:
            --label "org.opencontainers.image.revision=${GIT_SHA}" \
            --label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
            --label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
+            --cache-from "type=registry,ref=${IMAGE_NAME}:buildcache" \
+            --cache-to "type=registry,ref=${IMAGE_NAME}:buildcache,mode=max,image-manifest=true,oci-mediatypes=true,ignore-error=true" \
            --tag "${IMAGE_NAME}:${TAG_SHA}" \
            --tag "${IMAGE_NAME}:${TAG_LATEST}" \
            --push .
@@ -251,6 +273,11 @@ jobs:
          # Retry loop: buildkit EOF (internal#2468) is often transient on the
          # publish runner under memory pressure. Up to 3 attempts with a fresh
          # builder each time so a crashed buildkit doesn't poison the next try.
+          # Registry layer cache (see platform-image step comment for the full
+          # rationale): the fresh-builder-per-attempt pattern means there is
+          # NEVER local cache here — cache-from gives retries AND the next run
+          # a warm start. Cache lives on the PRIMARY ECR only (the staging
+          # mirror is a push target, not a cache source).
          for attempt in 1 2 3; do
            echo "::notice::Tenant image build attempt ${attempt}/3 ..."
            builder="tenant-builder-${GITHUB_RUN_ID}-${attempt}"
@@ -264,6 +291,8 @@ jobs:
                --label "org.opencontainers.image.revision=${GIT_SHA}" \
                --label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
                --label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
+                --cache-from "type=registry,ref=${TENANT_IMAGE_NAME}:buildcache" \
+                --cache-to "type=registry,ref=${TENANT_IMAGE_NAME}:buildcache,mode=max,image-manifest=true,oci-mediatypes=true,ignore-error=true" \
                "${build_tags[@]}" \
                --push .; then
              docker buildx rm "${builder}" >/dev/null 2>&1 || true
@@ -530,7 +559,20 @@ jobs:
          STALE_COUNT=0
          UNREACHABLE_COUNT=0
          UNHEALTHY_COUNT=0
+          QUARANTINED_COUNT=0
+          # Quarantined stragglers: the CP shipped the build to the healthy
+          # majority and quarantined a small minority within tolerance
+          # (max_stragglers). They are reported + recovered SEPARATELY, so they
+          # must not red the strict per-tenant verify — otherwise one stuck
+          # tenant blocks the whole deploy, the all-or-nothing trap this fixes.
+          STRAGGLERS_LIST="$(jq -r '(.stragglers // [])[]' "$RESP" 2>/dev/null || true)"
+          is_straggler() { printf '%s\n' "$STRAGGLERS_LIST" | grep -qxF "$1"; }
          for slug in "${SLUGS[@]}"; do
+            if is_straggler "$slug"; then
+              echo "::warning::$slug is a QUARANTINED straggler — build shipped to the rest of the fleet; this tenant needs individual recovery. Skipping strict verify."
+              QUARANTINED_COUNT=$((QUARANTINED_COUNT + 1))
+              continue
+            fi
            healthz_ok="$(jq -r --arg slug "$slug" '.results[]? | select(.slug == $slug) | .healthz_ok' "$RESP" | tail -1)"
            if [ "$healthz_ok" != "true" ]; then
              echo "::error::$slug did not report healthz_ok=true in redeploy-fleet response."
@@ -580,6 +622,7 @@ jobs:
            echo "Stale tenants: $STALE_COUNT"
            echo "Unhealthy tenants: $UNHEALTHY_COUNT"
            echo "Unreachable tenants: $UNREACHABLE_COUNT"
+            echo "Quarantined stragglers (shipped past; need recovery): $QUARANTINED_COUNT"
          } >> "$GITHUB_STEP_SUMMARY"

          if [ "$STALE_COUNT" -gt 0 ] || [ "$UNHEALTHY_COUNT" -gt 0 ] || [ "$UNREACHABLE_COUNT" -gt 0 ]; then
@@ -40,14 +40,24 @@ export function FlightEnvelope({
    if (!el || typeof el.animate !== "function") return;
    const dx = to.x - from.x;
    const dy = to.y - from.y;
+    // Launch small from the source dot, GROW BIG as it crosses the gap (peak
+    // mid-flight), then SHRINK small as it lands on the target dot — reads as an
+    // envelope flung from one agent and received by the other. translate tracks
+    // the straight path (fraction == keyframe offset); scale arcs independently.
+    const at = (frac: number, scale: number, opacity: number, offset?: number) => ({
+      transform: `translate(-50%,-50%) translate(${dx * frac}px,${dy * frac}px) scale(${scale})`,
+      opacity,
+      ...(offset === undefined ? {} : { offset }),
+    });
    const anim = el.animate(
      [
-        { transform: "translate(-50%,-50%) translate(0px,0px) scale(0.45)", opacity: 0 },
-        { opacity: 1, offset: 0.16 },
-        { opacity: 1, offset: 0.8 },
-        { transform: `translate(-50%,-50%) translate(${dx}px,${dy}px) scale(1)`, opacity: 0 },
+        at(0, 0.5, 0),
+        at(0.2, 1.25, 1, 0.2), // faded in + grown
+        at(0.5, 1.7, 1, 0.5), // BIG at mid-flight
+        at(0.82, 1.05, 1, 0.82), // shrinking on approach
+        at(1, 0.5, 0), // small + faded out, arrived on the target dot
      ],
-      { duration: FLIGHT_DURATION_MS, easing: "cubic-bezier(0.45, 0, 0.25, 1)", fill: "forwards" },
+      { duration: FLIGHT_DURATION_MS, easing: "ease-in-out", fill: "forwards" },
    );
    return () => anim.cancel();
  }, [from.x, from.y, to.x, to.y]);
@@ -4,17 +4,25 @@
 *  Mounted INSIDE <ReactFlow> so its ViewportPortal places the envelope in flow
 *  coordinates; it therefore pans and zooms with the canvas for free. The
 *  flight lifecycle (which events become envelopes, reduced-motion opt-out,
- *  expiry) lives in useA2AFlights — this component only resolves node centres
- *  and renders. */
-import { ViewportPortal, type Node } from "@xyflow/react";
+ *  expiry) lives in useA2AFlights — this component only resolves endpoints and
+ *  renders.
+ *
+ *  Endpoints anchor on each workspace's STATUS DOT (the green/glowing presence
+ *  indicator), not the card's geometric centre — so an envelope visibly leaves
+ *  the source agent's dot and lands on the target agent's dot. The dot carries
+ *  `data-flight-anchor`; we read its rendered rect and convert screen→flow via
+ *  React Flow, falling back to the card centre only when the dot isn't in the
+ *  DOM yet (node just mounted / scrolled out). */
+import { useRef } from "react";
+import { ViewportPortal, useReactFlow, type Node } from "@xyflow/react";
 import { useCanvasStore } from "@/store/canvas";
-import { useA2AFlights } from "@/hooks/useA2AFlights";
+import { useA2AFlights, type A2AFlight } from "@/hooks/useA2AFlights";
 import { FlightEnvelope, type Point } from "./FlightEnvelope";
 import type { WorkspaceNodeData } from "@/store/canvas";

 // Fallback node footprint when React Flow has not measured a node yet. Matches
-// WorkspaceNode's leaf size (w-[300px] min-h-[176px]); a slightly-off centre
-// for the first frame after mount is invisible at flight scale.
+// WorkspaceNode's leaf size (w-[300px] min-h-[176px]); a slightly-off centre for
+// the first frame after mount is invisible at flight scale.
 const DEFAULT_W = 300;
 const DEFAULT_H = 176;

@@ -24,23 +32,76 @@ function nodeCenter(n: Node<WorkspaceNodeData>): Point {
  return { x: n.position.x + w / 2, y: n.position.y + h / 2 };
 }

+/** Resolve a node's status-dot centre in FLOW coordinates. Reads the dot's
+ *  rendered screen rect (it carries data-flight-anchor) and converts it back to
+ *  flow space, so the anchor is exact regardless of pan/zoom and survives any
+ *  header-layout change. Falls back to the card centre when the dot isn't
+ *  rendered. */
+function dotAnchor(
+  n: Node<WorkspaceNodeData>,
+  screenToFlowPosition: (p: Point) => Point,
+): Point {
+  if (typeof document !== "undefined") {
+    const id =
+      typeof CSS !== "undefined" && typeof CSS.escape === "function" ? CSS.escape(n.id) : n.id;
+    const el = document.querySelector<HTMLElement>(
+      `.react-flow__node[data-id="${id}"] [data-flight-anchor]`,
+    );
+    if (el) {
+      const r = el.getBoundingClientRect();
+      if (r.width > 0 && r.height > 0) {
+        return screenToFlowPosition({ x: r.left + r.width / 2, y: r.top + r.height / 2 });
+      }
+    }
+  }
+  return nodeCenter(n);
+}
+
+/** One flight. Captures the source/target dot anchors ONCE on mount (a ref, not
+ *  per-render) so a pan/zoom or re-render mid-flight doesn't restart the
+ *  animation — mirrors HomeFlight's capture-once contract. */
+function CanvasFlight({
+  flight,
+  nodes,
+  screenToFlowPosition,
+}: {
+  flight: A2AFlight;
+  nodes: Node<WorkspaceNodeData>[];
+  screenToFlowPosition: (p: Point) => Point;
+}) {
+  const pos = useRef<{ from: Point; to: Point } | null>(null);
+  if (pos.current === null) {
+    const src = nodes.find((n) => n.id === flight.sourceId);
+    const dst = nodes.find((n) => n.id === flight.targetId);
+    // Both endpoints must be on-canvas to draw a path between them.
+    if (src && dst) {
+      pos.current = {
+        from: dotAnchor(src, screenToFlowPosition),
+        to: dotAnchor(dst, screenToFlowPosition),
+      };
+    }
+  }
+  if (!pos.current) return null;
+  return <FlightEnvelope from={pos.current.from} to={pos.current.to} kind={flight.kind} />;
+}
+
 export function MessageFlightLayer() {
  const flights = useA2AFlights();
-  const nodes = useCanvasStore((s) => s.nodes);
+  const nodes = useCanvasStore((s) => s.nodes) as Node<WorkspaceNodeData>[];
+  const { screenToFlowPosition } = useReactFlow();

  if (flights.length === 0) return null;

  return (
    <ViewportPortal>
-      {flights.map((f) => {
-        const src = nodes.find((n) => n.id === f.sourceId);
-        const dst = nodes.find((n) => n.id === f.targetId);
-        // Both endpoints must be on-canvas to draw a path between them.
-        if (!src || !dst) return null;
-        return (
-          <FlightEnvelope key={f.key} from={nodeCenter(src)} to={nodeCenter(dst)} kind={f.kind} />
-        );
-      })}
+      {flights.map((f) => (
+        <CanvasFlight
+          key={f.key}
+          flight={f}
+          nodes={nodes}
+          screenToFlowPosition={screenToFlowPosition}
+        />
+      ))}
    </ViewportPortal>
  );
 }
@@ -215,7 +215,7 @@ export function WorkspaceNode({ id, data }: NodeProps<Node<WorkspaceNodeData>>)
        {/* Header row */}
        <div className="flex items-center justify-between gap-2 mb-2.5">
          <div className="flex items-center gap-2.5 min-w-0">
-            <div className={`w-2.5 h-2.5 rounded-full shrink-0 ${statusCfg.dot} ${statusCfg.glow} shadow-sm`} />
+            <div data-flight-anchor className={`w-2.5 h-2.5 rounded-full shrink-0 ${statusCfg.dot} ${statusCfg.glow} shadow-sm`} />
            <span className="text-[15px] font-semibold text-ink truncate leading-tight">
              {data.name}
            </span>
@@ -0,0 +1,54 @@
+// @vitest-environment jsdom
+/**
+ * Tests for FlightEnvelope — the envelope that animates from `from` to `to`.
+ *
+ * Locks the render contract the canvas + concierge-home both depend on:
+ *  - the envelope is positioned at the `from` point (its launch anchor),
+ *  - it is coloured by activity kind,
+ *  - it degrades gracefully when Element.animate is unavailable (jsdom / SSR).
+ *
+ * The grow→shrink scale arc itself uses the Web Animations API, which jsdom
+ * does not implement, so we assert the static render + graceful degradation
+ * rather than keyframe values.
+ */
+import React from "react";
+import { render, cleanup } from "@testing-library/react";
+import { afterEach, describe, expect, it } from "vitest";
+import { FlightEnvelope } from "../FlightEnvelope";
+
+afterEach(cleanup);
+
+describe("FlightEnvelope", () => {
+  it("positions the envelope at the `from` launch point", () => {
+    const { getByTestId } = render(
+      <FlightEnvelope from={{ x: 120, y: 240 }} to={{ x: 400, y: 60 }} kind="send" />,
+    );
+    const el = getByTestId("flight-envelope");
+    expect(el.style.left).toBe("120px");
+    expect(el.style.top).toBe("240px");
+    expect(el.querySelector("svg")).toBeTruthy();
+  });
+
+  it("colours the envelope by activity kind", () => {
+    const stroke = (kind: "send" | "receive" | "task") => {
+      const { container } = render(
+        <FlightEnvelope from={{ x: 0, y: 0 }} to={{ x: 10, y: 10 }} kind={kind} />,
+      );
+      const s = container.querySelector("rect")?.getAttribute("stroke");
+      cleanup();
+      return s;
+    };
+    expect(stroke("send")).toBe("#22d3ee");
+    expect(stroke("receive")).toBe("#8b5cf6");
+    expect(stroke("task")).toBe("#f5a623");
+  });
+
+  it("degrades to a static render (no throw) when Element.animate is unavailable", () => {
+    // jsdom does not implement Element.animate — the component must still render.
+    expect(typeof document.createElement("div").animate).not.toBe("function");
+    const { getByTestId } = render(
+      <FlightEnvelope from={{ x: 0, y: 0 }} to={{ x: 1, y: 1 }} kind="task" />,
+    );
+    expect(getByTestId("flight-envelope")).toBeTruthy();
+  });
+});
@@ -82,8 +82,19 @@
 /* ===== MAIN ===== */
 .main { flex: 1; display: flex; flex-direction: column; min-width: 0; }
 .topbar { height: 56px; flex: 0 0 56px; border-bottom: 1px solid var(--hair); background: var(--panel); display: flex; align-items: center; justify-content: space-between; padding: 0 18px 0 20px; }
-.org { display: flex; align-items: center; gap: 10px; cursor: pointer; padding: 6px 10px; border-radius: 9px; transition: .16s; margin-left: -6px; }
+.org { position: relative; display: flex; align-items: center; gap: 10px; cursor: pointer; padding: 6px 10px; border-radius: 9px; transition: .16s; margin-left: -6px; }
 .org:hover { background: var(--hair); }
+/* Org switcher dropdown */
+.orgMenu { position: absolute; top: calc(100% + 6px); left: 0; min-width: 220px; max-height: 320px; overflow-y: auto; padding: 5px; background: var(--bg-1, #1a1a22); border: 1px solid var(--hair-2); border-radius: 11px; box-shadow: 0 12px 32px rgba(0,0,0,.4); z-index: 50; }
+.orgMenuItem { width: 100%; display: flex; align-items: center; gap: 9px; padding: 7px 9px; border: none; background: transparent; border-radius: 8px; cursor: pointer; color: var(--tx-1); font-size: 13.5px; font-weight: 500; text-align: left; transition: .12s; }
+.orgMenuItem:hover { background: var(--hair); }
+.orgMenuCurrent { font-weight: 700; }
+.orgMenuBadge { width: 20px; height: 20px; border-radius: 6px; display: grid; place-items: center; background: linear-gradient(150deg,#2d2d36,#3a3a46); font-size: 11px; font-weight: 700; color: #d8d8e2; border: 1px solid var(--hair-2); flex: 0 0 auto; }
+:global([data-theme="light"]) .orgMenuBadge { background: linear-gradient(150deg,#7c3aed,#a78bfa); color: #fff; border: none; }
+.orgMenuName { flex: 1 1 auto; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
+.orgMenuTick { color: var(--accent, #a78bfa); display: flex; flex: 0 0 auto; }
+.orgMenuTick svg { width: 14px; height: 14px; }
+.orgMenuEmpty { padding: 9px 11px; color: var(--tx-3); font-size: 13px; }
 .orgBadge { width: 24px; height: 24px; border-radius: 7px; display: grid; place-items: center; background: linear-gradient(150deg,#2d2d36,#3a3a46); font-size: 12px; font-weight: 700; color: #d8d8e2; border: 1px solid var(--hair-2); }
 :global([data-theme="light"]) .orgBadge { background: linear-gradient(150deg,#7c3aed,#a78bfa); color: #fff; border: none; }
 .orgName { font-weight: 600; font-size: 14.5px; letter-spacing: -.01em; }
@@ -4,7 +4,8 @@ import { useCallback, useEffect, useMemo, useState } from "react";
 import { useCanvasStore, type TopView } from "@/store/canvas";
 import { WORKSPACE_KIND } from "@/lib/workspace-kind";
 import { useTheme } from "@/lib/theme-provider";
-import { api } from "@/lib/api";
+import { api, PLATFORM_URL } from "@/lib/api";
+import { switchOrgUrl } from "@/lib/org-switch";
 import { showToast } from "@/components/Toaster";
 import type { ActivityEntry } from "@/types/activity";
 import { Canvas } from "@/components/Canvas";
@@ -89,6 +90,25 @@ function activityText(a: ActivityEntry): string {
  return a.method ? `${verb} · ${a.method}` : verb;
 }

+
+/**
+ * resolveHomeChatTarget — which agent the Home chat panel talks to: the
+ * sidebar-selected node when it still exists, else the org root (concierge),
+ * else null. Resolving against live nodes means a deleted/vanished selection
+ * degrades to the root instead of a dead chat. Exported for unit tests.
+ */
+export function resolveHomeChatTarget<N extends { id: string }>(
+  nodes: N[],
+  selectedNodeId: string | null,
+  platformRoot: N | null,
+): N | null {
+  if (selectedNodeId) {
+    const selected = nodes.find((n) => n.id === selectedNodeId);
+    if (selected) return selected;
+  }
+  return platformRoot ?? null;
+}
+
 export function ConciergeShell() {
  const nodes = useCanvasStore((st) => st.nodes);
  const topView = useCanvasStore((st) => st.topView);
@@ -108,13 +128,18 @@ export function ConciergeShell() {
  // returns an empty name, so the topbar never breaks before the backend
  // lands.
  const [orgName, setOrgName] = useState("Molecule AI");
+  // Current org slug (from GET /org/identity) — used to highlight the active
+  // org in the switcher and to derive the apex domain for cross-org navigation.
+  const [orgSlug, setOrgSlug] = useState("");
  useEffect(() => {
    let cancelled = false;
    api
-      .get<{ name?: string }>("/org/identity")
+      .get<{ name?: string; slug?: string }>("/org/identity")
      .then((r) => {
        const name = (r?.name || "").trim();
        if (!cancelled && name) setOrgName(name);
+        const slug = (r?.slug || "").trim();
+        if (!cancelled && slug) setOrgSlug(slug);
      })
      .catch(() => {
        // No endpoint / not reachable — keep the "Molecule AI" fallback.
@@ -124,6 +149,47 @@ export function ConciergeShell() {
    };
  }, []);

+  // --- Org switcher (topbar dropdown) ---
+  // Each org is its own tenant subdomain, so "switch" = navigate to
+  // <slug>.<apex>. The org list comes from the control plane (cross-origin,
+  // cookie-auth), fetched lazily the first time the menu opens.
+  const [orgMenuOpen, setOrgMenuOpen] = useState(false);
+  const [orgs, setOrgs] = useState<Array<{ slug: string; name?: string; id?: string }> | null>(null);
+  const toggleOrgMenu = useCallback(() => {
+    setOrgMenuOpen((open) => {
+      const next = !open;
+      if (next && orgs === null) {
+        fetch(`${PLATFORM_URL}/cp/orgs`, {
+          credentials: "include",
+          signal: AbortSignal.timeout(15_000),
+        })
+          .then((res) => (res.ok ? res.json() : Promise.reject(new Error(String(res.status)))))
+          .then((body: { orgs?: Array<{ slug: string; name?: string; id?: string }> } | Array<{ slug: string; name?: string; id?: string }>) => {
+            const list = Array.isArray(body) ? body : body.orgs ?? [];
+            setOrgs(list.filter((o) => o && o.slug));
+          })
+          .catch(() => setOrgs([])); // no list / not reachable → render "no other orgs"
+      }
+      return next;
+    });
+  }, [orgs]);
+  const switchOrg = useCallback(
+    (slug: string) => {
+      setOrgMenuOpen(false);
+      if (typeof window === "undefined") return;
+      const url = switchOrgUrl(window.location.hostname, window.location.protocol, orgSlug, slug);
+      if (url) window.location.href = url;
+    },
+    [orgSlug]
+  );
+  // Close the menu on any outside click.
+  useEffect(() => {
+    if (!orgMenuOpen) return;
+    const onDoc = () => setOrgMenuOpen(false);
+    document.addEventListener("click", onDoc);
+    return () => document.removeEventListener("click", onDoc);
+  }, [orgMenuOpen]);
+
  // Build the agent hierarchy from live nodes.
  const { roots, childrenOf } = useMemo(() => {
    const childrenOf = new Map<string, typeof nodes>();
@@ -157,6 +223,16 @@ export function ConciergeShell() {

  const platformId = platformRoot?.id ?? null;

+  // Home chat target: the agent SELECTED in the sidebar, falling back to the
+  // org root (the concierge). Pre-fix the panel was hard-pointed at the root,
+  // so clicking another agent highlighted it but the chat never switched.
+  const chatNode = useMemo(
+    () => resolveHomeChatTarget(nodes, selectedNodeId, platformRoot),
+    [nodes, selectedNodeId, platformRoot],
+  );
+  const chatId = chatNode?.id ?? null;
+  const chatIsRoot = chatId !== null && chatId === platformId;
+
  // ── live data: approvals + user-tasks (org-wide), activity (platform agent) ──
  const [approvals, setApprovals] = useState<PendingApproval[]>([]);
  const [userTasks, setUserTasks] = useState<UserTask[]>([]);
@@ -330,10 +406,51 @@ export function ConciergeShell() {
        <div className={s.main}>
          {/* TOPBAR */}
          <header className={s.topbar}>
-            <div className={s.org}>
+            <div
+              className={s.org}
+              role="button"
+              tabIndex={0}
+              aria-haspopup="menu"
+              aria-expanded={orgMenuOpen}
+              data-testid="topbar-org-switcher"
+              onClick={(e) => {
+                e.stopPropagation();
+                toggleOrgMenu();
+              }}
+            >
              <div className={s.orgBadge}>{initials(orgName).slice(0, 1)}</div>
              <span data-testid="topbar-org-name" className={s.orgName}>{orgName}</span>
              <span className={s.chev}><IcChevDown /></span>
+              {orgMenuOpen && (
+                <div
+                  className={s.orgMenu}
+                  role="menu"
+                  data-testid="topbar-org-menu"
+                  onClick={(e) => e.stopPropagation()}
+                >
+                  {orgs === null ? (
+                    <div className={s.orgMenuEmpty}>Loading…</div>
+                  ) : orgs.length === 0 ? (
+                    <div className={s.orgMenuEmpty}>No other organizations</div>
+                  ) : (
+                    orgs.map((o) => (
+                      <button
+                        key={o.id || o.slug}
+                        type="button"
+                        role="menuitem"
+                        className={`${s.orgMenuItem} ${o.slug === orgSlug ? s.orgMenuCurrent : ""}`}
+                        onClick={() => switchOrg(o.slug)}
+                      >
+                        <span className={s.orgMenuBadge}>{initials(o.name || o.slug).slice(0, 1)}</span>
+                        <span className={s.orgMenuName}>{o.name || o.slug}</span>
+                        {o.slug === orgSlug && (
+                          <span className={s.orgMenuTick}><IcCheck /></span>
+                        )}
+                      </button>
+                    ))
+                  )}
+                </div>
+              )}
            </div>
            <div className={s.topbarRight}>
              <button className={s.iconPill} title="Search"><IcSearch /></button>
@@ -457,24 +574,24 @@ export function ConciergeShell() {
                  delivery-mode handling), pointed at the platform agent. A thin
                  concierge-styled header keeps the Home look; the ChatTab body
                  below is identical to the map path so features can't drift. */}
-              {platformId && platformRoot ? (
+              {chatId && chatNode ? (
                <section className={s.chat}>
                  <div className={s.chatHead}>
                    <div className={s.chAv}><IcChat /></div>
                    <div className={s.chMeta}>
-                      <div className={s.chTitle}>{platformRoot.data.name ?? "Org Concierge"}</div>
+                      <div className={s.chTitle}>{chatNode.data.name ?? (chatIsRoot ? "Org Concierge" : "Agent")}</div>
                      <div className={s.chSub}>
                        {(() => {
                          const online =
-                            platformRoot.data.status === "online" ||
-                            platformRoot.data.status === "degraded";
+                            chatNode.data.status === "online" ||
+                            chatNode.data.status === "degraded";
                          return (
                            <>
                              <span
                                className={s.sdot}
                                style={{ background: online ? "var(--green)" : "var(--grey)" }}
                              />
-                              {online ? "online" : statusInfo(platformRoot.data.status ?? "").label} · platform agent
+                              {online ? "online" : statusInfo(chatNode.data.status ?? "").label} · {chatIsRoot ? "platform agent" : (chatNode.data.role || "agent")}
                            </>
                          );
                        })()}
@@ -482,7 +599,9 @@ export function ConciergeShell() {
                    </div>
                  </div>
                  <div className={s.embedChat}>
-                    <ChatTab key={platformId} workspaceId={platformId} data={platformRoot.data} />
+                    {/* key=chatId remounts ChatTab on selection change so the
+                        history/composer state never bleeds between agents. */}
+                    <ChatTab key={chatId} workspaceId={chatId} data={chatNode.data} />
                  </div>
                </section>
              ) : (
@@ -0,0 +1,26 @@
+// Home chat panel target — selecting an agent in the sidebar switches the
+// chat; the root is only the DEFAULT, not a hard-point (the pre-fix bug).
+import { describe, it, expect } from "vitest";
+import { resolveHomeChatTarget } from "../ConciergeShell";
+
+const root = { id: "root" };
+const child = { id: "child" };
+const nodes = [root, child];
+
+describe("resolveHomeChatTarget", () => {
+  it("returns the selected agent when it exists (the bug: chat stayed on root)", () => {
+    expect(resolveHomeChatTarget(nodes, "child", root)).toBe(child);
+  });
+  it("falls back to the platform root when nothing is selected", () => {
+    expect(resolveHomeChatTarget(nodes, null, root)).toBe(root);
+  });
+  it("degrades to the root when the selection no longer exists (deleted agent)", () => {
+    expect(resolveHomeChatTarget(nodes, "gone", root)).toBe(root);
+  });
+  it("selecting the root itself targets the root", () => {
+    expect(resolveHomeChatTarget(nodes, "root", root)).toBe(root);
+  });
+  it("null when there is neither selection nor root", () => {
+    expect(resolveHomeChatTarget([], null, null)).toBeNull();
+  });
+});
@@ -7,29 +7,44 @@ import { isSaaSTenant } from "@/lib/tenant";
 import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
 import type { WorkspaceCompute } from "@/store/socket";

-// Machine sizes keyed by cloud provider — an AWS t3.* is meaningless on Hetzner,
-// etc. MUST mirror the workspace-server workspaceComputeInstanceAllowlist (which
-// mirrors the CP provider configs); the PATCH validation rejects a mismatch 400.
-const INSTANCE_TYPES_BY_PROVIDER: Record<string, string[]> = {
-  aws: ["t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge", "m6i.large", "m6i.xlarge", "c6i.xlarge"],
-  hetzner: ["cpx11", "cpx21", "cpx31", "cpx41", "cpx51", "cax11", "cax21", "cax31", "cax41"],
-  gcp: ["e2-small", "e2-medium", "e2-standard-2", "e2-standard-4", "e2-standard-8"],
+// Cloud-provider + instance-type metadata (core#2489).
+//
+// SSOT lives in the workspace-server (workspace_compute.go's allowlist + defaults)
+// and is fetched at runtime from GET /workspaces/:id/compute-options, so the UI
+// can never offer a (provider, instance-type) the PATCH validation then rejects
+// with a 400. The constants below are ONLY a minimal offline fallback used until
+// the fetch resolves (or if it fails) — they mirror the server SSOT but are not
+// the source of truth. When the fetch succeeds, its data replaces them entirely.
+type ComputeOptions = {
+  providers: string[];
+  instanceTypes: Record<string, string[]>;
+  defaults: Record<string, string>;
 };
-const DEFAULT_INSTANCE_BY_PROVIDER: Record<string, string> = {
-  aws: "t3.medium", hetzner: "cpx31", gcp: "e2-standard-2",
-};
-const normalizeProvider = (p?: string): string => (p === "gcp" || p === "hetzner" ? p : "aws");
-const instanceTypesForProvider = (p?: string): string[] =>
-  INSTANCE_TYPES_BY_PROVIDER[normalizeProvider(p)] ?? INSTANCE_TYPES_BY_PROVIDER.aws;
-const defaultInstanceForProvider = (p?: string): string =>
-  DEFAULT_INSTANCE_BY_PROVIDER[normalizeProvider(p)] ?? "t3.medium";

-// Editable cloud-provider options (multi-provider RFC) — mirrors CreateWorkspaceDialog.
-const CLOUD_PROVIDER_OPTIONS = [
-  { value: "aws", label: "AWS (default)" },
-  { value: "gcp", label: "GCP" },
-  { value: "hetzner", label: "Hetzner" },
-];
+const FALLBACK_COMPUTE_OPTIONS: ComputeOptions = {
+  providers: ["aws", "hetzner", "gcp"],
+  instanceTypes: {
+    aws: ["t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge", "m6i.large", "m6i.xlarge", "c6i.xlarge"],
+    hetzner: ["cpx11", "cpx21", "cpx31", "cpx41", "cpx51", "cax11", "cax21", "cax31", "cax41"],
+    gcp: ["e2-small", "e2-medium", "e2-standard-2", "e2-standard-4", "e2-standard-8"],
+  },
+  defaults: { aws: "t3.medium", hetzner: "cpx31", gcp: "e2-standard-2" },
+};
+
+const normalizeProvider = (p?: string): string => (p === "gcp" || p === "hetzner" ? p : "aws");
+const instanceTypesForProvider = (opts: ComputeOptions, p?: string): string[] =>
+  opts.instanceTypes[normalizeProvider(p)] ?? opts.instanceTypes.aws ?? FALLBACK_COMPUTE_OPTIONS.instanceTypes.aws;
+const defaultInstanceForProvider = (opts: ComputeOptions, p?: string): string =>
+  opts.defaults[normalizeProvider(p)] ?? "t3.medium";
+
+// Human labels for the cloud-provider selector. The option VALUES come from the
+// fetched SSOT (opts.providers); this only supplies display text + the default tag.
+const CLOUD_PROVIDER_LABELS: Record<string, string> = {
+  aws: "AWS (default)",
+  gcp: "GCP",
+  hetzner: "Hetzner",
+};
+const cloudProviderOptionLabel = (v: string): string => CLOUD_PROVIDER_LABELS[v] ?? v;

 const RUNTIME_OPTIONS = ["claude-code", "codex", "hermes", "openclaw", "kimi", "kimi-cli", "external"];
 const RESOLUTIONS = ["1280x720", "1440x900", "1920x1080", "2560x1440"];
@@ -87,6 +102,12 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
  const [saving, setSaving] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [success, setSuccess] = useState(false);
+  // core#2489: provider + instance-type dropdowns are populated from the
+  // workspace-server SSOT (GET /workspaces/:id/compute-options) so they can't
+  // drift from what the PATCH validation accepts. Start from the offline fallback
+  // and replace it once the fetch resolves; on fetch error we keep the fallback
+  // (the dropdowns still work, just from the in-bundle mirror).
+  const [computeOptions, setComputeOptions] = useState<ComputeOptions>(FALLBACK_COMPUTE_OPTIONS);

  useEffect(() => {
    setForm(initial);
@@ -94,6 +115,30 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
    setSuccess(false);
  }, [initial]);

+  useEffect(() => {
+    let cancelled = false;
+    (async () => {
+      try {
+        const opts = await api.get<Partial<ComputeOptions>>(`/workspaces/${workspaceId}/compute-options`);
+        if (cancelled) return;
+        // Defensive: only adopt a well-formed payload; otherwise keep the fallback.
+        if (opts && Array.isArray(opts.providers) && opts.providers.length > 0 && opts.instanceTypes && opts.defaults) {
+          setComputeOptions({
+            providers: opts.providers,
+            instanceTypes: opts.instanceTypes,
+            defaults: opts.defaults,
+          });
+        }
+      } catch {
+        // Fetch failed (offline / older server) — keep FALLBACK_COMPUTE_OPTIONS.
+        // The dropdowns stay usable; worst case they show the in-bundle mirror.
+      }
+    })();
+    return () => {
+      cancelled = true;
+    };
+  }, [workspaceId]);
+
  const workspaceAccess = formatAccess(data.workspaceAccess);
  const maxConcurrentTasks = data.maxConcurrentTasks ? String(data.maxConcurrentTasks) : "platform-managed";
  const deliveryMode = data.deliveryMode || "push";
@@ -208,8 +253,8 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
              id="cloud-provider"
              label="Cloud provider"
              value={normalizeProvider(form.provider)}
-              options={CLOUD_PROVIDER_OPTIONS.map((p) => p.value)}
-              optionLabel={(v) => CLOUD_PROVIDER_OPTIONS.find((p) => p.value === v)?.label ?? v}
+              options={computeOptions.providers}
+              optionLabel={cloudProviderOptionLabel}
              // Switching cloud resets the instance type to the new provider's
              // default (an AWS t3.* is invalid on Hetzner, etc.) — also keeps the
              // instance-type dropdown below in sync with the provider's sizes.
@@ -217,9 +262,9 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
                setForm((s) => ({
                  ...s,
                  provider,
-                  instanceType: instanceTypesForProvider(provider).includes(s.instanceType)
+                  instanceType: instanceTypesForProvider(computeOptions, provider).includes(s.instanceType)
                    ? s.instanceType
-                    : defaultInstanceForProvider(provider),
+                    : defaultInstanceForProvider(computeOptions, provider),
                }))
              }
            />
@@ -228,7 +273,7 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
            id="instance-type"
            label="Instance type"
            value={form.instanceType}
-            options={instanceTypesForProvider(form.provider)}
+            options={instanceTypesForProvider(computeOptions, form.provider)}
            onChange={(instanceType) => setForm((s) => ({ ...s, instanceType }))}
          />
          <label className="grid gap-1" htmlFor="root-volume-gb">
@@ -348,7 +393,10 @@ function formFromData(data: {
  return {
    runtime: data.runtime || "claude-code",
    provider,
-    instanceType: data.instanceType || defaultInstanceForProvider(provider),
+    // Falls back to the offline default only when no instance type is persisted;
+    // the server SSOT default matches FALLBACK_COMPUTE_OPTIONS, and the dropdown
+    // re-syncs to the fetched options once they resolve.
+    instanceType: data.instanceType || defaultInstanceForProvider(FALLBACK_COMPUTE_OPTIONS, provider),
    rootGB: String(data.rootGB || DEFAULT_HEADLESS_ROOT_GB),
    displayEnabled: !!data.displayMode && data.displayMode !== "none",
    displayMode: data.displayMode && data.displayMode !== "none" ? data.displayMode : "desktop-control",
@@ -3,12 +3,14 @@ import { cleanup, fireEvent, render, screen, waitFor } from "@testing-library/re
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";

 const apiPatch = vi.fn();
+const apiGet = vi.fn();
 const updateNodeData = vi.fn();
 const restartWorkspace = vi.fn();

 vi.mock("@/lib/api", () => ({
  api: {
    patch: (path: string, body: unknown) => apiPatch(path, body),
+    get: (path: string) => apiGet(path),
  },
 }));

@@ -38,6 +40,12 @@ afterEach(() => {

 beforeEach(() => {
  apiPatch.mockReset();
+  apiGet.mockReset();
+  // Default: compute-options fetch rejects → component keeps its in-bundle
+  // fallback SSOT. Existing assertions (t3.medium / cpx31 / provider list) are
+  // satisfied by the fallback, which mirrors the server. Individual tests that
+  // exercise the fetch path override this with mockResolvedValueOnce.
+  apiGet.mockRejectedValue(new Error("no compute-options in this test"));
  restartWorkspace.mockReset();
  updateNodeData.mockReset();
 });
@@ -358,6 +366,76 @@ describe("ContainerConfigTab", () => {
    confirmSpy.mockRestore();
  });

+  // core#2489: the provider + instance-type dropdowns are populated from the
+  // workspace-server SSOT (GET /workspaces/:id/compute-options), so the UI can't
+  // offer an option the backend then rejects. This proves the fetch drives the
+  // dropdowns: a server-only instance type appears once the fetch resolves.
+  it("populates instance-type options from the compute-options SSOT endpoint", async () => {
+    apiGet.mockResolvedValueOnce({
+      providers: ["aws", "hetzner", "gcp"],
+      instanceTypes: {
+        aws: ["t3.medium", "t3.large", "z9.future"], // z9.future is server-only
+        hetzner: ["cpx31"],
+        gcp: ["e2-standard-2"],
+      },
+      defaults: { aws: "t3.medium", hetzner: "cpx31", gcp: "e2-standard-2" },
+    });
+
+    render(
+      <ContainerConfigTab
+        workspaceId="ws-opts"
+        data={{
+          runtime: "claude-code",
+          status: "online",
+          needsRestart: false,
+          activeTasks: 0,
+          maxConcurrentTasks: null,
+          workspaceAccess: "none",
+          deliveryMode: "push",
+          compute: { instance_type: "t3.large", provider: "aws", volume: { root_gb: 30 } },
+        }}
+      />,
+    );
+
+    await waitFor(() => expect(apiGet).toHaveBeenCalledWith("/workspaces/ws-opts/compute-options"));
+    // The server-only instance type appears in the dropdown after the fetch.
+    await waitFor(() =>
+      expect(
+        Array.from(screen.getByLabelText("Instance type").querySelectorAll("option")).map((o) => o.getAttribute("value")),
+      ).toContain("z9.future"),
+    );
+  });
+
+  // core#2489: if the compute-options fetch fails, the dropdowns must stay usable
+  // via the in-bundle fallback (no crash, no empty selector).
+  it("falls back to the in-bundle option set when the compute-options fetch fails", async () => {
+    apiGet.mockRejectedValueOnce(new Error("network down"));
+
+    render(
+      <ContainerConfigTab
+        workspaceId="ws-opts"
+        data={{
+          runtime: "claude-code",
+          status: "online",
+          needsRestart: false,
+          activeTasks: 0,
+          maxConcurrentTasks: null,
+          workspaceAccess: "none",
+          deliveryMode: "push",
+          compute: { instance_type: "t3.large", provider: "aws", volume: { root_gb: 30 } },
+        }}
+      />,
+    );
+
+    await waitFor(() => expect(apiGet).toHaveBeenCalled());
+    // Fallback list still renders the known AWS sizes.
+    const values = Array.from(
+      screen.getByLabelText("Instance type").querySelectorAll("option"),
+    ).map((o) => o.getAttribute("value"));
+    expect(values).toContain("t3.medium");
+    expect(values).toContain("m6i.xlarge");
+  });
+
  it("does not treat a non-provider edit as a recreate (no confirm; aws default omitted)", async () => {
    const confirmSpy = vi.spyOn(window, "confirm").mockReturnValue(true);
    render(
@@ -0,0 +1,84 @@
+// @vitest-environment jsdom
+//
+// jrs-auto, 2026-06-09 — "Failed to send message — agent may be unreachable"
+// after 120s WHILE the agent visibly runs tools in the activity feed.
+//
+// Mechanism: the A2A proxy holds the POST open for the agent's whole turn;
+// a long tool-calling turn outlives the 120s client budget and
+// AbortSignal.timeout fires (DOMException name="TimeoutError"). The message
+// WAS delivered — the timeout is a client-side stop-waiting, not transport
+// failure. Pre-fix the catch-all released the guards and showed the
+// unreachable banner (false alarm). Post-fix: a TimeoutError keeps the
+// thinking state (reply + guard release arrive via the AGENT_MESSAGE WS
+// event, the documented poll-mode contract); real transport errors keep
+// the failure banner.
+
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { renderHook, act } from "@testing-library/react";
+
+const apiPostMock = vi.fn<
+  (url: string, body?: unknown, opts?: unknown) => Promise<unknown>
+>();
+vi.mock("@/lib/api", () => ({
+  api: {
+    post: (url: string, body?: unknown, opts?: unknown) =>
+      apiPostMock(url, body, opts),
+    get: vi.fn(),
+  },
+}));
+vi.mock("../../uploads", () => ({
+  uploadChatFiles: vi.fn(),
+  FileTooLargeError: class FileTooLargeError extends Error {},
+}));
+
+import { useChatSend } from "../useChatSend";
+
+// AbortSignal.timeout rejects with a DOMException named "TimeoutError".
+const timeoutError = () => {
+  try {
+    return new DOMException("signal timed out", "TimeoutError");
+  } catch {
+    // jsdom fallback — only the .name contract matters.
+    const e = new Error("signal timed out");
+    (e as Error & { name: string }).name = "TimeoutError";
+    return e;
+  }
+};
+
+beforeEach(() => {
+  apiPostMock.mockReset();
+});
+
+describe("useChatSend — client timeout is NOT 'unreachable'", () => {
+  it("keeps sending=true and shows NO error when the 120s client timeout fires (delivered, agent still working)", async () => {
+    apiPostMock.mockRejectedValueOnce(timeoutError());
+
+    const { result } = renderHook(() =>
+      useChatSend("ws-long-turn", { getHistoryMessages: () => [] }),
+    );
+
+    await act(async () => {
+      await result.current.sendMessage("do a long multi-tool task");
+      await Promise.resolve();
+    });
+
+    expect(result.current.error).toBeNull(); // no false "unreachable" banner
+    expect(result.current.sending).toBe(true); // thinking persists until the WS reply
+  });
+
+  it("still fails loudly on a REAL transport error (non-timeout rejection)", async () => {
+    apiPostMock.mockRejectedValueOnce(new Error("connect ECONNREFUSED"));
+
+    const { result } = renderHook(() =>
+      useChatSend("ws-dead", { getHistoryMessages: () => [] }),
+    );
+
+    await act(async () => {
+      await result.current.sendMessage("hello?");
+      await Promise.resolve();
+    });
+
+    expect(result.current.error).toMatch(/unreachable/);
+    expect(result.current.sending).toBe(false); // guards released for retry
+  });
+});
@@ -252,12 +252,32 @@ export function useChatSend(workspaceId: string, options: UseChatSendOptions) {
          }
          releaseSendGuards();
        })
-        .catch(() => {
+        .catch((e: unknown) => {
          if (sendTokenRef.current !== myToken) return;
          if (!sendingFromAPIRef.current) {
            sendInFlightRef.current = false;
            return;
          }
+          // CLIENT TIMEOUT ≠ UNREACHABLE (jrs-auto, 2026-06-09). The A2A
+          // proxy holds this POST open for the agent's WHOLE turn; a long
+          // tool-calling turn routinely outlives the 120s client budget.
+          // AbortSignal.timeout firing after the server ACCEPTED and held
+          // the connection means the message was DELIVERED and the agent is
+          // still working — showing "agent may be unreachable" here is a
+          // false alarm (the user watches the agent run tools in the
+          // activity feed while the chat claims failure). Keep the thinking
+          // state up; the reply lands via the AGENT_MESSAGE WebSocket event,
+          // which releases the guards — exactly the documented poll-mode
+          // contract above. Genuine unreachability fails FAST (connection
+          // refused / 4xx / 5xx) and still takes the error branch; a truly
+          // dead agent is surfaced by the reactive-health path
+          // (maybeMarkContainerDead), not by this client timeout.
+          const isClientTimeout =
+            e !== null && typeof e === "object" &&
+            "name" in e && (e as { name: unknown }).name === "TimeoutError";
+          if (isClientTimeout) {
+            return; // delivered; reply (and guard release) arrives via WS
+          }
          releaseSendGuards();
          setError("Failed to send message — agent may be unreachable");
        });
@@ -0,0 +1,30 @@
+import { describe, it, expect } from "vitest";
+import { switchOrgUrl } from "../org-switch";
+
+describe("switchOrgUrl", () => {
+  it("builds the target org's subdomain URL from the current host", () => {
+    expect(
+      switchOrgUrl("agents-team.moleculesai.app", "https:", "agents-team", "reno-stars"),
+    ).toBe("https://reno-stars.moleculesai.app");
+  });
+
+  it("returns null for a no-op (switching to the current org)", () => {
+    expect(
+      switchOrgUrl("agents-team.moleculesai.app", "https:", "agents-team", "agents-team"),
+    ).toBeNull();
+  });
+
+  it("returns null when the target slug is empty", () => {
+    expect(switchOrgUrl("a.example.com", "https:", "a", "")).toBeNull();
+  });
+
+  it("falls back to dropping the first label when currentSlug doesn't prefix the host", () => {
+    expect(switchOrgUrl("foo.example.com", "https:", "", "bar")).toBe(
+      "https://bar.example.com",
+    );
+  });
+
+  it("returns null when there is no apex to derive (single-label host)", () => {
+    expect(switchOrgUrl("localhost", "http:", "", "bar")).toBeNull();
+  });
+});
@@ -0,0 +1,23 @@
+// Org switching across tenant subdomains.
+//
+// Each org is its own tenant at <slug>.<apex> (e.g. agents-team.moleculesai.app),
+// so switching orgs from the canvas topbar means navigating to the target org's
+// subdomain. switchOrgUrl derives that URL from the current location, or returns
+// null when it's a no-op (same org / empty target) or the apex can't be resolved.
+
+export function switchOrgUrl(
+  hostname: string,
+  protocol: string,
+  currentSlug: string,
+  targetSlug: string,
+): string | null {
+  if (!targetSlug || targetSlug === currentSlug) return null;
+  // Prefer stripping the known current-org label; otherwise drop the first
+  // label as a best-effort apex (covers hosts we didn't seed a slug for).
+  const apex =
+    currentSlug && hostname.startsWith(`${currentSlug}.`)
+      ? hostname.slice(currentSlug.length + 1)
+      : hostname.split(".").slice(1).join(".");
+  if (!apex) return null;
+  return `${protocol}//${targetSlug}.${apex}`;
+}
@@ -76,7 +76,7 @@ fi
 log "Step 3 — Seed a file inside /workspace and ask agent to reference it"
 # Relies on /workspace being writable by the platform (we copy as root via
 # docker exec, mimicking the path a real agent would use through its tools).
-CONTAINER=$(docker ps --format '{{.Names}}' | grep -E "^ws-${WSID:0:12}" | head -1)
+CONTAINER=$(docker ps --format '{{.Names}}' | grep -E "^ws-${WSID}" | head -1)
 [ -n "$CONTAINER" ] || { echo "container not found"; exit 1; }
 docker exec "$CONTAINER" sh -c 'echo "E2E report body $(date -u +%s)" > /workspace/e2e-report.txt'

@@ -145,7 +145,7 @@ check_runtime() {
    fails=$((fails + 1)); return
  fi
  local container
-  container=$(docker ps --format '{{.Names}}' | grep -E "^ws-${wsid:0:12}" | head -1)
+  container=$(docker ps --format '{{.Names}}' | grep -E "^ws-${wsid}" | head -1)
  [ -z "$container" ] && { echo "FAIL $label: container not found"; fails=$((fails + 1)); return; }

  has_patch_in_container "$container" || { echo "FAIL $label: platform helpers missing"; fails=$((fails + 1)); return; }
@@ -94,8 +94,8 @@ check_contains "Upload child prompt" "replaced" "$CHILD_UPLOAD"

 # Verify prompts in containers
 sleep 2
-ROOT_CONTAINER=$(docker ps --filter "name=ws-${ROOT:0:12}" -q | head -1)
-CHILD_CONTAINER=$(docker ps --filter "name=ws-${CHILD:0:12}" -q | head -1)
+ROOT_CONTAINER=$(docker ps --filter "name=ws-${ROOT}" -q | head -1)
+CHILD_CONTAINER=$(docker ps --filter "name=ws-${CHILD}" -q | head -1)

 ROOT_HAS_PROMPT=$(docker exec $ROOT_CONTAINER cat /configs/system-prompt.md 2>/dev/null | head -1)
 check_contains "Root container has prompt" "Root Agent" "$ROOT_HAS_PROMPT"
@@ -153,19 +153,17 @@ RT_HM_ID=$(echo "$R" | jq_extract "['id']")

 # Wait for containers to start (poll up to 30s for first one to appear)
 if command -v docker &>/dev/null; then
-  short_cc="${RT_CC_ID:0:12}"
  for _ in 1 2 3 4 5 6; do
    sleep 5
-    if docker inspect "ws-${short_cc}" >/dev/null 2>&1; then break; fi
+    if docker inspect "ws-${RT_CC_ID}" >/dev/null 2>&1; then break; fi
  done

  _check_image() {
    local ws_id="$1" expected_tag="$2" label="$3"
-    local short_id="${ws_id:0:12}"
    # Poll up to 30s for image to appear
    local actual_image="NOT_FOUND"
    for _ in 1 2 3 4 5 6; do
-      actual_image=$(docker inspect "ws-${short_id}" --format '{{.Config.Image}}' 2>/dev/null || echo "NOT_FOUND")
+      actual_image=$(docker inspect "ws-${ws_id}" --format '{{.Config.Image}}' 2>/dev/null || echo "NOT_FOUND")
      if echo "$actual_image" | grep -qF "$expected_tag"; then break; fi
      sleep 5
    done
@@ -216,10 +214,9 @@ if echo "$R" | grep -qF "saved"; then
  curl -s -X POST "$BASE/workspaces/$RT_CX_ID/restart" > /dev/null 2>&1
  # Poll up to 30s for the new container image to appear (restart can take a while)
  if command -v docker &>/dev/null; then
-    short_id="${RT_CX_ID:0:12}"
    for _ in 1 2 3 4 5 6; do
      sleep 5
-      actual=$(docker inspect "ws-${short_id}" --format '{{.Config.Image}}' 2>/dev/null || echo "")
+      actual=$(docker inspect "ws-${RT_CX_ID}" --format '{{.Config.Image}}' 2>/dev/null || echo "")
      if echo "$actual" | grep -qF "openclaw"; then break; fi
    done
    _check_image "$RT_CX_ID" "openclaw" "Runtime change codex to openclaw on restart"
@@ -191,8 +191,29 @@ except Exception:
 }

 container_running() {  # container_running <ws-id>  -> echoes name if running
-  local short="${1:0:12}"
-  docker ps --filter "name=ws-${short}" --filter "status=running" --format '{{.Names}}' 2>/dev/null | head -1
+  docker ps --filter "name=ws-${1}" --filter "status=running" --format '{{.Names}}' 2>/dev/null | head -1
+}
+
+diagnose_provision() {
+  local wsid="${1:-}"
+  local container
+  container=$(container_running "$wsid")
+  echo "--- DIAGNOSE provisioning for $wsid ---"
+  echo "last_sample_error: ${LAST:-<none>}"
+  echo "container_running: ${container:-<none>}"
+  if [ -n "$container" ]; then
+    echo "--- container logs ($container) ---"
+    docker logs "$container" 2>&1 | tail -n 60 || true
+    echo "--- container env ---"
+    docker inspect "$container" --format '{{json .Config.Env}}' 2>&1 || true
+    echo "--- container reachability test ---"
+    docker exec "$container" sh -c 'echo "platform_url=$PLATFORM_URL"; wget -qO- "$PLATFORM_URL/health" 2>&1 || true' || true
+  fi
+  echo "--- all ws-* containers ---"
+  docker ps --filter "name=ws-" --format '{{.Names}} {{.Status}}' 2>/dev/null || true
+  echo "--- all ws-* volumes ---"
+  docker volume ls -q 2>/dev/null | grep '^ws-' || true
+  echo "--- end diagnose ---"
 }

 cleanup() {
@@ -203,16 +224,11 @@ cleanup() {
    # SCOPED teardown — only the workspace this test created. Never a blanket
    # sweep (other dev workspaces may be live on this shared daemon).
    e2e_delete_workspace "$WSID" "" >/dev/null 2>&1 || true
-    local short="${WSID:0:12}"
-    docker rm -f "ws-${short}" >/dev/null 2>&1 || true
-    # Volume naming is split in the provisioner: configs + claude-sessions use the
-    # 12-char short id (ConfigVolumeName/ClaudeSessionVolumeName), but the
-    # /workspace volume uses the FULL UUID (buildWorkspaceMount: ws-<id>-workspace).
-    # Remove BOTH forms so neither leaks.
+    docker rm -f "ws-${WSID}" >/dev/null 2>&1 || true
    docker volume rm -f \
-      "ws-${short}-configs" "ws-${short}-claude-sessions" \
-      "ws-${short}-workspace" "ws-${WSID}-workspace" >/dev/null 2>&1 || true
-    echo "cleaned workspace $WSID + ws-${short} container/volumes"
+      "ws-${WSID}-configs" "ws-${WSID}-claude-sessions" \
+      "ws-${WSID}-workspace" >/dev/null 2>&1 || true
+    echo "cleaned workspace $WSID + ws-${WSID} container/volumes"
  fi
  # Restore the cache tag to whatever it pointed at before we retagged it, so a
  # stub run doesn't leave the real claude-code tag aliased to the stub.
@@ -331,8 +347,7 @@ if [ -z "$WSID" ]; then
  exit 1
 fi
 pass "workspace created: $WSID"
-SHORT="${WSID:0:12}"
-CONFIG_VOL="ws-${SHORT}-configs"
+CONFIG_VOL="ws-${WSID}-configs"

 # Mint a workspace bearer for the WorkspaceAuth-gated secret + /restart calls.
 WTOKEN=$(e2e_mint_workspace_token "$WSID" || true)
@@ -436,8 +451,9 @@ for _ in $(seq 1 "$ONLINE_TIMEOUT"); do
  sleep 1
 done
 check "workspace reached online (status=$STATUS)" "online" "$STATUS"
+if [ "$FAIL" -gt 0 ]; then diagnose_provision "$WSID"; echo "=== Results: $PASS passed, $FAIL failed ==="; exit 1; fi
 RUN=$(container_running "$WSID")
-if [ -n "$RUN" ]; then pass "container running: $RUN"; else fail "no running ws-${WSID:0:12} container" "docker ps shows none"; fi
+if [ -n "$RUN" ]; then pass "container running: $RUN"; else fail "no running ws-${WSID} container" "docker ps shows none"; fi
 echo ""

 # ----------------------------------------------------------------------------
@@ -473,6 +489,7 @@ else
    sleep 1
  done
  check "workspace back online after restart (status=$STATUS)" "online" "$STATUS"
+  if [ "$FAIL" -gt 0 ]; then diagnose_provision "$WSID"; echo "=== Results: $PASS passed, $FAIL failed ==="; exit 1; fi
  # Explicit negative on the exact bug signature.
  if echo "$LAST" | grep -qiF "config volume is empty"; then
    fail "restart hit 'config volume is empty' — restart-survival REGRESSION" "$LAST"
@@ -107,7 +107,6 @@ func (h *MemoriesHandler) withMemoryV2APIs(plugin memoryPluginAPI, resolver name
 	return h
 }

-
 // Commit handles POST /workspaces/:id/memories
 // Stores a memory fact with a scope (LOCAL, TEAM, GLOBAL) and an optional
 // namespace (defaults to "general"). Namespaces implement the Holaboss
@@ -213,6 +212,26 @@ func (h *MemoriesHandler) Commit(c *gin.Context) {
 		return
 	}

+	// Ensure the namespace row exists before the write — the plugin's
+	// CommitMemory contract is "namespace must already exist (auto-created
+	// by handler if not)" and memory_records carries an FK to
+	// memory_namespaces. The MCP tool path (mcp_tools_memory_v2.go) has
+	// always done this upsert; this HTTP path skipped it, so any workspace
+	// whose namespace row was never seeded (every workspace created after
+	// the Phase A2 backfill that only ever wrote through this surface —
+	// i.e. the runtime a2a commit_memory tool and the canvas) failed every
+	// write with `memory_records_namespace_fkey` (fleet-wide, 2026-06-10).
+	// UpsertNamespace is idempotent, so the extra round-trip on warm
+	// namespaces is a cheap no-op.
+	if _, err := h.memv2.plugin.UpsertNamespace(ctx, nsName, contract.NamespaceUpsert{Kind: kindFromNamespace(nsName)}); err != nil {
+		log.Printf(
+			"Commit memory namespace upsert error: workspace=%s scope=%s namespace=%s err_class=%T err=%q",
+			workspaceID, body.Scope, nsName, err, err,
+		)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to store memory"})
+		return
+	}
+
 	// Plugin write. The plugin owns its own embedding generation (FTS
 	// + vector indices are internal to memory_plugin schema), so we no
 	// longer call h.embed here — that becomes dead weight on this path
@@ -11,9 +11,9 @@ import (
 	"testing"
 	"time"

-	"github.com/DATA-DOG/go-sqlmock"
 	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/memory/contract"
 	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/memory/namespace"
+	"github.com/DATA-DOG/go-sqlmock"
 	"github.com/gin-gonic/gin"
 )

@@ -97,6 +97,103 @@ func TestMemoriesCommit_Local_Success(t *testing.T) {
 	}
 }

+// TestMemoriesCommit_UpsertsNamespaceBeforeWrite pins the fleet-wide
+// 2026-06-10 regression: the HTTP Commit path went straight to
+// plugin.CommitMemory without ensuring the namespace row exists, so any
+// workspace whose memory_namespaces row was never seeded (everything
+// created after the Phase A2 backfill that only wrote through this
+// surface — the runtime a2a commit_memory tool and the canvas) failed
+// every write with memory_records_namespace_fkey. The MCP tool path has
+// always upserted first; this asserts the HTTP path does too, and in
+// the right order.
+//
+// MUTATION: drop the UpsertNamespace call in MemoriesHandler.Commit →
+// calls slice misses "upsert" → RED (the exact production failure).
+func TestMemoriesCommit_UpsertsNamespaceBeforeWrite(t *testing.T) {
+	setupTestDB(t)
+	setupTestRedis(t)
+	var calls []string
+	var upsertNS string
+	var upsertKind contract.NamespaceKind
+	plugin := &stubMemoryPlugin{
+		upsertFn: func(_ context.Context, name string, body contract.NamespaceUpsert) (*contract.Namespace, error) {
+			calls = append(calls, "upsert")
+			upsertNS = name
+			upsertKind = body.Kind
+			return &contract.Namespace{Name: name, Kind: body.Kind}, nil
+		},
+		commitFn: func(_ context.Context, ns string, body contract.MemoryWrite) (*contract.MemoryWriteResponse, error) {
+			calls = append(calls, "commit")
+			return &contract.MemoryWriteResponse{ID: "mem-up-1", Namespace: ns}, nil
+		},
+	}
+	handler := NewMemoriesHandler().withMemoryV2APIs(
+		plugin,
+		memCommitResolver("ws-up", contract.NamespaceKindWorkspace),
+	)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-up"}}
+	c.Request = httptest.NewRequest("POST", "/", bytes.NewBufferString(`{"content":"first ever memory","scope":"LOCAL"}`))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.Commit(c)
+
+	if w.Code != http.StatusCreated {
+		t.Fatalf("expected 201, got %d: %s", w.Code, w.Body.String())
+	}
+	if len(calls) != 2 || calls[0] != "upsert" || calls[1] != "commit" {
+		t.Errorf("namespace must be upserted BEFORE the write, got call order %v", calls)
+	}
+	if upsertNS != "workspace:ws-up" {
+		t.Errorf("upsert namespace = %q, want workspace:ws-up", upsertNS)
+	}
+	if upsertKind != contract.NamespaceKindWorkspace {
+		t.Errorf("upsert kind = %q, want %q", upsertKind, contract.NamespaceKindWorkspace)
+	}
+}
+
+// TestMemoriesCommit_UpsertError_500 — an upsert failure must surface as
+// the same stable generic 500 (no plugin internals leaked) and must NOT
+// proceed to the write.
+func TestMemoriesCommit_UpsertError_500(t *testing.T) {
+	setupTestDB(t)
+	setupTestRedis(t)
+	commitCalled := false
+	plugin := &stubMemoryPlugin{
+		upsertFn: func(_ context.Context, _ string, _ contract.NamespaceUpsert) (*contract.Namespace, error) {
+			return nil, errors.New("plugin down")
+		},
+		commitFn: func(_ context.Context, ns string, _ contract.MemoryWrite) (*contract.MemoryWriteResponse, error) {
+			commitCalled = true
+			return &contract.MemoryWriteResponse{ID: "nope", Namespace: ns}, nil
+		},
+	}
+	handler := NewMemoriesHandler().withMemoryV2APIs(
+		plugin,
+		memCommitResolver("ws-up2", contract.NamespaceKindWorkspace),
+	)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-up2"}}
+	c.Request = httptest.NewRequest("POST", "/", bytes.NewBufferString(`{"content":"x","scope":"LOCAL"}`))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.Commit(c)
+
+	if w.Code != http.StatusInternalServerError {
+		t.Errorf("expected 500 on upsert failure, got %d: %s", w.Code, w.Body.String())
+	}
+	if !bytes.Contains(w.Body.Bytes(), []byte("failed to store memory")) {
+		t.Errorf("error body must stay the stable generic message, got %s", w.Body.String())
+	}
+	if commitCalled {
+		t.Error("CommitMemory must not run when the namespace upsert failed")
+	}
+}
+
 func TestMemoriesCommit_Global_AsRoot(t *testing.T) {
 	mock := setupTestDB(t)
 	setupTestRedis(t)
@@ -663,6 +760,7 @@ func TestCommitMemory_LocalScope_NoDelimiterEscape(t *testing.T) {
 		t.Errorf("LOCAL memory content should be stored verbatim, got %q (expected %q)", cap.Body.Content, content)
 	}
 }
+
 // ---------- MemoriesHandler: Update (PATCH) ----------
 //
 // Pin the full Update flow: namespace-only edit, content edit (LOCAL),
@@ -681,4 +779,4 @@ func TestCommitMemory_LocalScope_NoDelimiterEscape(t *testing.T) {
 // Caller passes content + namespace identical to existing values:
 // post-normalisation nothing changed. Return 200 with changed=false,
 // no UPDATE, no audit row. Saves a round-trip + an audit-log entry on
-// idempotent re-edits (e.g. user clicks Save without changing fields).
+// idempotent re-edits (e.g. user clicks Save without changing fields).
@@ -33,6 +33,7 @@ import (

 	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
 	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models"
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner"
 	"github.com/gin-gonic/gin"
 	"github.com/google/uuid"
 )
@@ -239,7 +240,7 @@ func (h *WorkspaceHandler) applyConciergeProvisionConfig(
 		}
 	}
 	if len(base) == 0 && h.provisioner != nil {
-		if b, err := h.provisioner.ExecRead(ctx, configDirName(workspaceID), "/configs/config.yaml"); err == nil {
+		if b, err := h.provisioner.ExecRead(ctx, provisioner.ContainerName(workspaceID), "/configs/config.yaml"); err == nil {
 			base = b
 		}
 	}
@@ -399,7 +400,7 @@ func conciergeIdentityPresent(ctx context.Context, prov localProvisionerIsRunnin
 		// that doesn't expose ExecRead.
 		return true
 	}
-	body, err := reader.ExecRead(ctx, configDirName(id), "/configs/system-prompt.md")
+	body, err := reader.ExecRead(ctx, provisioner.ContainerName(id), "/configs/system-prompt.md")
 	if err != nil {
 		return false
 	}
@@ -31,28 +31,72 @@ type workspaceDisplayResponse struct {
 	Status    string `json:"status,omitempty"`
 }

-// workspaceComputeInstanceAllowlist is keyed by cloud provider (multi-provider /
-// in-place switch): each provider's box accepts only that provider's machine
-// sizes (an AWS t3.* is meaningless on Hetzner, and vice-versa). Mirrors the CP
-// provider SSOT — keep in lock-step with the controlplane provider configs
-// (Hetzner ServerType cpx*/cax*, GCP MachineType e2-*, AWS EC2 t3*/m6i*/c6i*).
-// TestValidateWorkspaceCompute_Provider / _InstanceTypePerProvider pin the sets.
-// "" provider = AWS default.
-var workspaceComputeInstanceAllowlist = map[string]map[string]struct{}{
+// SSOT for cloud-provider + instance-type metadata (core#2489).
+//
+// This file is the SINGLE source of truth the workspace-server validates
+// against AND the canvas Container-Config tab renders its dropdowns from (via
+// GET /workspaces/:id/compute-options, see ComputeOptions below). Previously the
+// canvas hardcoded a parallel copy of these lists in ContainerConfigTab.tsx; the
+// two could drift so the UI offered a (provider, instance-type) the backend
+// allowlist then rejected with a 400. The canvas now derives its options from
+// this endpoint, so drift is impossible by construction.
+//
+// The ordered slices below are the canonical form. workspaceComputeInstanceAllowlist
+// (the O(1) validation set) is DERIVED from them in init(), so the ordered list
+// the canvas renders and the set the backend validates can never disagree.
+//
+// Mirrors the CP provider SSOT — keep in lock-step with the controlplane provider
+// configs (Hetzner ServerType cpx*/cax*, GCP MachineType e2-*, AWS EC2
+// t3*/m6i*/c6i*). TestValidateWorkspaceCompute_Provider / _InstanceTypePerProvider
+// pin the sets. "" provider = AWS default.
+
+// workspaceComputeProvidersOrdered is the canonical provider order (AWS first =
+// default). The canvas renders the provider dropdown in this order.
+var workspaceComputeProvidersOrdered = []string{"aws", "hetzner", "gcp"}
+
+// workspaceComputeInstanceTypesOrdered lists each provider's machine sizes in the
+// order the canvas should render them. An AWS t3.* is meaningless on Hetzner, and
+// vice-versa, so the set is provider-scoped.
+var workspaceComputeInstanceTypesOrdered = map[string][]string{
 	"aws": {
-		"t3.medium": {}, "t3.large": {}, "t3.xlarge": {}, "t3.2xlarge": {},
-		"m6i.large": {}, "m6i.xlarge": {}, "c6i.xlarge": {},
+		"t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge",
+		"m6i.large", "m6i.xlarge", "c6i.xlarge",
 	},
 	"hetzner": {
-		"cpx11": {}, "cpx21": {}, "cpx31": {}, "cpx41": {}, "cpx51": {},
-		"cax11": {}, "cax21": {}, "cax31": {}, "cax41": {},
+		"cpx11", "cpx21", "cpx31", "cpx41", "cpx51",
+		"cax11", "cax21", "cax31", "cax41",
 	},
 	"gcp": {
-		"e2-small": {}, "e2-medium": {},
-		"e2-standard-2": {}, "e2-standard-4": {}, "e2-standard-8": {},
+		"e2-small", "e2-medium",
+		"e2-standard-2", "e2-standard-4", "e2-standard-8",
 	},
 }

+// workspaceComputeDefaultInstanceByProvider is the per-provider default machine
+// size the canvas pre-selects when switching providers (an AWS t3.* is invalid on
+// Hetzner, so the switch resets to the new provider's default).
+var workspaceComputeDefaultInstanceByProvider = map[string]string{
+	"aws":     "t3.medium",
+	"hetzner": "cpx31",
+	"gcp":     "e2-standard-2",
+}
+
+// workspaceComputeInstanceAllowlist is the O(1) validation set, keyed by cloud
+// provider. DERIVED from workspaceComputeInstanceTypesOrdered in init() so the
+// ordered list (what the canvas renders) and the set (what the backend validates)
+// stay in lock-step — you cannot add an instance type to one without the other.
+var workspaceComputeInstanceAllowlist = map[string]map[string]struct{}{}
+
+func init() {
+	for provider, types := range workspaceComputeInstanceTypesOrdered {
+		set := make(map[string]struct{}, len(types))
+		for _, t := range types {
+			set[t] = struct{}{}
+		}
+		workspaceComputeInstanceAllowlist[provider] = set
+	}
+}
+
 // normalizeCloudProvider maps "" → "aws" so the in-place switch comparison
 // treats the default and an explicit "aws" as the same cloud (no spurious switch).
 func normalizeCloudProvider(p string) string {
@@ -88,10 +132,15 @@ func instanceTypeAllowedForProvider(provider, instanceType string) bool {
 // change here (and the CP itself fail-closes an unwired provider with a 422).
 // "" = default (AWS) and is always accepted. This is the gate the switch-provider
 // flow reuses to reject a bad provider with a clean 400 before any CP round-trip.
-var workspaceComputeProviderAllowlist = map[string]struct{}{
-	"aws":     {},
-	"gcp":     {},
-	"hetzner": {},
+// DERIVED from workspaceComputeProvidersOrdered (the SSOT, core#2489) in init() so
+// the set the backend validates and the ordered list the canvas renders cannot
+// drift.
+var workspaceComputeProviderAllowlist = map[string]struct{}{}
+
+func init() {
+	for _, p := range workspaceComputeProvidersOrdered {
+		workspaceComputeProviderAllowlist[p] = struct{}{}
+	}
 }

 func validateWorkspaceCompute(compute models.WorkspaceCompute) error {
@@ -262,6 +311,55 @@ func withStoredCompute(ctx context.Context, workspaceID string, payload models.C
 	return payload
 }

+// workspaceComputeOptionsResponse is the SSOT payload the canvas Container-Config
+// tab consumes to populate its provider + instance-type dropdowns (core#2489).
+// It is derived entirely from the allowlist + defaults in this file, so the UI
+// can never offer a (provider, instance-type) the backend then rejects.
+type workspaceComputeOptionsResponse struct {
+	// Providers in canonical render order (AWS first = default).
+	Providers []string `json:"providers"`
+	// InstanceTypes per provider, in canonical render order.
+	InstanceTypes map[string][]string `json:"instanceTypes"`
+	// Defaults maps each provider → its default instance type (the canvas
+	// pre-selects this when switching providers).
+	Defaults map[string]string `json:"defaults"`
+}
+
+// buildComputeOptions assembles the SSOT response from the allowlist + defaults.
+// Pure (no DB / no gin) so it can be unit-tested directly and reused.
+func buildComputeOptions() workspaceComputeOptionsResponse {
+	providers := make([]string, len(workspaceComputeProvidersOrdered))
+	copy(providers, workspaceComputeProvidersOrdered)
+
+	instanceTypes := make(map[string][]string, len(workspaceComputeInstanceTypesOrdered))
+	for _, p := range providers {
+		src := workspaceComputeInstanceTypesOrdered[p]
+		dst := make([]string, len(src))
+		copy(dst, src)
+		instanceTypes[p] = dst
+	}
+
+	defaults := make(map[string]string, len(workspaceComputeDefaultInstanceByProvider))
+	for k, v := range workspaceComputeDefaultInstanceByProvider {
+		defaults[k] = v
+	}
+
+	return workspaceComputeOptionsResponse{
+		Providers:     providers,
+		InstanceTypes: instanceTypes,
+		Defaults:      defaults,
+	}
+}
+
+// ComputeOptions handles GET /workspaces/:id/compute-options. It returns the
+// cloud-provider + instance-type metadata the canvas Container-Config tab renders
+// its dropdowns from — the SAME data validateWorkspaceCompute enforces (core#2489).
+// Static (derived from the in-binary allowlist), so it needs no DB round-trip; the
+// :id is scoped only by the WorkspaceAuth middleware on the route group.
+func (h *WorkspaceHandler) ComputeOptions(c *gin.Context) {
+	c.JSON(200, buildComputeOptions())
+}
+
 // Display handles GET /workspaces/:id/display.
 func (h *WorkspaceHandler) Display(c *gin.Context) {
 	workspaceID := c.Param("id")
@@ -375,6 +375,103 @@ func TestWithStoredCompute_LoadsComputeForRestartPayloads(t *testing.T) {
 	}
 }

+// core#2489: the allowlist (validation set) MUST be derived from the ordered
+// lists the canvas renders, so the UI and the backend can never disagree about
+// which (provider, instance-type) pairs are valid. This pins that the derived
+// set exactly matches the ordered source — adding to one without the other fails.
+func TestComputeOptions_AllowlistDerivedFromOrderedSSOT(t *testing.T) {
+	// Every ordered instance type is in the validation set (and vice-versa).
+	for provider, types := range workspaceComputeInstanceTypesOrdered {
+		set, ok := workspaceComputeInstanceAllowlist[provider]
+		if !ok {
+			t.Fatalf("allowlist missing provider %q present in ordered SSOT", provider)
+		}
+		if len(set) != len(types) {
+			t.Fatalf("provider %q: ordered list (%d) and allowlist set (%d) drifted", provider, len(types), len(set))
+		}
+		for _, it := range types {
+			if _, ok := set[it]; !ok {
+				t.Fatalf("provider %q: ordered instance %q missing from validation allowlist", provider, it)
+			}
+		}
+	}
+	// No extra providers in the set that aren't in the ordered list.
+	if len(workspaceComputeInstanceAllowlist) != len(workspaceComputeInstanceTypesOrdered) {
+		t.Fatalf("allowlist has providers not present in the ordered SSOT")
+	}
+	// Provider allowlist derived from the ordered providers.
+	if len(workspaceComputeProviderAllowlist) != len(workspaceComputeProvidersOrdered) {
+		t.Fatalf("provider allowlist (%d) drifted from ordered providers (%d)", len(workspaceComputeProviderAllowlist), len(workspaceComputeProvidersOrdered))
+	}
+	for _, p := range workspaceComputeProvidersOrdered {
+		if _, ok := workspaceComputeProviderAllowlist[p]; !ok {
+			t.Fatalf("provider allowlist missing ordered provider %q", p)
+		}
+	}
+}
+
+// core#2489: the per-provider defaults the canvas pre-selects on a provider switch
+// MUST themselves be valid instance types for that provider — otherwise the switch
+// produces a PATCH the backend immediately rejects.
+func TestComputeOptions_DefaultsAreValidForTheirProvider(t *testing.T) {
+	for provider, def := range workspaceComputeDefaultInstanceByProvider {
+		if !instanceTypeAllowedForProvider(provider, def) {
+			t.Errorf("default instance %q for provider %q is not in that provider's allowlist", def, provider)
+		}
+	}
+	// Every provider must have a default (so the switch never lands on "").
+	for _, p := range workspaceComputeProvidersOrdered {
+		if workspaceComputeDefaultInstanceByProvider[p] == "" {
+			t.Errorf("provider %q has no default instance type", p)
+		}
+	}
+}
+
+// core#2489: the GET /compute-options endpoint returns exactly the SSOT data the
+// canvas renders dropdowns from. Every (provider, instance-type) it advertises
+// MUST pass validateWorkspaceCompute — the whole point of the consolidation.
+func TestWorkspaceComputeOptions_ReturnsSSOTAndEveryOptionValidates(t *testing.T) {
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-opts"}}
+	c.Request = httptest.NewRequest("GET", "/workspaces/ws-opts/compute-options", nil)
+
+	handler.ComputeOptions(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected status 200, got %d: %s", w.Code, w.Body.String())
+	}
+	var resp workspaceComputeOptionsResponse
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("failed to parse compute-options response: %v", err)
+	}
+
+	// AWS first (default) in the provider order.
+	if len(resp.Providers) == 0 || resp.Providers[0] != "aws" {
+		t.Fatalf("providers = %v, want aws first", resp.Providers)
+	}
+	// Every advertised (provider, instance-type) must pass backend validation.
+	for _, provider := range resp.Providers {
+		types, ok := resp.InstanceTypes[provider]
+		if !ok || len(types) == 0 {
+			t.Fatalf("compute-options advertised provider %q with no instance types", provider)
+		}
+		for _, it := range types {
+			if !instanceTypeAllowedForProvider(provider, it) {
+				t.Errorf("compute-options advertised %q/%q which the backend rejects (DRIFT)", provider, it)
+			}
+		}
+		def := resp.Defaults[provider]
+		if def == "" {
+			t.Errorf("compute-options missing default for provider %q", provider)
+		} else if !instanceTypeAllowedForProvider(provider, def) {
+			t.Errorf("compute-options default %q for %q fails backend validation", def, provider)
+		}
+	}
+}
+
 func TestWorkspaceDisplay_NonDisplayWorkspaceReturnsUnavailable(t *testing.T) {
 	mock := setupTestDB(t)
 	setupTestRedis(t)
@@ -15,6 +15,7 @@ import (
 	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
 	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/events"
 	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models"
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner"
 	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provlog"
 	"github.com/gin-gonic/gin"
 )
@@ -393,7 +394,7 @@ func (h *WorkspaceHandler) restartRuntimeFromConfig(ctx context.Context, id, wsN
 		return dbRuntime
 	}
 	containerRuntime := dbRuntime
-	containerName := configDirName(id) // ws-{id[:12]}
+	containerName := provisioner.ContainerName(id) // ws-{id} (KI-013 full UUID)
 	if cfgBytes, readErr := h.provisioner.ExecRead(ctx, containerName, "/configs/config.yaml"); readErr == nil {
 		for _, line := range strings.Split(string(cfgBytes), "\n") {
 			line = strings.TrimSpace(line)
@@ -227,3 +227,58 @@ func TestCacheKey_SlugSeparator(t *testing.T) {
 		t.Errorf("cacheKey collides on ambiguous splits")
 	}
 }
+
+func TestTenantSlug(t *testing.T) {
+	t.Setenv("MOLECULE_ORG_SLUG", "acme-corp")
+	if got := tenantSlug(); got != "acme-corp" {
+		t.Errorf("tenantSlug() = %q, want %q", got, "acme-corp")
+	}
+}
+
+func TestTenantSlug_TrimSpace(t *testing.T) {
+	t.Setenv("MOLECULE_ORG_SLUG", "  spaced-slug  ")
+	if got := tenantSlug(); got != "spaced-slug" {
+		t.Errorf("tenantSlug() = %q, want %q", got, "spaced-slug")
+	}
+}
+
+func TestTenantSlug_Empty(t *testing.T) {
+	t.Setenv("MOLECULE_ORG_SLUG", "")
+	if got := tenantSlug(); got != "" {
+		t.Errorf("tenantSlug() = %q, want empty", got)
+	}
+}
+
+func TestCPSessionVerifyURL(t *testing.T) {
+	t.Setenv("CP_UPSTREAM_URL", "https://cp.test")
+	got := cpSessionVerifyURL("acme")
+	want := "https://cp.test/cp/auth/tenant-member?slug=acme"
+	if got != want {
+		t.Errorf("cpSessionVerifyURL() = %q, want %q", got, want)
+	}
+}
+
+func TestCPSessionVerifyURL_TrailingSlash(t *testing.T) {
+	t.Setenv("CP_UPSTREAM_URL", "https://cp.test/")
+	got := cpSessionVerifyURL("acme")
+	want := "https://cp.test/cp/auth/tenant-member?slug=acme"
+	if got != want {
+		t.Errorf("cpSessionVerifyURL() = %q, want %q", got, want)
+	}
+}
+
+func TestCPSessionVerifyURL_EscapeSlug(t *testing.T) {
+	t.Setenv("CP_UPSTREAM_URL", "https://cp.test")
+	got := cpSessionVerifyURL("acme corp")
+	want := "https://cp.test/cp/auth/tenant-member?slug=acme+corp"
+	if got != want {
+		t.Errorf("cpSessionVerifyURL() = %q, want %q", got, want)
+	}
+}
+
+func TestCPSessionVerifyURL_NoCPConfigured(t *testing.T) {
+	t.Setenv("CP_UPSTREAM_URL", "")
+	if got := cpSessionVerifyURL("acme"); got != "" {
+		t.Errorf("cpSessionVerifyURL() = %q, want empty when CP_UPSTREAM_URL unset", got)
+	}
+}
@@ -168,8 +168,17 @@ type cpProvisionRequest struct {
 	// DataPersistence is the per-workspace durable-data choice (internal#734);
 	// CP validates the enum at its provision edge and resolves the data volume
 	// from it. Empty = auto (omitted on the wire).
-	DataPersistence string                 `json:"data_persistence,omitempty"`
-	Display         WorkspaceDisplayConfig `json:"display,omitempty"`
+	DataPersistence string `json:"data_persistence,omitempty"`
+	// Kind forwards the workspace kind ("" / "workspace" ordinary, "platform"
+	// = the org concierge) so the CP can select the platform-agent image
+	// variant — the SaaS mirror of the local Docker provisioner's kind-driven
+	// image preference (RFC docs/design/rfc-platform-agent.md; core#2495 SSOT:
+	// the concierge is a normal workspace provisioned through this same path,
+	// differing ONLY in image + config overlay). Omitted when empty so the
+	// wire shape is unchanged for ordinary workspaces; an older CP simply
+	// ignores the field.
+	Kind    string                 `json:"kind,omitempty"`
+	Display WorkspaceDisplayConfig `json:"display,omitempty"`
 	PlatformURL     string                 `json:"platform_url"`
 	Env             map[string]string      `json:"env"`
 	// ConfigFiles are template + generated config files to write into the
@@ -262,6 +271,7 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string,
 		DiskGB:          cfg.DiskGB,
 		DataPersistence: cfg.DataPersistence,
 		Provider:        cfg.Provider,
+		Kind:            cfg.Kind,
 		Display:         cfg.Display,
 		PlatformURL:     cfg.PlatformURL,
 		Env:             env,
@@ -0,0 +1,77 @@
+package provisioner
+
+// cp_provisioner_kind_test.go — pins the kind passthrough on the CP provision
+// wire (core#2495 SSOT): a kind='platform' workspace (the org concierge) is
+// provisioned through this SAME path as every ordinary workspace, differing
+// only in the image the CP selects — which requires the CP to KNOW the kind.
+// Before this field, the CP picked the plain runtime image, the platform MCP
+// binary was absent, and the concierge hard-failed its MCP readiness gate.
+
+import (
+	"context"
+	"encoding/json"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+)
+
+// startCaptureCP spins a fake CP that captures the provision request body and
+// returns a minimal 201. Returns the provisioner wired at it + the body ptr.
+func startCaptureCP(t *testing.T) (*CPProvisioner, *[]byte) {
+	t.Helper()
+	var body []byte
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		b, _ := io.ReadAll(r.Body)
+		body = b
+		w.WriteHeader(http.StatusCreated)
+		_, _ = w.Write([]byte(`{"instance_id":"i-test","private_ip":"10.0.0.9","state":"pending"}`))
+	}))
+	t.Cleanup(srv.Close)
+	return &CPProvisioner{
+		baseURL:      srv.URL,
+		orgID:        "org-1",
+		sharedSecret: "s3cret",
+		adminToken:   "tok-xyz",
+		httpClient:   srv.Client(),
+	}, &body
+}
+
+// The concierge: kind='platform' must reach the CP verbatim.
+func TestStart_ForwardsPlatformKind(t *testing.T) {
+	p, body := startCaptureCP(t)
+	_, err := p.Start(context.Background(), WorkspaceConfig{
+		WorkspaceID: "ws-concierge",
+		Runtime:     "claude-code",
+		Kind:        WorkspaceKindPlatform,
+		PlatformURL: "https://acme.example.com",
+	})
+	if err != nil {
+		t.Fatalf("Start: %v", err)
+	}
+	var req map[string]any
+	if err := json.Unmarshal(*body, &req); err != nil {
+		t.Fatalf("unmarshal captured body: %v", err)
+	}
+	if got := req["kind"]; got != "platform" {
+		t.Errorf("kind on the CP wire = %v, want \"platform\" — without it the CP picks the plain runtime image and the concierge loses its platform MCP (core#2495)", got)
+	}
+}
+
+// Ordinary workspaces: the wire shape must be UNCHANGED (omitempty) so older
+// CPs see byte-identical requests.
+func TestStart_OmitsKindForOrdinaryWorkspace(t *testing.T) {
+	p, body := startCaptureCP(t)
+	_, err := p.Start(context.Background(), WorkspaceConfig{
+		WorkspaceID: "ws-ordinary",
+		Runtime:     "claude-code",
+		PlatformURL: "https://acme.example.com",
+	})
+	if err != nil {
+		t.Fatalf("Start: %v", err)
+	}
+	if strings.Contains(string(*body), `"kind"`) {
+		t.Errorf("ordinary workspace provision body must omit the kind field (omitempty contract), got: %s", string(*body))
+	}
+}
@@ -253,7 +253,7 @@ func TestStart_SendsTemplateAndGeneratedConfigFiles(t *testing.T) {
 	if err := os.WriteFile(filepath.Join(tmpl, "config.yaml"), []byte("name: template\n"), 0o600); err != nil {
 		t.Fatal(err)
 	}
-	if err := os.WriteFile(filepath.Join(tmpl, "adapter.py"), bytes.Repeat([]byte("x"), cpConfigFilesMaxBytes), 0o600); err != nil {
+	if err := os.WriteFile(filepath.Join(tmpl, "adapter.py"), bytes.Repeat([]byte("x"), cpConfigFilesMaxBytes-100), 0o600); err != nil {
 		t.Fatal(err)
 	}
 	if err := os.Mkdir(filepath.Join(tmpl, "prompts"), 0o700); err != nil {
@@ -378,7 +378,7 @@ func TestStart_CollectsConfigFiles(t *testing.T) {
 	}
 	// adapter.py is within the size limit but is NOT config.yaml or prompts/,
 	// so isCPTemplateConfigFile must exclude it from the transport.
-	if err := os.WriteFile(filepath.Join(tmpl, "adapter.py"), bytes.Repeat([]byte("x"), cpConfigFilesMaxBytes), 0o600); err != nil {
+	if err := os.WriteFile(filepath.Join(tmpl, "adapter.py"), bytes.Repeat([]byte("x"), cpConfigFilesMaxBytes-100), 0o600); err != nil {
 		t.Fatal(err)
 	}

@@ -19,6 +19,7 @@ import (
 	"sync"
 	"time"

+	"github.com/docker/docker/api/types"
 	"github.com/docker/docker/api/types/container"
 	"github.com/docker/docker/api/types/filters"
 	dockerimage "github.com/docker/docker/api/types/image"
@@ -198,6 +199,11 @@ const (

 // ConfigVolumeName returns the Docker named volume for a workspace's configs.
 func ConfigVolumeName(workspaceID string) string {
+	return fmt.Sprintf("ws-%s-configs", workspaceID)
+}
+
+// legacyConfigVolumeName returns the pre-KI-013 truncated config volume name.
+func legacyConfigVolumeName(workspaceID string) string {
 	id := workspaceID
 	if len(id) > 12 {
 		id = id[:12]
@@ -210,6 +216,11 @@ func ConfigVolumeName(workspaceID string) string {
 // config volume so it can be discarded independently (via WORKSPACE_RESET_SESSION
 // or ?reset=true) without wiping the user's config. Issue #12.
 func ClaudeSessionVolumeName(workspaceID string) string {
+	return fmt.Sprintf("ws-%s-claude-sessions", workspaceID)
+}
+
+// legacyClaudeSessionVolumeName returns the pre-KI-013 truncated session volume name.
+func legacyClaudeSessionVolumeName(workspaceID string) string {
 	id := workspaceID
 	if len(id) > 12 {
 		id = id[:12]
@@ -217,9 +228,31 @@ func ClaudeSessionVolumeName(workspaceID string) string {
 	return fmt.Sprintf("ws-%s-claude-sessions", id)
 }

+// dockerClient is the subset of client.Client methods used by Provisioner.
+// Declared as an interface so tests can inject fakes without a real daemon.
+type dockerClient interface {
+	Close() error
+	ContainerCreate(ctx context.Context, config *container.Config, hostConfig *container.HostConfig, networkingConfig *network.NetworkingConfig, platform *ocispec.Platform, containerName string) (container.CreateResponse, error)
+	ContainerExecAttach(ctx context.Context, execID string, config container.ExecAttachOptions) (types.HijackedResponse, error)
+	ContainerExecCreate(ctx context.Context, container string, config container.ExecOptions) (container.ExecCreateResponse, error)
+	ContainerInspect(ctx context.Context, container string) (container.InspectResponse, error)
+	ContainerList(ctx context.Context, options container.ListOptions) ([]container.Summary, error)
+	ContainerLogs(ctx context.Context, container string, options container.LogsOptions) (io.ReadCloser, error)
+	ContainerRemove(ctx context.Context, container string, options container.RemoveOptions) error
+	ContainerRename(ctx context.Context, container, newContainerName string) error
+	ContainerStart(ctx context.Context, container string, options container.StartOptions) error
+	ContainerWait(ctx context.Context, container string, condition container.WaitCondition) (<-chan container.WaitResponse, <-chan error)
+	CopyToContainer(ctx context.Context, container, path string, content io.Reader, options container.CopyToContainerOptions) error
+	ImageInspect(ctx context.Context, image string, opts ...client.ImageInspectOption) (dockerimage.InspectResponse, error)
+	ImagePull(ctx context.Context, ref string, opts dockerimage.PullOptions) (io.ReadCloser, error)
+	VolumeCreate(ctx context.Context, options volume.CreateOptions) (volume.Volume, error)
+	VolumeInspect(ctx context.Context, volumeID string) (volume.Volume, error)
+	VolumeRemove(ctx context.Context, volumeID string, force bool) error
+}
+
 // Provisioner manages Docker containers for workspace agents.
 type Provisioner struct {
-	cli *client.Client
+	cli dockerClient
 }

 // New creates a new Provisioner connected to the local Docker daemon.
@@ -233,6 +266,12 @@ func New() (*Provisioner, error) {

 // ContainerName returns the Docker container name for a workspace.
 func ContainerName(workspaceID string) string {
+	return fmt.Sprintf("ws-%s", workspaceID)
+}
+
+// legacyContainerName returns the pre-KI-013 truncated container name.
+// Used only for backward-compatible lookups during the deploy transition.
+func legacyContainerName(workspaceID string) string {
 	id := workspaceID
 	if len(id) > 12 {
 		id = id[:12]
@@ -474,7 +513,9 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
 		return "", ErrNoBackend
 	}
 	name := ContainerName(cfg.WorkspaceID)
-	configVolume := ConfigVolumeName(cfg.WorkspaceID)
+	// KI-013 deploy safety: prefer legacy truncated config volume if it
+	// already exists, so pre-deploy workspace data is not orphaned.
+	configVolume := p.resolveConfigVolumeName(ctx, cfg.WorkspaceID)

 	// Create named volume for configs (idempotent — no-op if already exists)
 	_, err := p.cli.VolumeCreate(ctx, volume.CreateOptions{
@@ -569,7 +610,9 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
 	// remove the existing volume before recreating it, so the agent
 	// boots with a clean session dir.
 	if cfg.Runtime == "claude-code" {
-		claudeSessionsVolume := ClaudeSessionVolumeName(cfg.WorkspaceID)
+		// KI-013 deploy safety: prefer legacy truncated session volume if it
+		// already exists, so pre-deploy session data is not orphaned.
+		claudeSessionsVolume := p.resolveClaudeSessionVolumeName(ctx, cfg.WorkspaceID)
 		resetEnv, _ := strconv.ParseBool(cfg.EnvVars["WORKSPACE_RESET_SESSION"])
 		if cfg.ResetClaudeSession || resetEnv {
 			if rmErr := p.cli.VolumeRemove(ctx, claudeSessionsVolume, true); rmErr != nil {
@@ -1288,7 +1331,7 @@ func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, t
 	if p == nil || p.cli == nil {
 		return ErrNoBackend
 	}
-	volName := ConfigVolumeName(workspaceID)
+	volName := p.resolveConfigVolumeName(ctx, workspaceID)
 	resp, err := p.cli.ContainerCreate(ctx, &container.Config{
 		Image: "alpine",
 		Cmd:   []string{"sh", "-c", writeAuthTokenVolumeCmd()},
@@ -1315,23 +1358,136 @@ func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, t
 	return nil
 }

+// resolveConfigVolumeName returns the effective config volume name for a
+// workspace.  KI-013 deploy safety: if a legacy truncated-name volume exists,
+// it is migrated in-place to the new full-ID name so existing workspace data
+// is preserved AND all workspaces eventually use collision-safe names.
+func (p *Provisioner) resolveConfigVolumeName(ctx context.Context, workspaceID string) string {
+	if p == nil || p.cli == nil {
+		return ConfigVolumeName(workspaceID)
+	}
+	newName := ConfigVolumeName(workspaceID)
+	legacy := legacyConfigVolumeName(workspaceID)
+	if err := p.migrateVolumeIfNeeded(ctx, newName, legacy); err != nil {
+		log.Printf("Provisioner: volume migration warning for %s: %v", workspaceID, err)
+	}
+	return newName
+}
+
+// resolveClaudeSessionVolumeName returns the effective claude-sessions volume
+// name.  KI-013 deploy safety: legacy truncated-name volumes are migrated
+// in-place to the new full-ID name.
+func (p *Provisioner) resolveClaudeSessionVolumeName(ctx context.Context, workspaceID string) string {
+	if p == nil || p.cli == nil {
+		return ClaudeSessionVolumeName(workspaceID)
+	}
+	newName := ClaudeSessionVolumeName(workspaceID)
+	legacy := legacyClaudeSessionVolumeName(workspaceID)
+	if err := p.migrateVolumeIfNeeded(ctx, newName, legacy); err != nil {
+		log.Printf("Provisioner: session volume migration warning for %s: %v", workspaceID, err)
+	}
+	return newName
+}
+
+// migrateVolumeIfNeeded renames a legacy truncated-name Docker volume to its
+// new full-ID name by copying data via a temporary alpine container.  If the
+// legacy volume does not exist, or the new volume already exists, this is a
+// no-op.  The operation is idempotent — calling it multiple times is safe.
+func (p *Provisioner) migrateVolumeIfNeeded(ctx context.Context, newName, legacyName string) error {
+	if p == nil || p.cli == nil {
+		return nil
+	}
+
+	// Legacy volume missing — nothing to migrate.
+	if _, err := p.cli.VolumeInspect(ctx, legacyName); err != nil {
+		return nil
+	}
+
+	// New volume already exists — migration already done (or new workspace).
+	if _, err := p.cli.VolumeInspect(ctx, newName); err == nil {
+		return nil
+	}
+
+	// Create the new volume.
+	if _, err := p.cli.VolumeCreate(ctx, volume.CreateOptions{
+		Name:   newName,
+		Labels: managedLabels(),
+	}); err != nil {
+		return fmt.Errorf("create new volume %s: %w", newName, err)
+	}
+
+	// Copy data from legacy to new via a short-lived alpine container.
+	resp, err := p.cli.ContainerCreate(ctx, &container.Config{
+		Image: "alpine",
+		Cmd:   []string{"sh", "-c", "cp -a /legacy/. /new/"},
+	}, &container.HostConfig{
+		Binds: []string{
+			legacyName + ":/legacy",
+			newName + ":/new",
+		},
+	}, nil, nil, "")
+	if err != nil {
+		return fmt.Errorf("create migration container: %w", err)
+	}
+	defer p.cli.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true})
+
+	if err := p.cli.ContainerStart(ctx, resp.ID, container.StartOptions{}); err != nil {
+		return fmt.Errorf("start migration container: %w", err)
+	}
+
+	waitCh, errCh := p.cli.ContainerWait(ctx, resp.ID, container.WaitConditionNotRunning)
+	select {
+	case waitResp := <-waitCh:
+		if waitResp.StatusCode != 0 {
+			return fmt.Errorf("migration copy failed (exit %d) — preserving legacy volume %s for retry", waitResp.StatusCode, legacyName)
+		}
+	case err := <-errCh:
+		if err != nil {
+			return fmt.Errorf("migration container exited with error: %w", err)
+		}
+	}
+
+	// Explicitly remove the migration container before removing the legacy
+	// volume so the volume is no longer referenced. The deferred remove above
+	// is a safety-net for early-return paths.
+	_ = p.cli.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true})
+
+	// Best-effort cleanup of the legacy volume.  If removal fails (e.g. still
+	// referenced by a running container), the new volume is already populated
+	// and the next restart will retry.
+	if err := p.cli.VolumeRemove(ctx, legacyName, true); err != nil {
+		log.Printf("Provisioner: warning: failed to remove legacy volume %s after migration: %v", legacyName, err)
+	} else {
+		log.Printf("Provisioner: migrated legacy volume %s → %s", legacyName, newName)
+	}
+	return nil
+}
+
 // RemoveVolume removes the config volume for a workspace.
 // Also removes the claude-sessions volume (best-effort, may not exist
 // for non claude-code runtimes). Issue #12.
+// Also removes the claude-sessions volume (best-effort, may not exist
+// for non claude-code runtimes). Issue #12.
 func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) error {
 	if p == nil || p.cli == nil {
 		return ErrNoBackend
 	}
-	volName := ConfigVolumeName(workspaceID)
-	if err := p.cli.VolumeRemove(ctx, volName, true); err != nil {
-		return fmt.Errorf("failed to remove volume %s: %w", volName, err)
+	// KI-013 deploy safety: remove both new full-ID name and legacy
+	// truncated name if present, so pre-deploy volumes are not orphaned.
+	removed := false
+	for _, volName := range []string{ConfigVolumeName(workspaceID), legacyConfigVolumeName(workspaceID)} {
+		if err := p.cli.VolumeRemove(ctx, volName, true); err == nil {
+			log.Printf("Provisioner: removed config volume %s", volName)
+			removed = true
+		}
 	}
-	log.Printf("Provisioner: removed config volume %s", volName)
-	csName := ClaudeSessionVolumeName(workspaceID)
-	if rmErr := p.cli.VolumeRemove(ctx, csName, true); rmErr != nil {
-		log.Printf("Provisioner: claude-sessions volume cleanup warning for %s: %v", csName, rmErr)
-	} else {
-		log.Printf("Provisioner: removed claude-sessions volume %s", csName)
+	if !removed {
+		return fmt.Errorf("failed to remove config volume for %s", workspaceID)
+	}
+	for _, csName := range []string{ClaudeSessionVolumeName(workspaceID), legacyClaudeSessionVolumeName(workspaceID)} {
+		if rmErr := p.cli.VolumeRemove(ctx, csName, true); rmErr == nil {
+			log.Printf("Provisioner: removed claude-sessions volume %s", csName)
+		}
 	}
 	return nil
 }
@@ -1354,37 +1510,34 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
 	if p == nil || p.cli == nil {
 		return ErrNoBackend
 	}
-	name := ContainerName(workspaceID)
-
-	// Force-remove kills and removes in one atomic operation, bypassing
-	// the restart policy entirely.
-	err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true})
-	if err == nil {
-		log.Printf("Provisioner: stopped and removed container %s", name)
-		return nil
+	// KI-013 deploy safety: try new full-ID name first, then fall back to
+	// the old truncated name so pre-deploy containers are still stoppable.
+	names := []string{ContainerName(workspaceID), legacyContainerName(workspaceID)}
+	for _, name := range names {
+		// Force-remove kills and removes in one atomic operation, bypassing
+		// the restart policy entirely.
+		err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true})
+		if err == nil {
+			log.Printf("Provisioner: stopped and removed container %s", name)
+			return nil
+		}
+		if isContainerNotFound(err) {
+			// Try the next name (legacy fallback). If both miss, the
+			// container is genuinely gone — post-condition satisfied.
+			continue
+		}
+		if isRemovalInProgress(err) {
+			// Another concurrent caller is already removing this container.
+			log.Printf("Provisioner: container %s removal already in progress (no-op)", name)
+			return nil
+		}
+		// Real failure: daemon timeout, socket EOF, ctx cancellation, etc.
+		log.Printf("Provisioner: force-remove failed for %s: %v", name, err)
+		return fmt.Errorf("force-remove %s: %w", name, err)
 	}
-	if isContainerNotFound(err) {
-		// Container was already gone — the post-condition we want is
-		// satisfied. Don't surface as an error.
-		log.Printf("Provisioner: container %s already gone (no-op)", name)
-		return nil
-	}
-	if isRemovalInProgress(err) {
-		// Another concurrent caller (orphan sweeper, sibling cascade
-		// delete, manual `docker rm -f`) is already removing this
-		// container. The post-condition is the same as success: the
-		// container WILL be gone shortly. Surfacing this as a 500 on
-		// cascade-delete causes UI confusion ("workspace marked
-		// removed, but stop call(s) failed — please retry") even
-		// though retrying would just race the same in-flight removal.
-		log.Printf("Provisioner: container %s removal already in progress (no-op)", name)
-		return nil
-	}
-	// Real failure: daemon timeout, socket EOF, ctx cancellation, etc.
-	// Caller (workspace_crud.stopAndRemove, orphan_sweeper.sweepOnce)
-	// must propagate this so they can skip the follow-up RemoveVolume.
-	log.Printf("Provisioner: force-remove failed for %s: %v", name, err)
-	return fmt.Errorf("force-remove %s: %w", name, err)
+	// Both names missed — container was already gone.
+	log.Printf("Provisioner: container %s already gone (no-op)", ContainerName(workspaceID))
+	return nil
 }

 // IsRunning checks if a workspace container is currently running.
@@ -1440,12 +1593,48 @@ func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool,
 // transient errors into the same "" return as a genuinely-stopped container.
 // That hid daemon flakes as misleading 503 "container not running" responses
 // AND let the two impls drift on edge-case behavior. This is the SSOT.
-func RunningContainerName(ctx context.Context, cli *client.Client, workspaceID string) (string, error) {
+// isNilDockerClient reports whether cli is nil or a typed nil pointer
+// (e.g. (*client.Client)(nil) passed as a dockerClient interface value).
+// Required because a non-nil interface holding a nil pointer does not == nil.
+func isNilDockerClient(cli dockerClient) bool {
 	if cli == nil {
+		return true
+	}
+	switch c := cli.(type) {
+	case *client.Client:
+		return c == nil
+	default:
+		return false
+	}
+}
+
+func RunningContainerName(ctx context.Context, cli dockerClient, workspaceID string) (string, error) {
+	if isNilDockerClient(cli) {
 		return "", ErrNoBackend
 	}
-	name := ContainerName(workspaceID)
-	info, err := cli.ContainerInspect(ctx, name)
+
+	newName := ContainerName(workspaceID)
+	legacyName := legacyContainerName(workspaceID)
+
+	// If a legacy container is still running, rename it in-place to the
+	// new full-ID name so all callers converge on collision-safe names.
+	legacyInfo, legacyErr := cli.ContainerInspect(ctx, legacyName)
+	if legacyErr == nil && legacyInfo.State.Running {
+		if _, newErr := cli.ContainerInspect(ctx, newName); isContainerNotFound(newErr) {
+			if renameErr := cli.ContainerRename(ctx, legacyName, newName); renameErr == nil {
+				log.Printf("Provisioner: renamed legacy container %s → %s", legacyName, newName)
+				return newName, nil
+			} else {
+				log.Printf("Provisioner: warning: failed to rename legacy container %s → %s: %v", legacyName, newName, renameErr)
+			}
+		}
+		// Rename not possible (or new name already occupied) — return legacy
+		// name so the caller can still exec into the live container.
+		return legacyName, nil
+	}
+
+	// Standard path: look for a running container with the new name.
+	info, err := cli.ContainerInspect(ctx, newName)
 	if err != nil {
 		if isContainerNotFound(err) {
 			return "", nil
@@ -1453,7 +1642,7 @@ func RunningContainerName(ctx context.Context, cli *client.Client, workspaceID s
 		return "", err
 	}
 	if info.State.Running {
-		return name, nil
+		return newName, nil
 	}
 	return "", nil
 }
@@ -1504,8 +1693,15 @@ func isRemovalInProgress(err error) bool {
 }

 // DockerClient returns the underlying Docker client for sharing with other handlers.
+// If the provisioner is backed by a fake (e.g. in unit tests), this returns nil.
 func (p *Provisioner) DockerClient() *client.Client {
-	return p.cli
+	if p == nil || p.cli == nil {
+		return nil
+	}
+	if c, ok := p.cli.(*client.Client); ok {
+		return c
+	}
+	return nil
 }

 // Close cleans up the Docker client.
@@ -0,0 +1,461 @@
+package provisioner
+
+import (
+	"context"
+	"errors"
+	"io"
+	"strings"
+	"sync"
+	"testing"
+
+	"github.com/docker/docker/api/types"
+	"github.com/docker/docker/api/types/container"
+	"github.com/docker/docker/api/types/image"
+	"github.com/docker/docker/api/types/network"
+	"github.com/docker/docker/client"
+	"github.com/docker/docker/api/types/volume"
+	ocispec "github.com/opencontainers/image-spec/specs-go/v1"
+)
+
+// fakeDockerClient is a minimal in-memory implementation of the dockerClient
+// interface used by the provisioner. It records calls and lets tests simulate
+// the legacy/full-ID naming transition without a real Docker daemon.
+type fakeDockerClient struct {
+	mu sync.Mutex
+
+	volumes    map[string]volume.Volume
+	containers map[string]container.InspectResponse
+
+	renameErr         error
+	migrationExitCode int64
+
+	volumeInspectCalls    []string
+	volumeCreateCalls     []volume.CreateOptions
+	volumeRemoveCalls     []string
+	containerInspectCalls []string
+	containerRemoveCalls  []string
+	containerRenameCalls  []struct{ Old, New string }
+	containerCreateCalls  []string
+	containerStartCalls   []string
+}
+
+func newFakeDockerClient() *fakeDockerClient {
+	return &fakeDockerClient{
+		volumes:    make(map[string]volume.Volume),
+		containers: make(map[string]container.InspectResponse),
+	}
+}
+
+func (f *fakeDockerClient) Close() error { return nil }
+
+func (f *fakeDockerClient) ContainerCreate(ctx context.Context, config *container.Config, hostConfig *container.HostConfig, networkingConfig *network.NetworkingConfig, platform *ocispec.Platform, containerName string) (container.CreateResponse, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.containerCreateCalls = append(f.containerCreateCalls, containerName)
+	return container.CreateResponse{ID: "cid-" + containerName}, nil
+}
+
+func (f *fakeDockerClient) ContainerExecAttach(ctx context.Context, execID string, config container.ExecAttachOptions) (types.HijackedResponse, error) {
+	return types.HijackedResponse{}, errors.New("not implemented")
+}
+
+func (f *fakeDockerClient) ContainerExecCreate(ctx context.Context, ctr string, config container.ExecOptions) (container.ExecCreateResponse, error) {
+	return container.ExecCreateResponse{}, errors.New("not implemented")
+}
+
+func (f *fakeDockerClient) ContainerInspect(ctx context.Context, name string) (container.InspectResponse, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.containerInspectCalls = append(f.containerInspectCalls, name)
+	if c, ok := f.containers[name]; ok {
+		return c, nil
+	}
+	return container.InspectResponse{}, errors.New("No such container: " + name)
+}
+
+func (f *fakeDockerClient) ContainerList(ctx context.Context, options container.ListOptions) ([]container.Summary, error) {
+	return nil, errors.New("not implemented")
+}
+
+func (f *fakeDockerClient) ContainerLogs(ctx context.Context, container string, options container.LogsOptions) (io.ReadCloser, error) {
+	return nil, errors.New("not implemented")
+}
+
+func (f *fakeDockerClient) ContainerRemove(ctx context.Context, name string, options container.RemoveOptions) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.containerRemoveCalls = append(f.containerRemoveCalls, name)
+	if _, ok := f.containers[name]; ok {
+		delete(f.containers, name)
+		return nil
+	}
+	return errors.New("No such container: " + name)
+}
+
+func (f *fakeDockerClient) ContainerRename(ctx context.Context, oldName, newName string) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.containerRenameCalls = append(f.containerRenameCalls, struct{ Old, New string }{oldName, newName})
+	if f.renameErr != nil {
+		return f.renameErr
+	}
+	if c, ok := f.containers[oldName]; ok {
+		delete(f.containers, oldName)
+		c.Name = newName
+		f.containers[newName] = c
+	}
+	return nil
+}
+
+func (f *fakeDockerClient) ContainerStart(ctx context.Context, id string, options container.StartOptions) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.containerStartCalls = append(f.containerStartCalls, id)
+	return nil
+}
+
+func (f *fakeDockerClient) ContainerWait(ctx context.Context, id string, condition container.WaitCondition) (<-chan container.WaitResponse, <-chan error) {
+	waitCh := make(chan container.WaitResponse, 1)
+	errCh := make(chan error)
+	f.mu.Lock()
+	code := f.migrationExitCode
+	f.mu.Unlock()
+	waitCh <- container.WaitResponse{StatusCode: code}
+	close(waitCh)
+	return waitCh, errCh
+}
+
+func (f *fakeDockerClient) CopyToContainer(ctx context.Context, container, path string, content io.Reader, options container.CopyToContainerOptions) error {
+	return errors.New("not implemented")
+}
+
+func (f *fakeDockerClient) ImageInspect(ctx context.Context, img string, opts ...client.ImageInspectOption) (image.InspectResponse, error) {
+	return image.InspectResponse{}, errors.New("not implemented")
+}
+
+func (f *fakeDockerClient) ImagePull(ctx context.Context, ref string, opts image.PullOptions) (io.ReadCloser, error) {
+	return nil, errors.New("not implemented")
+}
+
+func (f *fakeDockerClient) VolumeCreate(ctx context.Context, options volume.CreateOptions) (volume.Volume, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.volumeCreateCalls = append(f.volumeCreateCalls, options)
+	v := volume.Volume{Name: options.Name, Labels: options.Labels}
+	f.volumes[options.Name] = v
+	return v, nil
+}
+
+func (f *fakeDockerClient) VolumeInspect(ctx context.Context, name string) (volume.Volume, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.volumeInspectCalls = append(f.volumeInspectCalls, name)
+	if v, ok := f.volumes[name]; ok {
+		return v, nil
+	}
+	return volume.Volume{}, errors.New("No such volume: " + name)
+}
+
+func (f *fakeDockerClient) VolumeRemove(ctx context.Context, name string, force bool) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.volumeRemoveCalls = append(f.volumeRemoveCalls, name)
+	if _, ok := f.volumes[name]; ok {
+		delete(f.volumes, name)
+		return nil
+	}
+	return errors.New("No such volume: " + name)
+}
+
+func runningContainer(name string) container.InspectResponse {
+	return container.InspectResponse{
+		ContainerJSONBase: &container.ContainerJSONBase{
+			Name:  name,
+			State: &container.State{Running: true},
+		},
+	}
+}
+
+const migrateTestWorkspaceID = "abcdef1234567890"
+
+func TestResolveConfigVolumeName_LegacyExists_MigratesInPlace(t *testing.T) {
+	ctx := context.Background()
+	cli := newFakeDockerClient()
+	p := &Provisioner{cli: cli}
+
+	newName := ConfigVolumeName(migrateTestWorkspaceID)
+	legacyName := legacyConfigVolumeName(migrateTestWorkspaceID)
+	cli.volumes[legacyName] = volume.Volume{Name: legacyName}
+
+	got := p.resolveConfigVolumeName(ctx, migrateTestWorkspaceID)
+	if got != newName {
+		t.Fatalf("resolveConfigVolumeName: got %q, want %q", got, newName)
+	}
+
+	// Migration path must inspect both names, then create new volume and copy.
+	if !strings.Contains(strings.Join(cli.volumeInspectCalls, ","), legacyName) {
+		t.Fatalf("expected VolumeInspect(%s) for legacy presence check", legacyName)
+	}
+	if !strings.Contains(strings.Join(cli.volumeInspectCalls, ","), newName) {
+		t.Fatalf("expected VolumeInspect(%s) to check for existing new volume", newName)
+	}
+	if len(cli.volumeCreateCalls) == 0 || cli.volumeCreateCalls[0].Name != newName {
+		t.Fatalf("expected VolumeCreate(%s)", newName)
+	}
+	if len(cli.containerCreateCalls) == 0 {
+		t.Fatal("expected migration container creation")
+	}
+	if len(cli.containerStartCalls) == 0 {
+		t.Fatal("expected migration container start")
+	}
+	if len(cli.containerRemoveCalls) == 0 {
+		t.Fatal("expected migration container removal")
+	}
+
+	// Legacy volume must be removed so it is not orphaned.
+	removed := false
+	for _, n := range cli.volumeRemoveCalls {
+		if n == legacyName {
+			removed = true
+			break
+		}
+	}
+	if !removed {
+		t.Fatalf("legacy volume %s was not removed after migration — orphan risk", legacyName)
+	}
+	if _, ok := cli.volumes[legacyName]; ok {
+		t.Fatalf("legacy volume %s still present in fake client after migration", legacyName)
+	}
+	if _, ok := cli.volumes[newName]; !ok {
+		t.Fatalf("new volume %s must exist after migration", newName)
+	}
+}
+
+func TestResolveConfigVolumeName_LegacyAbsent_NoMigration(t *testing.T) {
+	ctx := context.Background()
+	cli := newFakeDockerClient()
+	p := &Provisioner{cli: cli}
+
+	newName := ConfigVolumeName(migrateTestWorkspaceID)
+	legacyName := legacyConfigVolumeName(migrateTestWorkspaceID)
+
+	got := p.resolveConfigVolumeName(ctx, migrateTestWorkspaceID)
+	if got != newName {
+		t.Fatalf("resolveConfigVolumeName: got %q, want %q", got, newName)
+	}
+
+	// Should check legacy once and short-circuit.
+	if len(cli.volumeInspectCalls) != 1 || cli.volumeInspectCalls[0] != legacyName {
+		t.Fatalf("expected exactly one VolumeInspect call for legacy name, got %v", cli.volumeInspectCalls)
+	}
+	if len(cli.volumeCreateCalls) != 0 {
+		t.Fatalf("expected no VolumeCreate when legacy absent, got %v", cli.volumeCreateCalls)
+	}
+	if len(cli.volumeRemoveCalls) != 0 {
+		t.Fatalf("expected no VolumeRemove when legacy absent, got %v", cli.volumeRemoveCalls)
+	}
+}
+
+func TestResolveClaudeSessionVolumeName_LegacyExists_MigratesInPlace(t *testing.T) {
+	ctx := context.Background()
+	cli := newFakeDockerClient()
+	p := &Provisioner{cli: cli}
+
+	newName := ClaudeSessionVolumeName(migrateTestWorkspaceID)
+	legacyName := legacyClaudeSessionVolumeName(migrateTestWorkspaceID)
+	cli.volumes[legacyName] = volume.Volume{Name: legacyName}
+
+	got := p.resolveClaudeSessionVolumeName(ctx, migrateTestWorkspaceID)
+	if got != newName {
+		t.Fatalf("resolveClaudeSessionVolumeName: got %q, want %q", got, newName)
+	}
+
+	removed := false
+	for _, n := range cli.volumeRemoveCalls {
+		if n == legacyName {
+			removed = true
+			break
+		}
+	}
+	if !removed {
+		t.Fatalf("legacy session volume %s was not removed after migration — orphan risk", legacyName)
+	}
+	if _, ok := cli.volumes[newName]; !ok {
+		t.Fatalf("new session volume %s must exist after migration", newName)
+	}
+}
+
+func TestResolveClaudeSessionVolumeName_LegacyAbsent_NoMigration(t *testing.T) {
+	ctx := context.Background()
+	cli := newFakeDockerClient()
+	p := &Provisioner{cli: cli}
+
+	newName := ClaudeSessionVolumeName(migrateTestWorkspaceID)
+	legacyName := legacyClaudeSessionVolumeName(migrateTestWorkspaceID)
+
+	got := p.resolveClaudeSessionVolumeName(ctx, migrateTestWorkspaceID)
+	if got != newName {
+		t.Fatalf("resolveClaudeSessionVolumeName: got %q, want %q", got, newName)
+	}
+	if len(cli.volumeInspectCalls) != 1 || cli.volumeInspectCalls[0] != legacyName {
+		t.Fatalf("expected exactly one VolumeInspect call for legacy session name, got %v", cli.volumeInspectCalls)
+	}
+	if len(cli.volumeCreateCalls) != 0 {
+		t.Fatalf("expected no VolumeCreate when legacy absent, got %v", cli.volumeCreateCalls)
+	}
+}
+
+func TestMigrateVolumeIfNeeded_CopyFails_PreservesLegacy(t *testing.T) {
+	ctx := context.Background()
+	cli := newFakeDockerClient()
+	p := &Provisioner{cli: cli}
+
+	newName := ConfigVolumeName(migrateTestWorkspaceID)
+	legacyName := legacyConfigVolumeName(migrateTestWorkspaceID)
+	cli.volumes[legacyName] = volume.Volume{Name: legacyName}
+	cli.migrationExitCode = 1
+
+	if err := p.migrateVolumeIfNeeded(ctx, newName, legacyName); err == nil {
+		t.Fatal("expected migration error when copy container exits non-zero")
+	}
+
+	// Legacy volume must survive a failed copy so no data is lost.
+	if _, ok := cli.volumes[legacyName]; !ok {
+		t.Fatal("legacy volume must be preserved when migration copy fails (data-loss guard)")
+	}
+}
+
+func TestStop_FullIDAbsent_LegacyRemoved(t *testing.T) {
+	ctx := context.Background()
+	cli := newFakeDockerClient()
+	p := &Provisioner{cli: cli}
+
+	newName := ContainerName(migrateTestWorkspaceID)
+	legacyName := legacyContainerName(migrateTestWorkspaceID)
+	cli.containers[legacyName] = runningContainer(legacyName)
+
+	if err := p.Stop(ctx, migrateTestWorkspaceID); err != nil {
+		t.Fatalf("Stop failed: %v", err)
+	}
+
+	if len(cli.containerRemoveCalls) < 2 {
+		t.Fatalf("expected Remove on full-id then legacy, got %v", cli.containerRemoveCalls)
+	}
+	if cli.containerRemoveCalls[0] != newName {
+		t.Fatalf("expected first remove target %q, got %q", newName, cli.containerRemoveCalls[0])
+	}
+	if cli.containerRemoveCalls[1] != legacyName {
+		t.Fatalf("expected second remove target %q, got %q", legacyName, cli.containerRemoveCalls[1])
+	}
+	if _, ok := cli.containers[legacyName]; ok {
+		t.Fatal("legacy container still present after Stop")
+	}
+}
+
+func TestStop_BothAbsent_IsNoOp(t *testing.T) {
+	ctx := context.Background()
+	cli := newFakeDockerClient()
+	p := &Provisioner{cli: cli}
+
+	newName := ContainerName(migrateTestWorkspaceID)
+	legacyName := legacyContainerName(migrateTestWorkspaceID)
+
+	if err := p.Stop(ctx, migrateTestWorkspaceID); err != nil {
+		t.Fatalf("Stop failed: %v", err)
+	}
+
+	if len(cli.containerRemoveCalls) != 2 {
+		t.Fatalf("expected 2 remove attempts, got %v", cli.containerRemoveCalls)
+	}
+	if cli.containerRemoveCalls[0] != newName || cli.containerRemoveCalls[1] != legacyName {
+		t.Fatalf("expected remove order [%q, %q], got %v", newName, legacyName, cli.containerRemoveCalls)
+	}
+}
+
+func TestRunningContainerName_LegacyRunning_RenameFails_FallsBackToLegacy(t *testing.T) {
+	ctx := context.Background()
+	cli := newFakeDockerClient()
+
+	newName := ContainerName(migrateTestWorkspaceID)
+	legacyName := legacyContainerName(migrateTestWorkspaceID)
+	cli.containers[legacyName] = runningContainer(legacyName)
+	cli.renameErr = errors.New("daemon rename failed")
+
+	got, err := RunningContainerName(ctx, cli, migrateTestWorkspaceID)
+	if err != nil {
+		t.Fatalf("RunningContainerName returned error: %v", err)
+	}
+	if got != legacyName {
+		t.Fatalf("expected fallback to legacy name %q, got %q", legacyName, got)
+	}
+
+	if len(cli.containerInspectCalls) < 2 {
+		t.Fatalf("expected inspect of legacy and new names, got %v", cli.containerInspectCalls)
+	}
+	if cli.containerInspectCalls[0] != legacyName {
+		t.Fatalf("expected legacy inspect first, got %v", cli.containerInspectCalls)
+	}
+
+	renamed := false
+	for _, r := range cli.containerRenameCalls {
+		if r.Old == legacyName && r.New == newName {
+			renamed = true
+			break
+		}
+	}
+	if !renamed {
+		t.Fatalf("expected rename attempt %q -> %q, got %v", legacyName, newName, cli.containerRenameCalls)
+	}
+}
+
+func TestRunningContainerName_LegacyRunning_RenameSucceeds(t *testing.T) {
+	ctx := context.Background()
+	cli := newFakeDockerClient()
+
+	newName := ContainerName(migrateTestWorkspaceID)
+	legacyName := legacyContainerName(migrateTestWorkspaceID)
+	cli.containers[legacyName] = runningContainer(legacyName)
+
+	got, err := RunningContainerName(ctx, cli, migrateTestWorkspaceID)
+	if err != nil {
+		t.Fatalf("RunningContainerName returned error: %v", err)
+	}
+	if got != newName {
+		t.Fatalf("expected new name %q after rename, got %q", newName, got)
+	}
+	if _, ok := cli.containers[legacyName]; ok {
+		t.Fatal("legacy container should have been renamed away")
+	}
+	if _, ok := cli.containers[newName]; !ok {
+		t.Fatal("new container name should exist after rename")
+	}
+}
+
+func TestRunningContainerName_NewRunning_ReturnsNew(t *testing.T) {
+	ctx := context.Background()
+	cli := newFakeDockerClient()
+
+	newName := ContainerName(migrateTestWorkspaceID)
+	cli.containers[newName] = runningContainer(newName)
+
+	got, err := RunningContainerName(ctx, cli, migrateTestWorkspaceID)
+	if err != nil {
+		t.Fatalf("RunningContainerName returned error: %v", err)
+	}
+	if got != newName {
+		t.Fatalf("expected new name %q, got %q", newName, got)
+	}
+}
+
+func TestRunningContainerName_BothAbsent_ReturnsEmpty(t *testing.T) {
+	ctx := context.Background()
+	cli := newFakeDockerClient()
+
+	got, err := RunningContainerName(ctx, cli, migrateTestWorkspaceID)
+	if err != nil {
+		t.Fatalf("RunningContainerName returned error: %v", err)
+	}
+	if got != "" {
+		t.Fatalf("expected empty name when neither container exists, got %q", got)
+	}
+}
@@ -2,14 +2,19 @@ package provisioner

 import (
 	"archive/tar"
+	"context"
 	"errors"
 	"io"
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"testing"
+	"time"

 	"github.com/docker/docker/api/types/container"
+	"github.com/docker/docker/api/types/volume"
+	"github.com/docker/docker/client"
 )

 // TestValidateConfigSource covers issue #17: a workspace restart with no
@@ -425,7 +430,7 @@ func TestContainerName(t *testing.T) {
 	}{
 		{"short", "ws-short"},
 		{"exactly12ch", "ws-exactly12ch"},
-		{"longer-than-twelve-characters", "ws-longer-than-"},
+		{"longer-than-twelve-characters", "ws-longer-than-twelve-characters"},
 		{"abc", "ws-abc"},
 	}

@@ -437,6 +442,17 @@ func TestContainerName(t *testing.T) {
 	}
 }

+// TestContainerName_DistinctSamePrefix12 is a regression guard for KI-013:
+// two UUIDs sharing the same first 12 characters must produce distinct
+// container names (the old 12-char truncation caused collisions).
+func TestContainerName_DistinctSamePrefix12(t *testing.T) {
+	id1 := "123456789abc-4def-1234-567890abcdef"
+	id2 := "123456789abc-4def-1234-567890abcdf0"
+	if ContainerName(id1) == ContainerName(id2) {
+		t.Fatalf("ContainerName must differ for same-first-12 UUIDs: both = %q", ContainerName(id1))
+	}
+}
+
 // TestConfigVolumeName verifies config volume naming.
 func TestConfigVolumeName(t *testing.T) {
 	tests := []struct {
@@ -445,7 +461,7 @@ func TestConfigVolumeName(t *testing.T) {
 	}{
 		{"short", "ws-short-configs"},
 		{"exactly12ch", "ws-exactly12ch-configs"},
-		{"longer-than-twelve-characters", "ws-longer-than--configs"},
+		{"longer-than-twelve-characters", "ws-longer-than-twelve-characters-configs"},
 		{"abc", "ws-abc-configs"},
 	}

@@ -457,10 +473,19 @@ func TestConfigVolumeName(t *testing.T) {
 	}
 }

+// TestConfigVolumeName_DistinctSamePrefix12 is a regression guard for KI-013.
+func TestConfigVolumeName_DistinctSamePrefix12(t *testing.T) {
+	id1 := "123456789abc-4def-1234-567890abcdef"
+	id2 := "123456789abc-4def-1234-567890abcdf0"
+	if ConfigVolumeName(id1) == ConfigVolumeName(id2) {
+		t.Fatalf("ConfigVolumeName must differ for same-first-12 UUIDs: both = %q", ConfigVolumeName(id1))
+	}
+}
+
 // ---------- #12 — claude-sessions volume naming ----------

 // TestClaudeSessionVolumeName_Deterministic: same ID → same volume name, and
-// the name follows the ws-<id[:12]>-claude-sessions shape used everywhere
+// the name follows the ws-<id>-claude-sessions shape used everywhere
 // else in the provisioner.
 func TestClaudeSessionVolumeName_Deterministic(t *testing.T) {
 	tests := []struct {
@@ -469,7 +494,7 @@ func TestClaudeSessionVolumeName_Deterministic(t *testing.T) {
 	}{
 		{"short", "ws-short-claude-sessions"},
 		{"exactly12ch", "ws-exactly12ch-claude-sessions"},
-		{"longer-than-twelve-characters", "ws-longer-than--claude-sessions"},
+		{"longer-than-twelve-characters", "ws-longer-than-twelve-characters-claude-sessions"},
 		{"abc", "ws-abc-claude-sessions"},
 	}
 	for _, tt := range tests {
@@ -484,6 +509,15 @@ func TestClaudeSessionVolumeName_Deterministic(t *testing.T) {
 	}
 }

+// TestClaudeSessionVolumeName_DistinctSamePrefix12 is a regression guard for KI-013.
+func TestClaudeSessionVolumeName_DistinctSamePrefix12(t *testing.T) {
+	id1 := "123456789abc-4def-1234-567890abcdef"
+	id2 := "123456789abc-4def-1234-567890abcdf0"
+	if ClaudeSessionVolumeName(id1) == ClaudeSessionVolumeName(id2) {
+		t.Fatalf("ClaudeSessionVolumeName must differ for same-first-12 UUIDs: both = %q", ClaudeSessionVolumeName(id1))
+	}
+}
+
 // TestClaudeSessionVolumeName_DistinctFromConfig ensures we never alias the
 // claude-sessions volume onto the config volume (deleting one must not wipe
 // the other in RemoveVolume's cleanup path).
@@ -1393,3 +1427,181 @@ func TestApplyTierConfig_T3_DefaultCap(t *testing.T) {
 		t.Errorf("T3 default NanoCPUs: got %d, want %d", hc.NanoCPUs, wantCPU)
 	}
 }
+
+// TestMigrateVolumeIfNeeded_ExistingTruncatedVolume verifies the KI-013 deploy
+// safety path: when a legacy truncated-name volume already exists, data is
+// copied to the new full-ID name and the legacy volume is removed.  Existing
+// workspace state is preserved without operator intervention.
+func TestMigrateVolumeIfNeeded_ExistingTruncatedVolume(t *testing.T) {
+	ctx := context.Background()
+
+	cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
+	if err != nil {
+		t.Skip("docker client unavailable:", err)
+	}
+	if _, pingErr := cli.Ping(ctx); pingErr != nil {
+		t.Skip("docker daemon unreachable:", pingErr)
+	}
+
+	p := &Provisioner{cli: cli}
+	workspaceID := "test-migrate-" + strconv.FormatInt(time.Now().UnixNano(), 10)
+	legacyName := legacyConfigVolumeName(workspaceID)
+	newName := ConfigVolumeName(workspaceID)
+
+	// Cleanup before and after (defensive — avoid pollution on retries).
+	_ = cli.VolumeRemove(ctx, legacyName, true)
+	_ = cli.VolumeRemove(ctx, newName, true)
+	defer func() {
+		_ = cli.VolumeRemove(ctx, legacyName, true)
+		_ = cli.VolumeRemove(ctx, newName, true)
+	}()
+
+	// 1. Create legacy volume and seed it with a sentinel file.
+	if _, err := cli.VolumeCreate(ctx, volume.CreateOptions{Name: legacyName}); err != nil {
+		t.Fatalf("create legacy volume: %v", err)
+	}
+	seedResp, err := cli.ContainerCreate(ctx, &container.Config{
+		Image: "alpine",
+		Cmd:   []string{"sh", "-c", "echo sentinel-data > /vol/sentinel.txt"},
+	}, &container.HostConfig{
+		Binds: []string{legacyName + ":/vol"},
+	}, nil, nil, "")
+	if err != nil {
+		t.Fatalf("create seed container: %v", err)
+	}
+	defer cli.ContainerRemove(ctx, seedResp.ID, container.RemoveOptions{Force: true})
+	if err := cli.ContainerStart(ctx, seedResp.ID, container.StartOptions{}); err != nil {
+		t.Fatalf("start seed container: %v", err)
+	}
+	waitCh, errCh := cli.ContainerWait(ctx, seedResp.ID, container.WaitConditionNotRunning)
+	select {
+	case <-waitCh:
+	case err := <-errCh:
+		if err != nil {
+			t.Fatalf("seed container failed: %v", err)
+		}
+	}
+	// Remove the seed container before migration so the legacy volume is
+	// no longer referenced by any container. The deferred remove above is a
+	// safety net for panic/early-return paths.
+	if err := cli.ContainerRemove(ctx, seedResp.ID, container.RemoveOptions{Force: true}); err != nil {
+		t.Fatalf("remove seed container: %v", err)
+	}
+
+	// 2. Run migration.
+	if err := p.migrateVolumeIfNeeded(ctx, newName, legacyName); err != nil {
+		t.Fatalf("migrateVolumeIfNeeded failed: %v", err)
+	}
+
+	// 3. Legacy volume must be gone.
+	if _, inspectErr := cli.VolumeInspect(ctx, legacyName); inspectErr == nil {
+		t.Fatalf("legacy volume %s still exists after migration", legacyName)
+	}
+
+	// 4. New volume must exist and contain the sentinel file.
+	if _, inspectErr := cli.VolumeInspect(ctx, newName); inspectErr != nil {
+		t.Fatalf("new volume %s does not exist after migration: %v", newName, inspectErr)
+	}
+
+	readResp, err := cli.ContainerCreate(ctx, &container.Config{
+		Image: "alpine",
+		Cmd:   []string{"cat", "/vol/sentinel.txt"},
+	}, &container.HostConfig{
+		Binds: []string{newName + ":/vol"},
+	}, nil, nil, "")
+	if err != nil {
+		t.Fatalf("create read container: %v", err)
+	}
+	defer cli.ContainerRemove(ctx, readResp.ID, container.RemoveOptions{Force: true})
+	if err := cli.ContainerStart(ctx, readResp.ID, container.StartOptions{}); err != nil {
+		t.Fatalf("start read container: %v", err)
+	}
+	waitCh, errCh = cli.ContainerWait(ctx, readResp.ID, container.WaitConditionNotRunning)
+	select {
+	case <-waitCh:
+	case err := <-errCh:
+		if err != nil {
+			t.Fatalf("read container failed: %v", err)
+		}
+	}
+
+	logs, err := cli.ContainerLogs(ctx, readResp.ID, container.LogsOptions{ShowStdout: true})
+	if err != nil {
+		t.Fatalf("read container logs: %v", err)
+	}
+	defer logs.Close()
+	data, err := io.ReadAll(logs)
+	if err != nil {
+		t.Fatalf("read logs: %v", err)
+	}
+	if !strings.Contains(string(data), "sentinel-data") {
+		t.Fatalf("new volume missing sentinel data; logs: %q", data)
+	}
+
+	// 5. Idempotency: second migration must be a no-op.
+	if err := p.migrateVolumeIfNeeded(ctx, newName, legacyName); err != nil {
+		t.Fatalf("second migration (idempotency) failed: %v", err)
+	}
+}
+
+// TestInternalURL verifies the container-internal URL shape.
+func TestInternalURL(t *testing.T) {
+	tests := []struct {
+		id   string
+		want string
+	}{
+		{"abc123", "http://ws-abc123:8000"},
+		{"longer-than-twelve-characters", "http://ws-longer-than-twelve-characters:8000"},
+		{"", "http://ws-:8000"},
+	}
+	for _, tt := range tests {
+		got := InternalURL(tt.id)
+		if got != tt.want {
+			t.Errorf("InternalURL(%q) = %q, want %q", tt.id, got, tt.want)
+		}
+	}
+}
+
+// TestApplyTierResources verifies the direct memory/CPU resource application.
+// Note: the T2-fallback for unknown tiers is handled by ApplyTierConfig, not
+// this low-level helper.
+func TestApplyTierResources(t *testing.T) {
+	for _, k := range []string{"TIER2_MEMORY_MB", "TIER2_CPU_SHARES", "TIER3_MEMORY_MB", "TIER3_CPU_SHARES", "TIER4_MEMORY_MB", "TIER4_CPU_SHARES"} {
+		os.Unsetenv(k)
+	}
+
+	tests := []struct {
+		name         string
+		tier         int
+		wantMemory   int64
+		wantNanoCPUs int64
+		wantShares   int64
+	}{
+		{"T1 no cap", 1, 0, 0, 0},
+		{"T2 512MiB 1CPU", 2, 512 * 1024 * 1024, 1_000_000_000, 1024},
+		{"T3 2048MiB 2CPU", 3, 2048 * 1024 * 1024, 2_000_000_000, 2048},
+		{"T4 4096MiB 4CPU", 4, 4096 * 1024 * 1024, 4_000_000_000, 4096},
+		{"unknown tier no cap", 99, 0, 0, 0},
+		{"zero tier no cap", 0, 0, 0, 0},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			hc := &container.HostConfig{}
+			memMB, cpuShares := applyTierResources(hc, tt.tier)
+
+			if memMB != tt.wantMemory/(1024*1024) {
+				t.Errorf("memMB = %d, want %d", memMB, tt.wantMemory/(1024*1024))
+			}
+			if hc.Memory != tt.wantMemory {
+				t.Errorf("Memory = %d, want %d", hc.Memory, tt.wantMemory)
+			}
+			if hc.NanoCPUs != tt.wantNanoCPUs {
+				t.Errorf("NanoCPUs = %d, want %d", hc.NanoCPUs, tt.wantNanoCPUs)
+			}
+			if cpuShares != tt.wantShares {
+				t.Errorf("cpuShares = %d, want %d", cpuShares, tt.wantShares)
+			}
+		})
+	}
+}
@@ -234,6 +234,13 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
 		// this specific workspace, or a control-plane-verified tenant session.
 		wsAuth.PATCH("", wh.Update)

+		// Compute options — SSOT for the canvas Container-Config tab's cloud-
+		// provider + instance-type dropdowns (core#2489). Returns the same
+		// provider/instance metadata validateWorkspaceCompute enforces, so the UI
+		// can never offer a (provider, instance-type) the PATCH then rejects with
+		// a 400. Static (derived from the in-binary allowlist) — no DB round-trip.
+		wsAuth.GET("/compute-options", wh.ComputeOptions)
+
 		// Lifecycle
 		wsAuth.GET("/state", wh.State)
 		wsAuth.POST("/restart", wh.Restart)
@@ -1163,3 +1163,109 @@ func TestSanitizeUTF8(t *testing.T) {
 		t.Errorf("sanitizeUTF8 did not produce valid UTF-8: %x", []byte(out))
 	}
 }
+
+// ── TestClassifyTaskState ───────────────────────────────────────────────────
+
+func TestClassifyTaskState_NoStatus(t *testing.T) {
+	result := map[string]json.RawMessage{"other": json.RawMessage(`"x"`)}
+	if got := classifyTaskState(result); got != "" {
+		t.Errorf("classifyTaskState(no status) = %q, want empty", got)
+	}
+}
+
+func TestClassifyTaskState_OKStates(t *testing.T) {
+	for _, state := range []string{"", "submitted", "working", "completed"} {
+		result := map[string]json.RawMessage{
+			"status": json.RawMessage(`{"state":"` + state + `"}`),
+		}
+		if got := classifyTaskState(result); got != "" {
+			t.Errorf("classifyTaskState(%q) = %q, want empty (OK state)", state, got)
+		}
+	}
+}
+
+func TestClassifyTaskState_FailureState(t *testing.T) {
+	result := map[string]json.RawMessage{
+		"status": json.RawMessage(`{"state":"failed"}`),
+	}
+	if got := classifyTaskState(result); got != "failed" {
+		t.Errorf("classifyTaskState(failed) = %q, want failed", got)
+	}
+}
+
+func TestClassifyTaskState_MalformedStatus(t *testing.T) {
+	result := map[string]json.RawMessage{
+		"status": json.RawMessage(`{broken`),
+	}
+	if got := classifyTaskState(result); got != "" {
+		t.Errorf("classifyTaskState(malformed) = %q, want empty", got)
+	}
+}
+
+// ── TestIsEmptyResponse ─────────────────────────────────────────────────────
+
+func TestIsEmptyResponse_EmptyBody(t *testing.T) {
+	if !isEmptyResponse([]byte{}) {
+		t.Error("isEmptyResponse(empty) should be true")
+	}
+}
+
+func TestIsEmptyResponse_NoResponseGenerated(t *testing.T) {
+	if !isEmptyResponse([]byte(`(no response generated)`)) {
+		t.Error("isEmptyResponse(no-response-generated) should be true")
+	}
+}
+
+func TestIsEmptyResponse_TextFieldEmpty(t *testing.T) {
+	if !isEmptyResponse([]byte(`{"result":{"parts":[{"text":""}]}}`)) {
+		t.Error("isEmptyResponse(empty text field) should be true")
+	}
+}
+
+func TestIsEmptyResponse_TextFieldNoResponse(t *testing.T) {
+	if !isEmptyResponse([]byte(`{"result":{"parts":[{"text":"(no response generated)"}]}}`)) {
+		t.Error("isEmptyResponse(text=no-response-generated) should be true")
+	}
+}
+
+func TestIsEmptyResponse_HasContent(t *testing.T) {
+	if isEmptyResponse([]byte(`{"result":{"parts":[{"text":"hello"}]}}`)) {
+		t.Error("isEmptyResponse(with content) should be false")
+	}
+}
+
+// ── TestA2AErrorFromBody ────────────────────────────────────────────────────
+
+func TestA2AErrorFromBody_Empty(t *testing.T) {
+	if got := a2aErrorFromBody([]byte{}); got != "" {
+		t.Errorf("a2aErrorFromBody(empty) = %q, want empty", got)
+	}
+}
+
+func TestA2AErrorFromBody_JSONRPCMessage(t *testing.T) {
+	body := []byte(`{"error":{"code":-32603,"message":"internal error"}}`)
+	if got := a2aErrorFromBody(body); got != "internal error" {
+		t.Errorf("a2aErrorFromBody(JSON-RPC) = %q, want internal error", got)
+	}
+}
+
+func TestA2AErrorFromBody_PlainString(t *testing.T) {
+	body := []byte(`{"error":"something went wrong"}`)
+	if got := a2aErrorFromBody(body); got != "something went wrong" {
+		t.Errorf("a2aErrorFromBody(plain) = %q, want something went wrong", got)
+	}
+}
+
+func TestA2AErrorFromBody_NoError(t *testing.T) {
+	body := []byte(`{"result":"ok"}`)
+	if got := a2aErrorFromBody(body); got != "" {
+		t.Errorf("a2aErrorFromBody(no error) = %q, want empty", got)
+	}
+}
+
+func TestA2AErrorFromBody_InvalidJSON(t *testing.T) {
+	body := []byte(`{broken`)
+	if got := a2aErrorFromBody(body); got != "" {
+		t.Errorf("a2aErrorFromBody(invalid) = %q, want empty", got)
+	}
+}