Merge pull request #2202 from Molecule-AI/staging

staging → main: e2e teardown patience (#2201) one-time bridge
This commit is contained in:
Hongming Wang 2026-04-28 12:40:38 -07:00 committed by GitHub
commit 9f2878d185
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 1009 additions and 80 deletions

View File

@@ -79,6 +79,7 @@ jobs:
id-token: write # PyPI Trusted Publisher (OIDC) — no PYPI_TOKEN needed
outputs:
version: ${{ steps.version.outputs.version }}
wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }}
steps:
- uses: actions/checkout@v4
@@ -129,6 +130,28 @@ jobs:
working-directory: ${{ runner.temp }}/runtime-build
run: python -m build
- name: Capture wheel SHA256 for cascade content-verification
# Recorded BEFORE upload so the cascade probe can verify the
# bytes Fastly serves under the new version's URL match what
# we built. Closes a hole left by #2197: that probe verified
# pip can resolve the version (catches propagation lag) but
# not that the wheel content matches (would silently pass a
# Fastly stale-content scenario where the new version's URL
# serves an old wheel binary).
id: wheel_hash
working-directory: ${{ runner.temp }}/runtime-build
run: |
set -eu
WHEEL=$(ls dist/*.whl 2>/dev/null | head -1)
if [ -z "$WHEEL" ]; then
echo "::error::No .whl in dist/ — `python -m build` must have failed silently"
exit 1
fi
HASH=$(sha256sum "$WHEEL" | awk '{print $1}')
echo "wheel_sha256=${HASH}" >> "$GITHUB_OUTPUT"
echo "Local wheel SHA256 (pre-upload): ${HASH}"
echo "Wheel filename: $(basename "$WHEEL")"
- name: Verify package contents (sanity)
working-directory: ${{ runner.temp }}/runtime-build
run: |
@@ -212,6 +235,56 @@ jobs:
default_output_modes=['text/plain', 'application/json'],
)
print('✓ AgentCard call-shape smoke passed')
# Well-known agent-card path probe alignment. main.py's
# _send_initial_prompt() polls AGENT_CARD_WELL_KNOWN_PATH
# to know when the local A2A server is ready. If the SDK
# ever splits the constant value from the path that
# create_agent_card_routes() actually mounts at, every
# workspace silently drops its initial_prompt:
# - Probe gets 404 every attempt.
# - Falls through to 'server not ready after 30s,
# skipping' even though the server is fine.
# - The user hits a fresh chat with no kickoff context.
# This was the #2193 incident class — the v0.x → v1.x
# rename of /.well-known/agent.json → /.well-known/agent-card.json
# plus the constant itself moving to a2a.utils.constants.
# The source-tree pytest (test_agent_card_well_known_path.py)
# catches main.py-side regressions; this catches the
# SDK-side ones BEFORE PyPI upload.
from a2a.utils.constants import AGENT_CARD_WELL_KNOWN_PATH
from a2a.server.routes import create_agent_card_routes
mounted_paths = [
getattr(r, 'path', None)
for r in create_agent_card_routes(
AgentCard(
name='wk-smoke',
description='well-known mount alignment',
version='0.0.0-smoke',
)
)
]
assert AGENT_CARD_WELL_KNOWN_PATH in mounted_paths, (
f'AGENT_CARD_WELL_KNOWN_PATH ({AGENT_CARD_WELL_KNOWN_PATH!r}) '
f'is NOT among paths mounted by create_agent_card_routes '
f'({mounted_paths!r}). The SDK constant and its own route '
f'factory have drifted — workspace probes will 404 forever, '
f'silently dropping every workspace initial_prompt.'
)
print(f'✓ well-known mount alignment OK ({AGENT_CARD_WELL_KNOWN_PATH})')
# Message helper smoke. a2a-sdk renamed
# new_agent_text_message → new_text_message in the v1.x
# protobuf-flat migration (per the v0→v1 cheat sheet). main.py
# and a2a_executor.py call new_text_message in hot paths; if
# the import breaks, every reply errors with ImportError before
# the message even leaves the workspace. Importing here
# catches a future v2.x rename at publish time.
from a2a.helpers import new_text_message
msg = new_text_message('smoke')
assert msg is not None, 'new_text_message returned None'
print('✓ message helper import + call OK')
print('✓ smoke import passed')
"
@@ -239,28 +312,98 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Wait for PyPI to propagate the new version
# PyPI accepts the upload, then takes a few seconds to make it
# available via the package index. If the cascade fires too
# fast, downstream template builds run `pip install` against
# an index that hasn't seen the new version yet — they resolve
# to the previous one, and docker layer cache then locks that
# in for subsequent rebuilds (the cache trap that bit us five
# times tonight).
# PyPI accepts the upload, then takes a few seconds to make the
# new version visible across all THREE surfaces pip touches:
# 1. /pypi/<pkg>/<ver>/json — metadata endpoint
# 2. /simple/<pkg>/ — pip's primary download index
# 3. files.pythonhosted.org — CDN-fronted wheel binary
# Each has its own cache. The previous check polled only (1)
# and would let the cascade fire while (2) or (3) still served
# the previous version, so downstream `pip install` resolved
# to the old wheel. Docker layer cache then locked that stale
# resolution in for subsequent rebuilds (the cache trap that
# bit us five times in one night).
#
# Poll PyPI's JSON API for up to 60s. Cheap (~50ms per poll),
# avoids over-trusting "publish job said success."
# Two-stage probe per poll:
# (a) `pip install --no-cache-dir PACKAGE==VERSION` — succeeds
# only when the version is resolvable. Catches surface (1)
# and (2) propagation lag.
# (b) `pip download` of the same wheel + SHA256 compare against
# the just-built dist's hash. Catches surface (3) lag AND
# Fastly serving stale content under the new version's URL
# (a separate Fastly-corruption mode that pip-install alone
# can't see, since pip install resolves+unpacks against
# whatever bytes Fastly returns and never inspects them).
# Both must pass before the cascade fans out.
#
# The venv is reused across polls; only `pip install`/`pip
# download` run in the loop, with --force-reinstall +
# --no-cache-dir so the previous poll's cached state doesn't
# mask propagation lag.
env:
RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
EXPECTED_SHA256: ${{ needs.publish.outputs.wheel_sha256 }}
run: |
set -eu
if [ -z "$EXPECTED_SHA256" ]; then
echo "::error::publish job did not expose wheel_sha256 — cannot verify wheel content. Refusing to fan out cascade."
exit 1
fi
python -m venv /tmp/propagation-probe
PROBE=/tmp/propagation-probe/bin
$PROBE/pip install --upgrade --quiet pip
# Poll budget: 30 attempts × (~3-5s pip install + ~3s pip
# download + 4s sleep) ≈ 5-6 min wall on a slow GH runner.
# Generous vs PyPI's typical few-seconds propagation;
# failures past this are signal of a real PyPI / Fastly
# issue, not just lag.
for i in $(seq 1 30); do
if curl -fsS "https://pypi.org/pypi/molecule-ai-workspace-runtime/${RUNTIME_VERSION}/json" >/dev/null 2>&1; then
echo "::notice::✓ PyPI serving ${RUNTIME_VERSION} after ${i} polls"
exit 0
# Stage (a): can pip resolve and install the version?
if $PROBE/pip install \
--quiet \
--no-cache-dir \
--force-reinstall \
--no-deps \
"molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
>/dev/null 2>&1; then
INSTALLED=$($PROBE/pip show molecule-ai-workspace-runtime 2>/dev/null \
| awk -F': ' '/^Version:/{print $2}')
if [ "$INSTALLED" = "$RUNTIME_VERSION" ]; then
# Stage (b): does Fastly serve the bytes we uploaded?
# `pip download` writes the actual .whl file to disk so
# we can sha256sum it (vs `pip install` which unpacks
# and discards).
rm -rf /tmp/probe-dl
mkdir -p /tmp/probe-dl
if $PROBE/pip download \
--quiet \
--no-cache-dir \
--no-deps \
--dest /tmp/probe-dl \
"molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
>/dev/null 2>&1; then
WHEEL=$(ls /tmp/probe-dl/*.whl 2>/dev/null | head -1)
if [ -n "$WHEEL" ]; then
ACTUAL=$(sha256sum "$WHEEL" | awk '{print $1}')
if [ "$ACTUAL" = "$EXPECTED_SHA256" ]; then
echo "::notice::✓ pip resolves AND wheel content matches after ${i} poll(s) (sha256=${EXPECTED_SHA256})"
exit 0
fi
# Hash mismatch: PyPI accepted our upload but Fastly
# is serving different bytes under the version's URL.
# Most often this is propagation lag of the BINARY
# surface — the version is resolvable but the wheel
# cache hasn't caught up. Retry.
echo "::warning::poll ${i}: wheel content mismatch (got ${ACTUAL:0:12}…, want ${EXPECTED_SHA256:0:12}…) — Fastly likely still serving stale binary, retrying"
fi
fi
fi
fi
sleep 2
sleep 4
done
echo "::error::PyPI never propagated ${RUNTIME_VERSION} within 60s — refusing to fan out cascade against stale index"
echo "::error::pip never resolved molecule-ai-workspace-runtime==${RUNTIME_VERSION} with matching wheel content within ~5 min."
echo "::error::Expected wheel SHA256: ${EXPECTED_SHA256}"
echo "::error::Refusing to fan out cascade against stale or corrupt PyPI surfaces."
exit 1
- name: Fan out repository_dispatch

View File

@@ -10,15 +10,29 @@ name: Runtime Pin Compatibility
# 4. Every tenant workspace crashed; the canary tenant caught it but
# only after 5 hours of degraded staging
#
# This workflow installs the runtime in a fresh Python venv from PyPI
# and tries the same import the EC2 user-data does. If pip resolution
# silently produces a broken combo, this gate fails before the tenant
# image gets published.
# This workflow installs the CURRENTLY PUBLISHED runtime from PyPI on
# top of `workspace/requirements.txt` and smoke-imports. Catches:
# - Upstream PyPI yanks
# - Bad re-releases of molecule-ai-workspace-runtime
# - Already-shipped wheels that stop importing because a transitive
# dep moved underneath
#
# This is the "PyPI artifact health" half of pin compatibility. The
# companion workflow `runtime-prbuild-compat.yml` covers the
# "PR-introduced breakage" half by building the wheel from THIS PR's
# workspace/ source. Splitting the two means each gets a narrow
# `paths:` filter — the pypi-latest job no longer fires on doc-only
# workspace/ edits whose content can't change what's currently on PyPI.
on:
push:
branches: [main, staging]
paths:
# Narrow filter: pypi-latest is sensitive only to changes that
# affect what we're INSTALLING (requirements.txt) or WHAT THE
# CHECK ITSELF DOES (this workflow file). Edits to workspace/
# source code don't change what's on PyPI right now, so they
# don't change this gate's verdict.
- 'workspace/requirements.txt'
- '.github/workflows/runtime-pin-compat.yml'
pull_request:
@@ -42,8 +56,8 @@ concurrency:
cancel-in-progress: true
jobs:
default-install:
name: Default install + import smoke
pypi-latest-install:
name: PyPI-latest install + import smoke
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

View File

@@ -0,0 +1,100 @@
name: Runtime PR-Built Compatibility
# Companion to `runtime-pin-compat.yml`. That workflow tests what's
# CURRENTLY PUBLISHED on PyPI; this workflow tests what WOULD BE
# PUBLISHED if THIS PR merges.
#
# Why two workflows: the chicken-and-egg #128 fix added a "PR-built
# wheel" job to the original runtime-pin-compat.yml, but both jobs
# shared a `paths:` filter that was the union of their needs
# (`workspace/**`). That meant the PyPI-latest job ran on every doc
# edit even though the upstream PyPI artifact can't change with our
# workspace/ source. Splitting the two means each gets a narrow
# `paths:` filter that matches the inputs it actually depends on.
#
# Catches the failure mode where a PR adds an import requiring a newer
# SDK than `workspace/requirements.txt` pins:
# 1. Pip resolves the existing PyPI wheel + the old SDK pin → smoke
# passes (it imports the OLD main.py from the wheel, not the PR's
# new main.py).
# 2. Merge → publish-runtime.yml ships a wheel WITH the new import.
# 3. Tenant images redeploy → all crash on first boot with
# ImportError.
#
# By building from the PR's source and smoke-importing THAT wheel, we
# fail at PR-time instead of after publish.
on:
push:
branches: [main, staging]
paths:
# Broad filter: this workflow's verdict can change whenever any
# workspace/ source file changes (because the wheel we build is
# produced from those files), or when the build script itself
# changes (it controls the wheel layout).
- 'workspace/**'
- 'scripts/build_runtime_package.py'
- '.github/workflows/runtime-prbuild-compat.yml'
pull_request:
branches: [main, staging]
paths:
- 'workspace/**'
- 'scripts/build_runtime_package.py'
- '.github/workflows/runtime-prbuild-compat.yml'
workflow_dispatch:
# Required-check support: when this becomes a branch-protection gate,
# merge_group runs let the queue green-check this in addition to PRs.
merge_group:
types: [checks_requested]
# No cron: the same pre-merge run already covered the commit, and
# re-running daily wouldn't surface anything new (workspace/ doesn't
# change between cron firings unless a PR already passed this gate).
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
local-build-install:
# Builds the wheel from THIS PR's workspace/ + scripts/ and tests
# IT — the artifact that WOULD be published if this PR merges.
name: PR-built wheel + import smoke
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: pip
cache-dependency-path: workspace/requirements.txt
- name: Install build tooling
run: pip install build
- name: Build wheel from PR source (mirrors publish-runtime.yml)
# Use a fixed test version so the wheel filename is predictable.
# Doesn't reach PyPI — this build is local-only for the smoke.
# Use the SAME build script with the SAME args as
# publish-runtime.yml's build step. The temp dir path differs
# (`/tmp/runtime-build` here vs `${{ runner.temp }}/runtime-build`
# in publish-runtime.yml — they coincide on ubuntu-latest but
# the call sites are not byte-identical). The smoke import is
# also intentionally narrower than publish's: this gate exists
# to catch SDK-version-import drift specifically; full invariant
# coverage lives in publish-runtime.yml's own pre-PyPI smoke.
run: |
python scripts/build_runtime_package.py \
--version "0.0.0.dev0+pin-compat" \
--out /tmp/runtime-build
cd /tmp/runtime-build && python -m build
- name: Install built wheel + workspace requirements
run: |
python -m venv /tmp/venv-built
/tmp/venv-built/bin/pip install --upgrade pip
/tmp/venv-built/bin/pip install /tmp/runtime-build/dist/*.whl
/tmp/venv-built/bin/pip install -r workspace/requirements.txt
/tmp/venv-built/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
| grep -E '^(Name|Version):'
- name: Smoke import the PR-built wheel
env:
WORKSPACE_ID: 00000000-0000-0000-0000-000000000001
run: |
/tmp/venv-built/bin/python -c "from molecule_runtime.main import main_sync; print('PR-built runtime imports OK')"

View File

@@ -1,6 +1,6 @@
# Ecosystem Watch — Phase 30 Competitive Tracking
**Created by:** PMM
**Date:** 2026-04-21
**Date:** 2026-04-27
**Status:** ACTIVE — competitor monitoring in progress
**Phase:** 30 — Remote Workspaces + Cross-Network Federation
@@ -118,7 +118,7 @@ Track competitor releases and market events that affect Phase 30 positioning. En
- **Check frequency:** Every marketing cycle
- **Trigger:** Any competitor shipping something that invalidates a Phase 30 positioning claim
- **File location:** `docs/ecosystem-watch.md` (origin/main)
- **Last updated by:** PMM | 2026-04-23 (LangGraph PRs verified OPEN; new feat PRs #1730/#1702/#1731 logged; release note written)
- **Last updated by:** PMM | 2026-04-27 (weekly refresh — all competitor versions, stars, and PRs verified current; Clawith v1.9.0 added)
---

View File

@@ -7,20 +7,46 @@ This path is aligned to the current repository and current UI. It gets you from
- Docker + Docker Compose v2
- Node.js 20+
- Go 1.25+
- `jq` (for the template-registry clone in `setup.sh`)
- One model/API key for the runtime you want to use
- `ANTHROPIC_API_KEY`
- `OPENAI_API_KEY`
- `GOOGLE_API_KEY`
- or another provider routed through LiteLLM
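Not sure the tooling is in place? A quick version check (a minimal sketch; it only prints the versions of the tools listed above):
```bash
docker --version && docker compose version   # Compose v2 prints "Docker Compose version v2.x"
node --version                               # needs 20+
go version                                   # needs 1.25+
jq --version
```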
## Step 1: Clone the repository
## The one-command path
```bash
git clone https://github.com/Molecule-AI/molecule-monorepo.git
cd molecule-monorepo
./scripts/dev-start.sh
```
That single script:
1. Writes `MOLECULE_ENV=development` into `.env` (first run only — an existing `.env` is preserved on re-runs)
2. Brings up Postgres, Redis, Langfuse, ClickHouse, and Temporal via `infra/scripts/setup.sh`
3. Populates the workspace template + plugin registry from `manifest.json`
4. Builds and starts the platform on `http://localhost:8080`
5. Installs canvas deps (first run) and starts the canvas on `http://localhost:3000`
6. Prints next-step instructions and tails both processes — `Ctrl-C` tears everything down
Total wall-clock: ~30 seconds for a re-run, ~2 minutes for a first run (npm install + docker pulls).
Once the canvas is up: open it, add your model API key in **Config → Secrets & API Keys → Global**, then click a template card or **+ Create blank workspace**.
## Manual setup (advanced)
If you'd rather run each component yourself — useful when you're iterating on the platform binary or the canvas in isolation — follow the steps below. Each section is what `dev-start.sh` does internally; running them by hand gives you per-component logs and lets you keep one piece running while you restart another.
### Step 1: Clone the repository
```bash
git clone https://github.com/Molecule-AI/molecule-monorepo.git
cd molecule-monorepo
```
## Step 2: Start the shared infrastructure
### Step 2: Start the shared infrastructure
Recommended:
@@ -28,7 +54,7 @@ Recommended:
./infra/scripts/setup.sh
```
That brings up Postgres, Redis, and Langfuse.
That brings up Postgres, Redis, Langfuse, ClickHouse, and Temporal.
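To confirm the stack is actually up before moving on, you can ask the same compose file for container status (a minimal sketch; assumes the `postgres` service name from `docker-compose.infra.yml`):
```bash
# Show the infra containers and their state
docker compose -f docker-compose.infra.yml ps

# Wait for Postgres specifically; the platform needs it before it can start
until docker compose -f docker-compose.infra.yml exec -T postgres pg_isready -q; do
  sleep 1
done
echo "postgres ready"
```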
If you only want the raw compose flow:
@@ -36,7 +62,7 @@ If you only want the raw compose flow:
docker compose -f docker-compose.infra.yml up -d
```
## Step 3: Start the platform
### Step 3: Start the platform
```bash
cd workspace-server
@@ -45,7 +71,7 @@ go run ./cmd/server
The control plane listens on `http://localhost:8080`.
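Before moving on to the canvas, you can poll the same `/health` endpoint that `dev-start.sh` waits on (a minimal sketch, assuming the default port):
```bash
until curl -sf http://localhost:8080/health >/dev/null; do
  sleep 1
done
echo "platform ready"
```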
## Step 4: Start the canvas
### Step 4: Start the canvas
In a new terminal:

View File

@@ -1,69 +1,182 @@
#!/bin/sh
# dev-start.sh — one-command local development environment.
#
# Starts: Postgres, Redis, Platform (Go :8080), Canvas (Next.js :3000)
# Stops all on Ctrl-C.
# What it does (in order):
# 1. Sets MOLECULE_ENV=development in .env if missing (dev-mode auth posture; see #684)
# 2. Runs infra/scripts/setup.sh (postgres + redis + langfuse + clickhouse
# + temporal + populates template/plugin registry from manifest.json)
# 3. Starts the platform (Go :8080), waits for /health
# 4. Starts the canvas (Next.js :3000), waits for HTTP 200
# 5. Prints a readiness banner with API-key add instructions
# 6. On Ctrl-C, kills both background processes and tears down infra
#
# Prerequisites:
# - Docker (for Postgres + Redis)
# - Go 1.25+ (for platform)
# - Node.js 20+ (for canvas)
# - Docker + Docker Compose v2 (for postgres/redis/langfuse/etc)
# - Go 1.25+ (for the platform binary)
# - Node.js 20+ (for the canvas)
# - jq (for setup.sh's manifest clone — optional; without it, the
# template palette will be empty until you run clone-manifest.sh
# manually)
#
# Usage:
# ./scripts/dev-start.sh
# # Open http://localhost:3000
# # Open http://localhost:3000, add your model API key in
# # Config → Secrets & API Keys, then create your first workspace.
#
# Idempotent: re-running picks up where the last run left off (existing
# .env is preserved, npm install skipped if node_modules present, etc).
set -e
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
ENV_FILE="$ROOT/.env"
cleanup() {
echo ""
echo "Shutting down..."
echo "==> Shutting down..."
kill $PLATFORM_PID $CANVAS_PID 2>/dev/null || true
# Use setup.sh's compose file (full infra) since that's what we
# brought up. `down` keeps named volumes by default — call with
# --volumes here only if you want a clean slate (we don't, since
# idempotent re-runs are the usual case).
docker compose -f "$ROOT/docker-compose.infra.yml" down 2>/dev/null || true
echo "Done."
echo " Done."
}
trap cleanup EXIT INT TERM
echo "==> Starting infrastructure (Postgres, Redis)..."
docker compose -f "$ROOT/docker-compose.infra.yml" up -d
# ─────────────────────────────────────────────── 1. dev-mode auth posture
echo "==> Waiting for Postgres..."
until docker compose -f "$ROOT/docker-compose.infra.yml" exec -T postgres pg_isready -q 2>/dev/null; do
sleep 1
done
echo " Postgres ready."
# The AdminAuth middleware closes its fail-open the moment the first
# workspace token lands in the DB — at which point /workspaces and
# other admin routes 401 unless the caller has either ADMIN_TOKEN or
# the dev-mode escape hatch. The canvas at localhost:3000 has no
# bearer token to send, so without one of those two paths it can't
# call admin endpoints after a workspace exists.
#
# For local dev the right posture is the dev-mode escape hatch:
#
# MOLECULE_ENV=development AND ADMIN_TOKEN unset
#
# That makes middleware.isDevModeFailOpen() return true and lets the
# canvas keep working without a bearer. Setting ADMIN_TOKEN here
# would BREAK the canvas (it has no way to read that token in dev).
#
# For SaaS the platform is provisioned with ADMIN_TOKEN set AND
# MOLECULE_ENV=production — either one closes the hatch. So the dev
# mode signal here is safe (it's only active when both other knobs
# are absent).
if [ -f "$ENV_FILE" ] && grep -q '^MOLECULE_ENV=' "$ENV_FILE"; then
echo "==> Reusing MOLECULE_ENV from existing .env"
else
echo "==> Setting MOLECULE_ENV=development in .env (dev-mode auth hatch)"
{
if [ -f "$ENV_FILE" ]; then
cat "$ENV_FILE"
echo ""
fi
echo "# Generated by scripts/dev-start.sh on $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo "# Local-dev auth posture: dev-mode fail-open lets the canvas at"
echo "# localhost:3000 call admin endpoints without a bearer token."
echo "# DO NOT set ADMIN_TOKEN here in dev — it would close the hatch"
echo "# and the canvas would 401 on every admin call."
echo "MOLECULE_ENV=development"
} > "$ENV_FILE.tmp"
mv "$ENV_FILE.tmp" "$ENV_FILE"
echo " Saved to $ENV_FILE"
fi
echo "==> Starting Platform (Go :8080)..."
# Source .env so the platform inherits MOLECULE_ENV (and anything else
# the user has added — e.g. ANTHROPIC_API_KEY for skipping the canvas
# Secrets UI). `set -a` exports every assignment in the sourced file
# without us having to know the var names.
set -a
# shellcheck disable=SC1090
. "$ENV_FILE"
set +a
# ─────────────────────────────────────────────── 2. infra + templates
# Use setup.sh (not raw docker-compose) so the template registry gets
# populated from manifest.json. Without that, the canvas template
# palette is empty and the user has to manually clone repos — exactly
# the friction this script exists to eliminate.
echo "==> Running infra/scripts/setup.sh (infra + template registry)"
"$ROOT/infra/scripts/setup.sh"
# ─────────────────────────────────────────────── 3. platform
echo "==> Starting Platform (Go :8080)"
cd "$ROOT/workspace-server"
go run ./cmd/server &
go run ./cmd/server > /tmp/molecule-platform.log 2>&1 &
PLATFORM_PID=$!
echo "==> Waiting for Platform health..."
until curl -sf http://localhost:8080/health >/dev/null 2>&1; do
echo " Waiting for Platform /health..."
PLATFORM_READY=0
for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
21 22 23 24 25 26 27 28 29 30; do
if curl -sf http://localhost:8080/health >/dev/null 2>&1; then
echo " Platform ready (t+${i}s)"
PLATFORM_READY=1
break
fi
sleep 1
done
echo " Platform ready."
if [ "$PLATFORM_READY" -ne 1 ]; then
echo " ✗ Platform did not respond in 30s — check /tmp/molecule-platform.log"
exit 1
fi
echo "==> Starting Canvas (Next.js :3000)..."
# ─────────────────────────────────────────────── 4. canvas
echo "==> Starting Canvas (Next.js :3000)"
cd "$ROOT/canvas"
if [ ! -d node_modules ]; then
echo " First-run: npm install (~30-60s)"
npm install
fi
npm run dev &
npm run dev > /tmp/molecule-canvas.log 2>&1 &
CANVAS_PID=$!
echo ""
echo "============================================"
echo " Molecule AI dev environment running"
echo ""
echo " Canvas: http://localhost:3000"
echo " Platform: http://localhost:8080"
echo " Postgres: localhost:5432"
echo " Redis: localhost:6379"
echo ""
echo " Press Ctrl-C to stop all services"
echo "============================================"
echo " Waiting for Canvas HTTP 200..."
CANVAS_READY=0
for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
21 22 23 24 25 26 27 28 29 30; do
code=$(curl -sf -o /dev/null -w "%{http_code}" http://localhost:3000/ 2>/dev/null || echo "0")
if [ "$code" = "200" ]; then
echo " Canvas ready (t+${i}s)"
CANVAS_READY=1
break
fi
sleep 1
done
if [ "$CANVAS_READY" -ne 1 ]; then
echo " ✗ Canvas did not respond in 30s — check /tmp/molecule-canvas.log"
exit 1
fi
# ─────────────────────────────────────────────── 5. readiness banner
cat <<EOF
═══════════════════════════════════════════════════════════
Molecule AI dev environment ready
Canvas: http://localhost:3000
Platform: http://localhost:8080
Logs: /tmp/molecule-platform.log
/tmp/molecule-canvas.log
Next steps:
1. Open http://localhost:3000 in a browser.
2. Add your model API key in
Config → Secrets & API Keys → Global
(skip if ANTHROPIC_API_KEY / OPENAI_API_KEY is already
set in .env — the platform inherits it.)
3. Click a template card or "+ Create blank workspace".
Press Ctrl-C to stop all services.
═══════════════════════════════════════════════════════════
EOF
wait

View File

@@ -86,24 +86,47 @@ cleanup_org() {
fi
log "🧹 Tearing down org $SLUG..."
curl "${CURL_COMMON[@]}" -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \
# The DELETE handler runs the GDPR Art. 17 cascade synchronously
# (Stripe + Redis + EC2 terminate + CF tunnel + DNS + DB rows). Real
# observed wall-time on prod-shaped infra is ~30-90s — EC2 termination
# alone takes 30-60s. The 5-15s estimate in `purge.go`'s comment is
# the API-call cost, NOT the AWS-side time-to-termination it waits on.
#
# Two-part patience to match reality:
# 1. 120s curl timeout on the DELETE itself (was 30s) so the
# synchronous cascade has room to complete in-band.
# 2. Poll up to 60s after for organizations.status='purged' (or row
# gone) instead of one rigid 10s sleep — covers the case where
# DELETE returns 5xx mid-cascade and the cascade finishes anyway,
# and the case where DELETE legitimately exceeds 120s and we want
# eventual-consistency confirmation.
curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
-H "Content-Type: application/json" \
-d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1 \
&& ok "Teardown request accepted" \
|| log "Teardown returned non-2xx (may already be gone)"
sleep 10
local leak_count
leak_count=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/admin/orgs" \
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
| python3 -c "import json,sys; d=json.load(sys.stdin); print(sum(1 for o in d.get('orgs', []) if o.get('slug')=='$SLUG' and o.get('status') != 'purged'))" \
2>/dev/null || echo 0)
local leak_count=1
local elapsed=0
while [ "$elapsed" -lt 60 ]; do
leak_count=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/admin/orgs" \
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
| python3 -c "import json,sys; d=json.load(sys.stdin); print(sum(1 for o in d.get('orgs', []) if o.get('slug')=='$SLUG' and o.get('status') != 'purged'))" \
2>/dev/null || echo 1)
if [ "$leak_count" = "0" ]; then
break
fi
sleep 5
elapsed=$((elapsed + 5))
done
if [ "$leak_count" != "0" ]; then
echo "⚠️ LEAK: org $SLUG still present post-teardown (count=$leak_count)" >&2
echo "⚠️ LEAK: org $SLUG still present post-teardown after ${elapsed}s (count=$leak_count)" >&2
exit 4
fi
ok "Teardown clean — no orphan resources for $SLUG"
ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)"
# Normalize unexpected upstream exit codes to 1 (generic failure). The
# script's documented contract (header "Exit codes" section) only emits

View File

@@ -112,12 +112,15 @@ func sweepOnce(parent context.Context, reaper OrphanReaper) {
ctx, cancel := context.WithTimeout(parent, orphanSweepDeadline)
defer cancel()
// Two independent passes. Each handles its own short-circuit; an
// empty result or transient error in one must NOT stop the other,
// Three independent passes. Each handles its own short-circuit; an
// empty result or transient error in one must NOT stop the others,
// since the wiped-DB pass exists precisely for cases where the
// removed-row pass finds zero candidates (DB has been dropped).
// removed-row pass finds zero candidates (DB has been dropped) and
// the stale-token pass exists for the mirror case (DB persists but
// /configs volume has been wiped).
sweepRemovedRows(ctx, reaper)
sweepLabeledOrphansWithoutRows(ctx, reaper)
sweepStaleTokensWithoutContainer(ctx, reaper)
}
// sweepRemovedRows is the original sweep: ws-* containers (by name
@@ -290,3 +293,181 @@ func sweepLabeledOrphansWithoutRows(ctx context.Context, reaper OrphanReaper) {
}
}
}
// staleTokenGrace bounds how recently a token must have been used (or
// issued, if never used) for it to be considered "potentially live".
// Anything quieter than this is fair game for the stale-token revoke
// pass when there's no matching container.
//
// Sized vs the heartbeat cadence (30s) and provisioning latency: a
// healthy workspace touches `last_used_at` every heartbeat, so 5min is
// 10× the heartbeat interval — enough headroom that brief container
// restarts (Stop → Start) don't trip the pass. A workspace that's been
// silent past this window AND has no container is either a wiped-volume
// orphan or a workspace nobody is using; either way, revoking is safe
// because the next /registry/register mints a fresh token via the
// no-live-tokens bootstrap branch in registry.go.
const staleTokenGrace = 5 * time.Minute
// sweepStaleTokensWithoutContainer revokes workspace_auth_tokens rows
// for workspaces whose /configs volume must have been wiped — detected
// as "live token in DB whose owning workspace has no live Docker
// container". This heals the user-reported failure mode where
// `docker compose down -v` (or any out-of-band volume removal) leaves
// stale tokens in the DB while the recreated container has an empty
// `/configs/.auth_token`. Without this pass, /registry/register on the
// fresh container 401s forever (requireWorkspaceToken sees live tokens,
// container can't present one), and the workspace is permanently
// wedged until an operator manually revokes via SQL.
//
// The platform's restart endpoint already handles this case correctly
// via wsauth.RevokeAllForWorkspace inside issueAndInjectToken — this
// pass is the safety net for the equivalent action taken outside the
// API (operator did `docker compose down -v`, host crashed mid-restart,
// disk pressure evicted a volume, etc).
//
// Safety filters that bound the revoke radius:
//
// 1. Only runs in single-tenant Docker mode. The orphan sweeper is
// wired only when prov != nil (see cmd/server/main.go) — in CP/SaaS
// mode there is no Docker daemon and the sweeper doesn't run, so an
// empty container list cannot be confused with "no Docker at all"
// here (which would otherwise revoke every workspace's tokens).
// The function also short-circuits on a nil reaper as a belt-and-
// braces guard against a future refactor wiring it incorrectly.
//
// 2. staleTokenGrace skips tokens that were issued or used in the
// last 5 minutes. Bounds the race with mid-provisioning (token
// issued moments before docker run completes) and brief restart
// windows.
//
// 3. CRITICAL: the staleness predicate is enforced AT THE UPDATE,
// not just at the SELECT. This closes a TOCTOU race against
// workspace_provision.go:issueAndInjectToken — the platform's
// restart endpoint Stops the container synchronously then dispatches
// re-provisioning to a goroutine, so a stale-on-SELECT workspace
// can have a fresh token inserted by issueAndInjectToken between
// our SELECT and our UPDATE. A predicate-only `WHERE workspace_id
// = $1 AND revoked_at IS NULL` UPDATE would catch that fresh token
// too. Carrying COALESCE(last_used_at, created_at) < now() - grace
// in the UPDATE makes the operation idempotent against fresh
// inserts: a token created within the grace window cannot match.
//
// 4. The DB query joins on workspaces.status NOT IN ('removed',
// 'provisioning') so deleted and mid-restart workspaces are not
// revoked here — those are handled at delete time and by
// issueAndInjectToken respectively. (`status = 'provisioning'` is
// set synchronously in workspace_restart.go before the async
// re-provision begins, so it's a reliable in-flight signal.)
//
// 5. Each revocation is logged with the workspace ID so operators can
// correlate "workspace just lost auth" with this sweeper, not blame
// a network blip.
//
// Failure mode: revoke fails for some reason (transient DB error). The
// next sweep cycle (60s out) retries. Worst case: a workspace stays
// 401-blocked an extra minute.
func sweepStaleTokensWithoutContainer(ctx context.Context, reaper OrphanReaper) {
// Defence-in-depth (F2): a future refactor that wires the sweeper
// in CP/SaaS mode without checking prov would otherwise hit this
// pass with a nil reaper. The StartOrphanSweeper entry point
// already short-circuits on nil, but we don't want to depend on
// every future caller doing the same.
if reaper == nil {
return
}
prefixes, err := reaper.ListWorkspaceContainerIDPrefixes(ctx)
if err != nil {
log.Printf("Orphan sweeper: ListWorkspaceContainerIDPrefixes failed: %v — skipping stale-token pass", err)
return
}
// Same hex-and-dash filter as the other passes — anything that
// can't be a workspace UUID prefix doesn't belong in a SQL LIKE
// pattern.
//
// NOTE: an empty `likes` array is intentionally NOT a short-circuit.
// "No workspace containers" is the load-bearing case for this pass
// (operator nuked everything). The `cardinality($1) = 0` clause in
// the SELECT below treats empty likes as "no LIKE filter" → every
// stale-token workspace becomes a candidate. The first two passes'
// early-return-on-empty-prefixes pattern would defeat this entire
// pass's purpose.
likes := make([]string, 0, len(prefixes))
for _, p := range prefixes {
if !isLikelyWorkspaceID(p) {
continue
}
likes = append(likes, p+"%")
}
// Find workspaces with live tokens whose most-recent activity is
// past the grace window AND whose ID does NOT match any live
// container prefix. When `likes` is empty (no workspace containers
// running at all), every stale-activity workspace is a candidate —
// expressed via the `cardinality($1) = 0` short-circuit so the
// query has a single shape regardless of container count.
//
// make_interval(secs => $2) avoids the time.Duration.String() →
// `"5m0s"` mismatch with Postgres interval grammar; passing seconds
// as an int keeps the binding portable.
graceSeconds := int(staleTokenGrace.Seconds())
rows, qErr := db.DB.QueryContext(ctx, `
SELECT DISTINCT t.workspace_id::text
FROM workspace_auth_tokens t
JOIN workspaces w ON w.id = t.workspace_id
WHERE t.revoked_at IS NULL
AND w.status NOT IN ('removed', 'provisioning')
AND COALESCE(t.last_used_at, t.created_at) < now() - make_interval(secs => $2)
AND (
cardinality($1::text[]) = 0
OR NOT (t.workspace_id::text LIKE ANY($1::text[]))
)
`, pq.Array(likes), graceSeconds)
if qErr != nil {
log.Printf("Orphan sweeper: stale-token query failed: %v — skipping stale-token pass", qErr)
return
}
defer rows.Close()
var staleWorkspaceIDs []string
for rows.Next() {
var id string
if scanErr := rows.Scan(&id); scanErr != nil {
log.Printf("Orphan sweeper: stale-token row scan failed: %v", scanErr)
continue
}
staleWorkspaceIDs = append(staleWorkspaceIDs, id)
}
if iterErr := rows.Err(); iterErr != nil {
log.Printf("Orphan sweeper: stale-token rows iteration failed: %v", iterErr)
return
}
// Per-workspace UPDATE with the SAME staleness predicate as the
// SELECT, so any token inserted between SELECT and UPDATE (e.g.
// issueAndInjectToken racing during a user-triggered restart of a
// long-idle workspace) is automatically excluded — its created_at
// is fresh and won't satisfy `< now() - grace`.
//
// We deliberately bypass wsauth.RevokeAllForWorkspace here because
// that helper revokes EVERY live token for the workspace; we want
// "every STALE live token", which is a different (safer) operation.
for _, wsID := range staleWorkspaceIDs {
log.Printf("Orphan sweeper: revoking stale tokens for workspace %s (no live container; volume likely wiped)", wsID)
_, revokeErr := db.DB.ExecContext(ctx, `
UPDATE workspace_auth_tokens
SET revoked_at = now()
WHERE workspace_id = $1
AND revoked_at IS NULL
AND COALESCE(last_used_at, created_at) < now() - make_interval(secs => $2)
`, wsID, graceSeconds)
if revokeErr != nil {
// Non-fatal — next sweep retries. Bail on the loop so a
// systemic DB error doesn't spam the log on every iteration.
log.Printf("Orphan sweeper: stale-token revoke for %s failed: %v — will retry next cycle", wsID, revokeErr)
return
}
}
}

View File

@@ -10,6 +10,24 @@ import (
"github.com/DATA-DOG/go-sqlmock"
)
// expectStaleTokenSweepNoOp registers the third-pass query
// (sweepStaleTokensWithoutContainer) returning zero rows. The third
// pass runs unconditionally on every sweepOnce, so every test that
// doesn't specifically exercise stale-token revocation must register
// this expectation or sqlmock will fail "unexpected query".
//
// Centralising the regex here keeps the existing test suite readable —
// individual tests don't have to spell out a query they're not actually
// asserting against.
//
// The regex is anchored at the start of the query AND requires the
// status-filter to keep us from accidentally matching a future query
// that opens with the same column name. R3 from the review.
func expectStaleTokenSweepNoOp(mock sqlmock.Sqlmock) {
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\)`).
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}))
}
// fakeReaper is a hand-rolled OrphanReaper for the sweeper tests.
// Records every Stop / RemoveVolume call so tests can assert which
// workspace IDs got reconciled.
@@ -73,6 +91,7 @@ func TestSweepOnce_ReconcilesRunningRemovedRows(t *testing.T) {
mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"id"}).
AddRow("abc123def456-0000-0000-0000-000000000000"))
expectStaleTokenSweepNoOp(mock)
sweepOnce(context.Background(), reaper)
@@ -96,8 +115,11 @@ func TestSweepOnce_NoRunningContainers(t *testing.T) {
reaper := &fakeReaper{listResponse: nil}
// No DB query expected — if sweepOnce makes one anyway the
// sqlmock will fail "unexpected query".
// First two passes short-circuit on empty container lists. The
// third pass (stale-token sweep) DOES query — that's its whole
// reason for existing in the no-containers case (operator nuked
// everything). Mock it returning no stale tokens.
expectStaleTokenSweepNoOp(mock)
sweepOnce(context.Background(), reaper)
if len(reaper.stopCalls) != 0 {
@@ -145,6 +167,7 @@ func TestSweepOnce_StopFailureLeavesVolume(t *testing.T) {
mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"id"}).
AddRow("abc123def456-0000-0000-0000-000000000000"))
expectStaleTokenSweepNoOp(mock)
sweepOnce(context.Background(), reaper)
@@ -173,6 +196,7 @@ func TestSweepOnce_VolumeRemoveErrorIsNonFatal(t *testing.T) {
WillReturnRows(sqlmock.NewRows([]string{"id"}).
AddRow("aaa111bbb222-0000-0000-0000-000000000000").
AddRow("ccc333ddd444-0000-0000-0000-000000000000"))
expectStaleTokenSweepNoOp(mock)
sweepOnce(context.Background(), reaper)
@@ -205,9 +229,12 @@ func TestSweepOnce_FiltersNonWorkspacePrefixes(t *testing.T) {
},
}
// No DB query expected — every prefix is rejected before the
// query builds, so we short-circuit. sqlmock fails on any
// unexpected query.
// First-pass query is skipped — every prefix is rejected before
// the query builds. Third-pass query still runs (filtered prefixes
// + non-empty input list still produces an empty likes array,
// which the third-pass treats the same as "no containers running"
// → stale-token candidates with no LIKE filter). Mock it empty.
expectStaleTokenSweepNoOp(mock)
sweepOnce(context.Background(), reaper)
if len(reaper.stopCalls) != 0 {
@@ -289,6 +316,7 @@ func TestSweepOnce_WipedDBReapsLabeledOrphans(t *testing.T) {
// returns no rows — both prefixes are unknown.
mock.ExpectQuery(`SELECT lk\s+FROM unnest`).
WillReturnRows(sqlmock.NewRows([]string{"lk"}))
expectStaleTokenSweepNoOp(mock)
sweepOnce(context.Background(), reaper)
@@ -328,6 +356,7 @@ func TestSweepOnce_WipedDBSkipsLabeledContainersWithRows(t *testing.T) {
WillReturnRows(sqlmock.NewRows([]string{"lk"}).
AddRow("abc123def456%").
AddRow("ee0011223344%"))
expectStaleTokenSweepNoOp(mock)
sweepOnce(context.Background(), reaper)
@@ -355,6 +384,7 @@ func TestSweepOnce_WipedDBReapsOnlyTheUnknownOnes(t *testing.T) {
mock.ExpectQuery(`SELECT lk\s+FROM unnest`).
WillReturnRows(sqlmock.NewRows([]string{"lk"}).
AddRow(keep + "%"))
expectStaleTokenSweepNoOp(mock)
sweepOnce(context.Background(), reaper)
@@ -380,7 +410,10 @@ func TestSweepOnce_WipedDBSkippedOnDockerError(t *testing.T) {
}
// No DB query expected for the second pass since we error out
// before reaching SQL.
// before reaching SQL. The third pass (stale-token sweep) uses
// ListWorkspaceContainerIDPrefixes (which succeeded with empty
// here, not the same call that errored), so it DOES query.
expectStaleTokenSweepNoOp(mock)
sweepOnce(context.Background(), reaper)
if len(reaper.stopCalls) != 0 {
@@ -410,6 +443,7 @@ func TestSweepOnce_WipedDBSkipsNonUUIDPrefixes(t *testing.T) {
// that should appear in the unnest array.
mock.ExpectQuery(`SELECT lk\s+FROM unnest`).
WillReturnRows(sqlmock.NewRows([]string{"lk"}))
expectStaleTokenSweepNoOp(mock)
sweepOnce(context.Background(), reaper)
@@ -420,3 +454,204 @@ func TestSweepOnce
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// =============================================================================
// Third pass: sweepStaleTokensWithoutContainer
//
// Heals the user-reported "auth token conflict after volume wipe" failure mode.
// Scenario: operator runs `docker compose down -v` (or any out-of-band volume
// removal); DB still has tokens for workspaces whose recreated containers boot
// with empty /configs and 401 forever on /registry/register.
// =============================================================================
// TestSweepOnce_StaleTokenRevokeFiresWhenNoContainer — the headline
// case. A workspace has live tokens in the DB but no live container
// matches it (volume-wipe scenario). The third pass revokes.
func TestSweepOnce_StaleTokenRevokeFiresWhenNoContainer(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
// One name-shaped container is running and has a status='removed'
// row, so the first pass reaps it. Second pass finds nothing
// (managed list empty in this scenario). Third pass: even though
// the running container covers that prefix, an unrelated
// workspace ID (no live container, no name prefix match) has
// stale tokens — revoke it.
const orphanedID = "deadbeef-0000-0000-0000-000000000000"
reaper := &fakeReaper{listResponse: []string{"abc123def456"}}
mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"id"}).
AddRow("abc123def456-0000-0000-0000-000000000000"))
// Third-pass query returns the orphaned workspace.
// Tight regex pins the safety guards: status-filter excludes
// 'removed' and 'provisioning' (R2 + the C1 fix), and the
// staleness predicate appears in the SELECT.
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*COALESCE\(t\.last_used_at, t\.created_at\) < now\(\) - make_interval`).
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}).
AddRow(orphanedID))
// Revoke executes one UPDATE — and the UPDATE itself MUST also
// carry the staleness predicate (closes the C1 TOCTOU race
// against issueAndInjectToken inserting a fresh token between
// our SELECT and our UPDATE).
mock.ExpectExec(`(?s)UPDATE workspace_auth_tokens\s+SET revoked_at = now\(\)\s+WHERE workspace_id = \$1\s+AND revoked_at IS NULL\s+AND COALESCE\(last_used_at, created_at\) < now\(\) - make_interval`).
WithArgs(orphanedID, 300).
WillReturnResult(sqlmock.NewResult(0, 1))
sweepOnce(context.Background(), reaper)
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestSweepOnce_StaleTokenSkippedWhenContainerExists — pin the safety
// guarantee: a workspace with both live tokens AND a live container
// must NOT be revoked. The query's NOT LIKE clause is the gate; this
// test exercises that gate by having the third-pass query return zero
// rows (the live-container workspace is filtered out).
func TestSweepOnce_StaleTokenSkippedWhenContainerExists(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
// One running container; first pass returns no removed-row matches.
reaper := &fakeReaper{listResponse: []string{"abc123def456"}}
mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"id"}))
// Third-pass query: the live workspace has live tokens but its
// prefix matches the running container, so the NOT LIKE excludes
// it. Result: zero stale tokens.
expectStaleTokenSweepNoOp(mock)
sweepOnce(context.Background(), reaper)
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestSweepOnce_StaleTokenRevokeFailureBailsLoop — a transient DB
// error during revoke must not spam the log on every iteration.
// Bail out of the loop; next 60s cycle retries.
func TestSweepOnce_StaleTokenRevokeFailureBailsLoop(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
reaper := &fakeReaper{listResponse: nil}
// Third-pass returns two stale-token workspaces; the first revoke
// errors. Loop must bail without attempting the second.
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\)`).
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}).
AddRow("aaaa1111-0000-0000-0000-000000000000").
AddRow("bbbb2222-0000-0000-0000-000000000000"))
mock.ExpectExec(`(?s)UPDATE workspace_auth_tokens\s+SET revoked_at = now\(\)\s+WHERE workspace_id = \$1\s+AND revoked_at IS NULL\s+AND COALESCE\(last_used_at, created_at\) < now\(\) - make_interval`).
WithArgs("aaaa1111-0000-0000-0000-000000000000", 300).
WillReturnError(errors.New("connection reset"))
// No second ExpectExec: if the loop tries it, sqlmock fails
// "unexpected call".
sweepOnce(context.Background(), reaper)
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestSweepOnce_StaleTokenQueryErrorIsNonFatal — a transient DB error
// on the SELECT must not prevent the rest of sweepOnce from making
// progress. (In this test there's no other progress to make either,
// just verifying no panic + the cycle completes.)
func TestSweepOnce_StaleTokenQueryErrorIsNonFatal(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
reaper := &fakeReaper{listResponse: nil}
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\)`).
WillReturnError(errors.New("connection reset"))
sweepOnce(context.Background(), reaper)
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestSweepOnce_StaleTokenRevokeUsesStalenessPredicate — pin the C1
// race fix: the per-workspace UPDATE must carry the staleness
// predicate so a token inserted by issueAndInjectToken between our
// SELECT and our UPDATE is automatically excluded (its created_at is
// fresh and won't satisfy `< now() - grace`).
//
// This test asserts the SHAPE of the UPDATE (predicate present, grace
// argument bound). A real-Postgres integration test would prove the
// race resolution end-to-end; this catches the regression where
// someone "simplifies" the UPDATE back to a predicate-only revoke.
func TestSweepOnce_StaleTokenRevokeUsesStalenessPredicate(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
const orphanedID = "deadbeef-0000-0000-0000-000000000000"
reaper := &fakeReaper{listResponse: nil}
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\)`).
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}).
AddRow(orphanedID))
// The UPDATE regex requires every guard: workspace_id binding,
// revoked_at IS NULL, AND the staleness predicate using the SAME
// COALESCE expression as the SELECT. Loosening any of these
// would re-open the C1 race, and this regex would no longer match.
mock.ExpectExec(`(?s)UPDATE workspace_auth_tokens\s+SET revoked_at = now\(\)\s+WHERE workspace_id = \$1\s+AND revoked_at IS NULL\s+AND COALESCE\(last_used_at, created_at\) < now\(\) - make_interval\(secs => \$2\)`).
WithArgs(orphanedID, 300).
WillReturnResult(sqlmock.NewResult(0, 1))
sweepOnce(context.Background(), reaper)
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestSweepStaleTokens_NilReaperEarlyExit — defence-in-depth (F2):
// even though StartOrphanSweeper short-circuits on nil reaper, the
// individual pass also early-exits. Protects against future refactors
// that wire the pass without the outer guard.
func TestSweepStaleTokens_NilReaperEarlyExit(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
// No DB queries expected. If the early-return is removed, sqlmock
// fails on the unexpected SELECT.
sweepStaleTokensWithoutContainer(context.Background(), nil)
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestSweepOnce_StaleTokenSkippedWhenDockerListFails — if the third
// pass can't enumerate containers (Docker hiccup), it must skip the
// query entirely. Otherwise it would query with empty likes and
// revoke every stale-token workspace based on stale information.
func TestSweepOnce_StaleTokenSkippedWhenDockerListFails(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
reaper := &fakeReaper{listErr: errors.New("daemon unreachable")}
// No DB queries expected: first pass bails on listErr, second
// pass uses managedList (also fails because we never set it),
// third pass also bails on listErr. Verify by NOT registering
// expectStaleTokenSweepNoOp — sqlmock fails on any unexpected
// query.
sweepOnce(context.Background(), reaper)
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}

View File

@@ -437,13 +437,23 @@ async def main(): # pragma: no cover
)
async def _send_initial_prompt():
"""Wait for server to be ready, then send initial_prompt as self-message."""
# Wait for the A2A server to accept connections
# Wait for the A2A server to accept connections.
# Use the SDK's own constant for the well-known path so this
# probe and the route mounted by create_agent_card_routes()
# never drift apart. Before this fix, the probe hardcoded the pre-1.x
# well-known path string; a2a-sdk 1.x renamed it (the
# canonical value lives in a2a.utils.constants now), so
# the probe got 404 every attempt and fell through to
# "server not ready after 30s, skipping" even though the
# server was actually serving fine. Net effect: every
# workspace silently dropped its `initial_prompt`.
from a2a.utils.constants import AGENT_CARD_WELL_KNOWN_PATH
ready = False
for attempt in range(30):
await asyncio.sleep(1)
try:
async with httpx.AsyncClient(timeout=5.0) as client:
resp = await client.get(f"http://127.0.0.1:{port}/.well-known/agent.json")
resp = await client.get(f"http://127.0.0.1:{port}{AGENT_CARD_WELL_KNOWN_PATH}")
if resp.status_code == 200:
ready = True
break

View File

@@ -0,0 +1,84 @@
"""Pin the agent-card readiness probe to the SDK's canonical path.
main.py's _send_initial_prompt() polls the local A2A server's
well-known agent-card URL to know when it's safe to send the initial
prompt as a self-message. Before this fix, the URL was hardcoded to the pre-1.x
literal; a2a-sdk 1.x renamed the well-known path (the canonical value
lives in `a2a.utils.constants.AGENT_CARD_WELL_KNOWN_PATH`), so the
probe got 404 every attempt and silently fell through to "server not
ready after 30s, skipping" — dropping every workspace's
`initial_prompt` from config.yaml.
The fix is to import the SDK's `AGENT_CARD_WELL_KNOWN_PATH` constant
and use it directly in the probe URL. These tests pin the static
invariants of that fix:
1. No hardcoded `/.well-known/agent.json` literal anywhere in
main.py (catches a future contributor reverting to a literal).
2. The probe URL fstring interpolates `AGENT_CARD_WELL_KNOWN_PATH`
(catches a "fix" that imports the constant for show but still
uses a literal in the actual GET).
Note: we deliberately do not assert the constant's value or compare
it against `create_agent_card_routes()` here. The runtime SDK is
mocked in this directory's conftest for the executor-test path, so
any test that imports the real `a2a.utils.constants` would either
collide with the mock or require running in a separate pytest session.
The two static invariants are sufficient: by always following whatever
the SDK constant says, we travel through any rename automatically. The
SDK's own contract that `create_agent_card_routes` mounts at the
constant's value is the SDK's responsibility, not ours.
"""
from __future__ import annotations
import re
from pathlib import Path
WORKSPACE_ROOT = Path(__file__).resolve().parents[1]
def test_main_uses_sdk_constant_for_agent_card_probe():
"""No hardcoded `/.well-known/agent.json` literal anywhere in main.py.
The SDK constant (AGENT_CARD_WELL_KNOWN_PATH) is the single source
of truth; string-literal probes drift the moment the SDK renames it.
"""
main = (WORKSPACE_ROOT / "main.py").read_text()
bad_literal = "/.well-known/agent.json"
offenders = [
(lineno, line)
for lineno, line in enumerate(main.splitlines(), 1)
if bad_literal in line
]
assert not offenders, (
f"Found pre-1.x literal {bad_literal!r} in main.py — must use "
f"the SDK's AGENT_CARD_WELL_KNOWN_PATH constant instead. "
f"Offending lines: {offenders}"
)
assert (
"AGENT_CARD_WELL_KNOWN_PATH" in main
), "main.py must import a2a.utils.constants.AGENT_CARD_WELL_KNOWN_PATH"
def test_probe_loop_uses_constant_in_url_format():
"""Spot-check that the URL fstring in main.py interpolates the
constant, not a literal. Catches a future "fix" that imports the
constant for show but still uses a literal in the actual GET."""
main = (WORKSPACE_ROOT / "main.py").read_text()
# The probe pattern: `client.get(f"http://127.0.0.1:{port}{...}")`
# where `{...}` must be `{AGENT_CARD_WELL_KNOWN_PATH}`, not a
# hardcoded path.
pattern = re.compile(
r'client\.get\(f"http://127\.0\.0\.1:\{port\}\{(?P<expr>[^}]+)\}"\)'
)
matches = pattern.findall(main)
assert matches, "no readiness probe pattern found in main.py"
for expr in matches:
assert "AGENT_CARD_WELL_KNOWN_PATH" in expr, (
f"readiness probe URL uses {expr!r} instead of "
f"AGENT_CARD_WELL_KNOWN_PATH"
)