From eecce56c13ce153b0912ff97c758bcd7a56fdfaa Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Sun, 19 Apr 2026 03:37:42 -0700
Subject: [PATCH] feat(canary): rollback-latest script + release-pipeline doc
 (Phase 4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the canary loop with the escape hatch and a single place to read
about the whole flow.

scripts/rollback-latest.sh

Uses crane to retag :latest ← :staging-<sha> for BOTH the platform and
tenant images. Pre-checks that the target tag exists and verifies the
:latest digest after the move, so a bad ops typo doesn't silently
promote the wrong thing. Prod tenants auto-update to the rolled-back
digest within their 5-min cycle. Exit codes: 0 = both retagged,
1 = registry/tag error, 2 = usage error.

docs/architecture/canary-release.md

The one-page map of the pipeline: how PR → main → :staging-<sha> →
canary smoke → :latest promotion works end-to-end, how to add a canary
tenant, how to roll back, and what this gate explicitly does NOT catch
(prod-only data, config drift, cross-tenant bugs).

No code changes in the CP or workspace-server — this PR is shell + docs
only, so it's safe to land independently of the other Phase {1,1.5,2,3}
PRs still in review.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 docs/architecture/canary-release.md | 79 ++++++++++++++++++++++++++++
 scripts/rollback-latest.sh          | 80 +++++++++++++++++++++++++++++
 2 files changed, 159 insertions(+)
 create mode 100644 docs/architecture/canary-release.md
 create mode 100755 scripts/rollback-latest.sh

diff --git a/docs/architecture/canary-release.md b/docs/architecture/canary-release.md
new file mode 100644
index 00000000..eb795eda
--- /dev/null
+++ b/docs/architecture/canary-release.md
@@ -0,0 +1,79 @@
+# Canary release pipeline
+
+How a workspace-server code change reaches the prod tenant fleet — and
+how to stop it if something's wrong.
+
+## The loop
+
+```
+PR merged to staging → main
+        │
+        ▼
+publish-workspace-server-image.yml  ← pushes :staging-<sha> ONLY
+        │                             (NOT :latest — prod is untouched)
+        ▼
+Canary tenants auto-update to :staging-<sha>
+        │   (5-min auto-updater cycle on each canary EC2)
+        ▼
+canary-verify.yml waits 6 min, runs scripts/canary-smoke.sh
+        │
+        ├─► GREEN → crane tag :staging-<sha> → :latest
+        │             │
+        │             ▼
+        │           Prod tenants auto-update within 5 min
+        │
+        └─► RED → :latest stays on the prior good digest
+                  GitHub Step Summary flags the rejected sha
+                  Ops fixes forward OR rolls back manually
+```
+
+## Canary fleet
+
+The canary fleet lives in a separate AWS account (`molecule-canary`,
+`004947743811`), provisioned via an assumed role
+(`MoleculeStagingProvisioner`). The CP's `is_canary` org flag routes
+provisioning there; every other org goes to the default staging account.
+See `docs/architecture/saas-prod-migration-2026-04-19.md` for the
+account bootstrap.
+
+Canary tenants are configured to pull `:staging-<sha>` (not `:latest`)
+via `TENANT_IMAGE` on their provisioner, so they ingest each new build
+before prod does.
+
+## Smoke suite
+
+`scripts/canary-smoke.sh` hits each canary tenant (URL + ADMIN_TOKEN
+pair) and asserts:
+
+- `/admin/liveness` returns a subsystems map (tenant booted, AdminAuth reachable)
+- `/workspaces` returns a JSON array (wsAuth + DB healthy)
+- `/memories/commit` + `/memories/search` round-trip (encryption + scrubber)
+- `/events` admin read (C4 fail-closed proof)
+- `/admin/liveness` without a bearer token → 401 (C4 regression gate)
+
+Expand the suite by editing the script — each
+`check "name" "expected" "$response"` call is one line.
+
+## Adding a canary tenant
+
+1. `POST /cp/orgs` — create the org normally (`is_canary` defaults to false)
+2. `POST /cp/admin/orgs/<org_id>/canary` with `{"is_canary": true}` —
+   admin only; refuses to flip if the org is already provisioned
+3. Re-trigger provision (or delete + recreate if the org was already
+   provisioned into staging) — the fresh EC2 lands in account
+   `004947743811`
+
+Then set repo secrets:
+
+- `CANARY_TENANT_URLS` — append the new tenant's URL
+- `CANARY_ADMIN_TOKENS` — append its ADMIN_TOKEN in the same position
+
+## Rolling back `:latest`
+
+When canary was green but a problem surfaces post-promotion, retag
+`:latest` to a prior digest:
+
+```bash
+export GITHUB_TOKEN=ghp_...        # write:packages
+scripts/rollback-latest.sh 4c1d56e # retags both platform + tenant images
+```
+
+`scripts/rollback-latest.sh` pre-checks that `:staging-<sha>` exists
+before moving `:latest`, and verifies the digest after the move. Prod
+tenants pick up the rolled-back image on their next 5-min auto-update.
+
+A post-mortem should always include:
+
+- the sha of the breaking commit
+- why canary didn't catch it (a new code path the smoke suite doesn't
+  exercise?)
+- whether the smoke suite should grow a new check to prevent the same
+  class of bug
+
+## What this gate doesn't catch
+
+- Bugs that only surface under prod-only data (customer workloads with
+  scale or shape canary doesn't produce). Canary uses real traffic
+  shapes but can't simulate weeks of accumulated state.
+- Config drift between canary and prod (different env-var values,
+  different feature flags). Keep canary's config deltas minimal and
+  documented.
+- Cross-tenant interactions — canary tenants run in their own AWS
+  account, so a bug that only appears when two tenants compete for a
+  shared resource won't reproduce here.
+
+When one of these slips through, `scripts/rollback-latest.sh` is the
+escape hatch.

diff --git a/scripts/rollback-latest.sh b/scripts/rollback-latest.sh
new file mode 100755
index 00000000..ade2051b
--- /dev/null
+++ b/scripts/rollback-latest.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+# rollback-latest.sh — moves the :latest tag on ghcr.io/molecule-ai/platform
+# (and the matching tenant image) back to a prior :staging-<sha> digest
+# without rebuilding anything.
+# Prod tenants auto-pull :latest every 5 min, so this is the fast path
+# when a canary-verified image turns out to have a runtime regression
+# that canary didn't catch.
+#
+# Usage:
+#   scripts/rollback-latest.sh <short-sha>
+#   scripts/rollback-latest.sh 4c1d56e
+#
+# Prereqs:
+#   - crane on $PATH (brew install crane OR download from
+#     https://github.com/google/go-containerregistry/releases)
+#   - GHCR token exported as GITHUB_TOKEN with write:packages scope
+#
+# What it does (per image — platform + tenant):
+#   crane digest ghcr.io/…:staging-<sha>          # verify the target sha exists
+#   crane tag    ghcr.io/…:staging-<sha> latest   # retag remotely, single API call
+#   crane digest ghcr.io/…:latest                 # confirm the move
+#
+# Exit codes: 0 = both retagged, 1 = tag missing / crane error, 2 = bad args.
+
+set -euo pipefail
+
+if [ "${1:-}" = "" ]; then
+  echo "usage: $0 <short-sha>" >&2
+  echo "  e.g. $0 4c1d56e — retags :latest to :staging-4c1d56e" >&2
+  exit 2
+fi
+
+TARGET_SHA="$1"
+PLATFORM=ghcr.io/molecule-ai/platform
+TENANT=ghcr.io/molecule-ai/platform-tenant
+
+if ! command -v crane >/dev/null; then
+  echo "ERROR: crane not installed. brew install crane" >&2
+  exit 1
+fi
+if [ -z "${GITHUB_TOKEN:-}" ]; then
+  echo "ERROR: GITHUB_TOKEN unset. Export it with write:packages scope." >&2
+  exit 1
+fi
+
+# Log in once. crane stores creds in a config file keyed by registry;
+# re-running is cheap.
+printf '%s\n' "$GITHUB_TOKEN" | crane auth login ghcr.io -u "${GITHUB_ACTOR:-$(whoami)}" --password-stdin >/dev/null
+
+roll() {
+  local image="$1"
+  local src="$image:staging-$TARGET_SHA"
+  local dst="$image:latest"
+  local src_digest new_digest
+
+  echo "→ $image"
+  # Abort the rollback if the target tag doesn't exist in the registry.
+  # crane tag would error anyway, but a pre-check gives ops a clearer
+  # message.
+  if ! crane digest "$src" >/dev/null 2>&1; then
+    echo "  FAIL: $src not found in registry. Did you type the wrong sha?" >&2
+    return 1
+  fi
+  src_digest=$(crane digest "$src")
+
+  crane tag "$src" latest
+  new_digest=$(crane digest "$dst")
+
+  if [ "$new_digest" != "$src_digest" ]; then
+    echo "  FAIL: $dst digest $new_digest does not match expected $src_digest" >&2
+    return 1
+  fi
+  echo "  OK  $dst → $new_digest"
+}
+
+roll "$PLATFORM"
+roll "$TENANT"
+
+echo
+echo "=== ROLLBACK COMPLETE ==="
+echo "Both images now point :latest at staging-$TARGET_SHA."
+echo "Prod tenants will pick up the rollback within their 5-min auto-update cycle."
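
The doc refers to growing the smoke suite one `check "name" "expected" "$response"` line at a time, but `scripts/canary-smoke.sh` itself is not part of this patch. A minimal sketch of what such a helper could look like — the name matches the doc, everything else (regex matching, failure counting) is an assumption about the real script:

```shell
#!/usr/bin/env bash
# Hypothetical sketch of a canary-smoke.sh-style check helper; the real
# script is not in this patch, so the matching logic here is an assumption.
# Records failures instead of exiting so one red check doesn't hide the rest.
set -u

FAILURES=0

check() {
  local name="$1" expected="$2" response="$3"
  if printf '%s' "$response" | grep -Eq "$expected"; then
    echo "PASS  $name"
  else
    echo "FAIL  $name (wanted /$expected/ in: $response)" >&2
    FAILURES=$((FAILURES + 1))
  fi
}

# Example assertions in the same one-line shape the doc describes.
check "liveness has subsystems map" '"subsystems"' '{"subsystems":{"db":"ok"}}'
check "workspaces is a JSON array"  '^\['          '[{"id":"w1"}]'

echo "failures: $FAILURES"   # a real suite would end with: exit "$FAILURES"
```

A suite built this way runs every check before reporting, so a single red endpoint doesn't mask the state of the others.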