From 9d4ab7b1a2a51f785a2d8552e4fba6f7cd9af15a Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Tue, 28 Apr 2026 13:46:39 -0700
Subject: [PATCH 1/7] =?UTF-8?q?feat(ci):=20auto-promote-on-e2e=20=E2=80=94?=
 =?UTF-8?q?=20retag=20:latest=20on=20green=20E2E=20Staging=20SaaS?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the final gap in the SaaS pipeline. After auto-promote-staging
fast-forwards main, publish-workspace-server-image builds new
`:staging-<sha>` images, but `:latest` (what prod tenants pull) only
moves on either a manual `promote-latest.yml` dispatch or a canary-
verify retag (gated on the Phase 2 canary fleet, which doesn't exist yet).

This workflow closes that gap by retagging
`platform:staging-<sha>` + `platform-tenant:staging-<sha>` → `:latest`
whenever E2E Staging SaaS passes for a `main` push. Uses crane
(no Docker daemon needed). Verifies both images exist before retagging
either, so a missing source tag can never leave `:latest` half-promoted.

Why trigger only on `main` (not staging):
  - `:latest` is what prod tenants pull. Only SHAs that have reached
    `main` (via auto-promote-staging) should advance `:latest`.
  - Triggering on staging would let a staging-only revert advance
    `:latest` to a SHA that never reaches `main`, breaking the
    invariant "production runs what's on `main`".

Why a separate workflow rather than folding into e2e-staging-saas.yml:
  - Keeps test concerns and release concerns separate.
  - Disabling promote during an incident is one workflow toggle, not
    an edit to the long E2E file.
  - When Phase 2 canary work eventually lands, the canary path can
    replace this trigger without touching the E2E workflow.

Doc-aligned: per molecule-controlplane/docs/canary-tenants.md,
"green staging E2E → :latest" is the recommended approach for the
current scale (≤20 paying tenants); canary fleet is deferred until
blast radius grows.

Pipeline after this lands is fully self-healing:
  staging push → 4 gates green → auto-promote fast-forwards main
   → publish-workspace-server-image → E2E Staging SaaS
   → THIS WORKFLOW retags :latest → tenant fleet auto-pulls in 5 min
                                    (or redeploy-tenants-on-main fans out faster)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/auto-promote-on-e2e.yml | 114 ++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 .github/workflows/auto-promote-on-e2e.yml
diff --git a/.github/workflows/auto-promote-on-e2e.yml b/.github/workflows/auto-promote-on-e2e.yml
new file mode 100644
index 00000000..21f901e9
--- /dev/null
+++ b/.github/workflows/auto-promote-on-e2e.yml
@@ -0,0 +1,114 @@
+name: Auto-promote :latest on E2E green
+
+# Retags `ghcr.io/molecule-ai/{platform,platform-tenant}:staging-<sha>`
+# → `:latest` whenever E2E Staging SaaS passes for a `main` push.
+#
+# This is the doc-aligned alternative to the (deferred) Phase 2 canary
+# fleet — staging E2E catches ~90% of what canary would catch at 0%
+# ongoing infra cost. See `molecule-controlplane/docs/canary-tenants.md`
+# section "Do we actually need canary right now?" — recommended
+# sequencing for the current scale (≤20 paying tenants).
+#
+# Why a separate workflow rather than folding into e2e-staging-saas.yml:
+#   - Keeps test concerns separate from release concerns.
+#   - Disabling promote (e.g. during an incident) is one toggle, not an
+#     edit to the long E2E workflow file.
+#   - When Phase 2 canary work eventually lands, the canary path can
+#     replace this file's trigger without touching the E2E workflow.
+#
+# Why trigger on `main` only:
+#   - `:latest` is what prod tenants pull. We only want SHAs that have
+#     reached `main` (via auto-promote-staging) to advance `:latest`.
+#   - Triggering on staging would let a staging-only revert advance
+#     `:latest` to a SHA that never reaches `main`, breaking the
+#     "production runs what's on `main`" invariant.
+
+on:
+  workflow_run:
+    workflows: ['E2E Staging SaaS (full lifecycle)']
+    types: [completed]
+    branches: [main]
+  workflow_dispatch:
+    inputs:
+      sha:  # required: a manual dispatch has no upstream workflow_run to take a sha from
+        description: 'Short or full commit sha whose :staging-<sha> images get promoted to :latest'
+        required: true
+        type: string
+
+permissions:
+  contents: read
+  packages: write
+
+env:
+  IMAGE_NAME: ghcr.io/molecule-ai/platform
+  TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant
+
+jobs:
+  promote:
+    # Skip if E2E failed — `:latest` stays on the prior known-good
+    # digest. Manual dispatch always proceeds (the operator already
+    # decided to promote).
+    if: |
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Compute short sha
+        id: sha
+        # Pass event data through `env` (never interpolate it into the script)
+        # and accept only 7-40 hex chars, so a crafted `sha` cannot run shell.
+        env:
+          FULL: ${{ github.event.inputs.sha || github.event.workflow_run.head_sha }}
+        run: |
+          set -euo pipefail
+          [[ "$FULL" =~ ^[0-9a-fA-F]{7,40}$ ]] || { echo "::error::sha must be 7-40 hex characters (got ${#FULL} chars)"; exit 1; }
+          echo "short=${FULL:0:7}" >> "$GITHUB_OUTPUT"
+          echo "full=${FULL}" >> "$GITHUB_OUTPUT"
+
+      - uses: imjasonh/setup-crane@v0.4
+
+      - name: GHCR login
+        run: |
+          echo "${{ secrets.GITHUB_TOKEN }}" | \
+            crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin
+
+      - name: Verify :staging-<sha> exists for both images
+        # Better to fail fast with a clear message than to half-tag
+        # (platform retagged but platform-tenant missing → tenants pull
+        # a stale image).
+        run: |
+          set -euo pipefail
+          for img in "${IMAGE_NAME}" "${TENANT_IMAGE_NAME}"; do
+            tag="${img}:staging-${{ steps.sha.outputs.short }}"
+            if ! crane manifest "$tag" >/dev/null 2>&1; then
+              echo "::error::Missing tag: $tag"
+              echo "::error::publish-workspace-server-image must complete on this SHA before auto-promote-on-e2e can retag :latest."
+              exit 1
+            fi
+            echo "  ok: $tag exists"
+          done
+
+      - name: Retag platform :staging-<sha> → :latest
+        run: |
+          crane tag "${IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
+
+      - name: Retag tenant :staging-<sha> → :latest
+        run: |
+          crane tag "${TENANT_IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
+
+      - name: Summary
+        run: |
+          {
+            echo "## E2E green → :latest promoted"
+            echo
+            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+              echo "- Trigger: manual dispatch"
+            else
+              echo "- Upstream E2E run: ${{ github.event.workflow_run.html_url }}"
+            fi
+            echo "- platform:staging-${{ steps.sha.outputs.short }} → :latest"
+            echo "- platform-tenant:staging-${{ steps.sha.outputs.short }} → :latest"
+            echo
+            echo "Tenant fleet auto-pulls within 5 min via IMAGE_AUTO_REFRESH=true."
+            echo "Force immediate fanout: dispatch redeploy-tenants-on-main.yml."
+          } >> "$GITHUB_STEP_SUMMARY"

From 2c8792d3e019811bfbbe63b858862f9ad9149c1d Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Tue, 28 Apr 2026 14:02:50 -0700
Subject: [PATCH 2/7] fix(ci): printf format-string sink + filename word-split
 in secret-scan
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two latent bash bugs in the canonical secret-scan workflow caught
during the post-merge review of molecule-controlplane #301 (a
private consumer that inlined this workflow's logic and got both
fixes there). Same bugs apply here; fixing in canonical means every
public consumer (gh-identity, github-app-auth, the 8 workspace
template repos) inherits the fix on their next workflow_call.

Bug 1: `printf "$OFFENDING"` is a format-string sink.

  OFFENDING is built from filenames: `${f} (matched: ${pattern})\n`.
  When passed to printf as the first argument, `%` characters in a
  filename are interpreted as conversion specifiers — corrupting the
  error message or aborting on `invalid format character`. No filename in
  the current tree triggers it, but a future test fixture, build
  artifact, or contributor-supplied path could.

  Fix: `printf '%b' "$OFFENDING"` interprets the literal `\n` we
  appended without treating OFFENDING as a format string.

Bug 2: `for f in $CHANGED` word-splits on whitespace.

  Filenames containing spaces would split into multiple tokens. The
  self-exclude check (`[ "$f" = "$SELF" ] && continue`) and the diff
  lookup would both operate on partial-path tokens. No filename in
  the current tree has whitespace, but the failure would be silent
  if one ever did.

  Fix: `while IFS= read -r f; do ... done <<< "$CHANGED"` reads
  whole lines as filenames. Added `[ -z "$f" ] && continue` to
  match the original `for` loop's implicit empty-input skip.

Both fixes are mechanically straightforward (~16 lines net diff,
mostly comments documenting the why). No behavior change for
filenames in the current tree; strictly better for the edge cases.

The same fixes already shipped in molecule-controlplane via #301
which inlined a copy of this workflow. The runtime's bundled
pre-commit hook (molecule-ai-workspace-runtime:
molecule_runtime/scripts/pre-commit-checks.sh) likely has the same
bugs — flagged as a follow-up.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/secret-scan.yml | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/secret-scan.yml b/.github/workflows/secret-scan.yml
index 2d1e557e..cebf89e9 100644
--- a/.github/workflows/secret-scan.yml
+++ b/.github/workflows/secret-scan.yml
@@ -148,7 +148,13 @@ jobs:
           SELF=".github/workflows/secret-scan.yml"
 
           OFFENDING=""
-          for f in $CHANGED; do
+          # `while IFS= read -r` (not `for f in $CHANGED`) so filenames
+          # containing whitespace don't word-split silently — a path
+          # with a space would otherwise produce two iterations on
+          # tokens that aren't real filenames, breaking the
+          # self-exclude + diff lookup.
+          while IFS= read -r f; do
+            [ -z "$f" ] && continue
             [ "$f" = "$SELF" ] && continue
             if [ -n "$DIFF_RANGE" ]; then
               ADDED=$(git diff --no-color --unified=0 "$BASE" "$HEAD" -- "$f" 2>/dev/null | grep -E '^\+[^+]' || true)
@@ -164,11 +170,18 @@ jobs:
                 break
               fi
             done
-          done
+          done <<< "$CHANGED"
 
           if [ -n "$OFFENDING" ]; then
             echo "::error::Credential-shaped strings detected in diff additions:"
-            printf "$OFFENDING"
+            # `printf '%b' "$OFFENDING"` interprets backslash escapes
+            # (the literal `\n` we appended above becomes a newline)
+            # WITHOUT treating OFFENDING as a format string. Plain
+            # `printf "$OFFENDING"` is a format-string sink: a filename
+            # containing `%` would be interpreted as a conversion
+            # specifier, corrupting the error message (or making bash
+            # abort with `invalid format character`).
+            printf '%b' "$OFFENDING"
             echo ""
             echo "The actual matched values are NOT echoed here, deliberately —"
             echo "round-tripping a leaked credential into CI logs widens the blast"

From 8ff0748ab9af4948c5b9592a4f61a2b54b7b2cff Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Tue, 28 Apr 2026 14:10:29 -0700
Subject: [PATCH 3/7] fix(workspace): keep peers visible in coordinator prompt
 when agent_card is null
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug: a Design Director coordinator with 6 freshly-created worker peers
rendered an empty `## Your Peers` section in its system prompt — the
hosting registry endpoint correctly returned all 6 peers, but
`summarize_peer_cards()` silently dropped every entry whose
`agent_card` column was null (the default until A2A discovery has
run end-to-end against the worker). The coordinator then refused to
delegate any task because "no peers exist".

Fix: fall back to the registry row's `name` and `role` columns when
`agent_card` is missing, malformed, or wrong-typed, instead of
skipping the peer. The registry endpoint
(`workspace-server/internal/handlers/discovery.go:queryPeerMaps`) has
always returned both fields — they were just being thrown away on
the consumer side. `build_peer_section()` now renders `Role: …` when
the agent_card-derived skill list is empty so the coordinator's
prompt still has something concrete to delegate against.

Also hoists `import json` out of the per-peer loop body to module
level (was previously imported once per iteration).

Tests: new `test_shared_runtime_peer_summary.py` pins all four
fallback cases (null / malformed string / wrong type / null + no
DB name) plus the agent-card-present happy path and the mixed-list
case the coordinator actually consumes. First peer-summary test
coverage `shared_runtime.py` has had — no prior tests existed.

Refs: 2026-04-27 Design Director discovery report from infra team.
---
 workspace/shared_runtime.py                   |  44 ++++---
 .../tests/test_shared_runtime_peer_summary.py | 111 ++++++++++++++++++
 2 files changed, 141 insertions(+), 14 deletions(-)
 create mode 100644 workspace/tests/test_shared_runtime_peer_summary.py

diff --git a/workspace/shared_runtime.py b/workspace/shared_runtime.py
index dba05700..a874356a 100644
--- a/workspace/shared_runtime.py
+++ b/workspace/shared_runtime.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import json
 from typing import Any
 
 from a2a.server.agent_execution import RequestContext
@@ -89,33 +90,46 @@ def append_peer_guidance(
 
 
 def summarize_peer_cards(peers: list[dict[str, Any]]) -> list[dict[str, Any]]:
-    """Return compact peer metadata for prompt rendering."""
+    """Return compact peer metadata for prompt rendering.
+
+    Falls back to the registry row's `name` and `role` when `agent_card` is
+    null or unparseable so peers stay visible to delegators even before
+    their A2A discovery roundtrip has populated a card. Without this
+    fallback a coordinator-tier workspace with N freshly-created worker
+    peers would render an empty `## Your Peers` section and refuse to
+    delegate (the regression behind the 2026-04-27 Design Director
+    discovery bug).
+    """
     summaries: list[dict[str, Any]] = []
     for peer in peers:
         agent_card = peer.get("agent_card")
-        if not agent_card:
-            continue
         if isinstance(agent_card, str):
             try:
-                import json
-
                 agent_card = json.loads(agent_card)
             except Exception:
-                continue
+                agent_card = None
         if not isinstance(agent_card, dict):
-            continue
+            agent_card = None
+
+        if agent_card:
+            # Cards are external data: skip a non-list `skills`, drop unnamed
+            # entries, and coerce names to str so the prompt's join can't raise.
+            skills_raw = agent_card.get("skills")
+            skills_raw = skills_raw if isinstance(skills_raw, (list, tuple)) else []
+            labels = (s.get("name") or s.get("id") for s in skills_raw if isinstance(s, dict))
+            skills = [str(label) for label in labels if label]
+            name = agent_card.get("name") or peer.get("name") or "Unknown"
+        else:
+            skills = []
+            name = peer.get("name") or "Unknown"
 
-        skills = agent_card.get("skills", [])
         summaries.append(
             {
                 "id": peer.get("id", "unknown"),
-                "name": agent_card.get("name", peer.get("name", "Unknown")),
+                "name": name,
+                "role": peer.get("role") or "",
                 "status": peer.get("status", "unknown"),
-                "skills": [
-                    s.get("name", s.get("id", ""))
-                    for s in skills
-                    if isinstance(s, dict)
-                ],
+                "skills": skills,
             }
         )
     return summaries
@@ -140,6 +154,8 @@ def build_peer_section(
         parts.append(f"- **{peer['name']}** (id: `{peer['id']}`, status: {peer['status']})")
         if peer["skills"]:
             parts.append(f"  Skills: {', '.join(peer['skills'])}")
+        elif peer.get("role"):
+            parts.append(f"  Role: {peer['role']}")
         parts.append("")
     parts.append(instruction)
     return "\n".join(parts)
diff --git a/workspace/tests/test_shared_runtime_peer_summary.py b/workspace/tests/test_shared_runtime_peer_summary.py
new file mode 100644
index 00000000..2628c279
--- /dev/null
+++ b/workspace/tests/test_shared_runtime_peer_summary.py
@@ -0,0 +1,111 @@
+"""Pin peer-summary fallback when agent_card is missing.
+
+Regression test for the 2026-04-27 Design Director discovery bug:
+`summarize_peer_cards()` previously skipped any peer whose `agent_card`
+was null or unparseable, so a coordinator with freshly-created workers
+saw an empty `## Your Peers` section in its system prompt and refused
+to delegate. The registry endpoint already returns DB `name` + `role`
+on every row regardless of agent_card state — falling back to those
+keeps peers visible while A2A discovery catches up.
+"""
+
+from __future__ import annotations
+
+from shared_runtime import build_peer_section, summarize_peer_cards
+
+
+def _peer(**overrides):
+    base = {
+        "id": "ws-1",
+        "name": "DB Name",
+        "role": "DB Role",
+        "status": "active",
+        "agent_card": None,
+    }
+    base.update(overrides)
+    return base
+
+
+def test_summarize_includes_peer_with_null_agent_card_using_db_fields():
+    summaries = summarize_peer_cards([_peer()])
+    assert len(summaries) == 1
+    assert summaries[0]["id"] == "ws-1"
+    assert summaries[0]["name"] == "DB Name"
+    assert summaries[0]["role"] == "DB Role"
+    assert summaries[0]["status"] == "active"
+    assert summaries[0]["skills"] == []
+
+
+def test_summarize_prefers_agent_card_name_over_db_name():
+    peer = _peer(
+        agent_card={"name": "Card Name", "skills": [{"name": "draft-spec"}]}
+    )
+    summaries = summarize_peer_cards([peer])
+    assert summaries[0]["name"] == "Card Name"
+    assert summaries[0]["skills"] == ["draft-spec"]
+    assert summaries[0]["role"] == "DB Role"
+
+
+def test_summarize_handles_string_agent_card_json():
+    peer = _peer(agent_card='{"name": "JSON Name", "skills": []}')
+    summaries = summarize_peer_cards([peer])
+    assert summaries[0]["name"] == "JSON Name"
+
+
+def test_summarize_falls_back_when_agent_card_string_is_malformed():
+    peer = _peer(agent_card="not-valid-json")
+    summaries = summarize_peer_cards([peer])
+    assert len(summaries) == 1
+    assert summaries[0]["name"] == "DB Name"
+    assert summaries[0]["role"] == "DB Role"
+    assert summaries[0]["skills"] == []
+
+
+def test_summarize_falls_back_when_agent_card_is_wrong_type():
+    peer = _peer(agent_card=42)
+    summaries = summarize_peer_cards([peer])
+    assert len(summaries) == 1
+    assert summaries[0]["name"] == "DB Name"
+
+
+def test_summarize_handles_missing_role_and_name_with_unknown_default():
+    peer = {"id": "ws-2", "status": "active", "agent_card": None}
+    summaries = summarize_peer_cards([peer])
+    assert summaries[0]["name"] == "Unknown"
+    assert summaries[0]["role"] == ""
+
+
+def test_build_peer_section_renders_role_when_skills_empty():
+    section = build_peer_section([_peer()])
+    assert "## Your Peers" in section
+    assert "**DB Name**" in section
+    assert "Role: DB Role" in section
+    assert "Skills:" not in section
+
+
+def test_build_peer_section_prefers_skills_over_role_when_card_present():
+    peer = _peer(
+        agent_card={"name": "Worker", "skills": [{"name": "design"}, {"name": "review"}]}
+    )
+    section = build_peer_section([peer])
+    assert "Skills: design, review" in section
+    assert "Role: DB Role" not in section
+
+
+def test_build_peer_section_mixed_peers():
+    peers = [
+        _peer(id="ws-a"),
+        _peer(
+            id="ws-b",
+            agent_card={"name": "Card B", "skills": [{"name": "build"}]},
+        ),
+    ]
+    section = build_peer_section(peers)
+    assert "id: `ws-a`" in section
+    assert "id: `ws-b`" in section
+    assert "Role: DB Role" in section
+    assert "Skills: build" in section
+
+
+def test_build_peer_section_empty_when_no_peers():
+    assert build_peer_section([]) == ""

From 96acbd719b73dcc8b391f63662d52dac1fe593e9 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Tue, 28 Apr 2026 14:15:42 -0700
Subject: [PATCH 4/7] test: update test_peer_capabilities_format for fallback
 behavior
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous assertion `'Silent Agent' not in result` was pinning
the buggy behavior — peers without an agent_card were silently
dropped from the prompt. With the fallback to DB name+role those
peers are correctly visible. Flip the assertion so the test pins
the new (correct) rendering and would catch a regression to the
silent-drop behavior.
---
 workspace/tests/test_prompt.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/workspace/tests/test_prompt.py b/workspace/tests/test_prompt.py
index 0fb4bd98..133a5d7e 100644
--- a/workspace/tests/test_prompt.py
+++ b/workspace/tests/test_prompt.py
@@ -203,8 +203,11 @@ def test_peer_capabilities_format(tmp_path):
     assert "**Echo Agent** (id: `peer-1`, status: online)" in result
     assert "Skills: echo, repeat" in result
     assert "delegate_to_workspace" in result
-    # peer-2 has no agent_card so it's skipped
-    assert "Silent Agent" not in result
+    # peer-2 has no agent_card but DOES have a DB name + status — must
+    # still render so coordinators can delegate to freshly-created peers
+    # whose A2A discovery hasn't populated a card yet (regression of the
+    # 2026-04-27 Design Director discovery bug).
+    assert "**Silent Agent** (id: `peer-2`, status: offline)" in result
 
 
 def test_peer_with_json_string_agent_card(tmp_path):

From c59715e143c2be5b400d59d2899b5d05ebd4277b Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Tue, 28 Apr 2026 14:43:43 -0700
Subject: [PATCH 5/7] =?UTF-8?q?feat(ci):=20auto-sync=20main=20=E2=86=92=20?=
 =?UTF-8?q?staging=20to=20keep=20staging-as-superset=20invariant?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Background

`auto-promote-staging.yml` advances main via `git merge --ff-only`
+ `git push origin main` — clean fast-forward, no merge commit. But
manual `staging → main` merges via the GitHub UI / API create a merge
commit on main that staging doesn't have. The next `staging → main`
PR then evaluates as "BEHIND" because staging is missing that merge
commit, requiring a manual `gh pr update-branch` round-trip.

This pattern bit twice on 2026-04-28 (PRs #2202 and #2205, both
manual bridges to land pipeline fixes themselves). Each needed
update-branch + re-CI before they could merge. Annoying and
avoidable.

What this workflow does

Triggered on every push to main (regardless of source: auto-promote,
UI merge, API merge, direct push):

  1. Check whether main is already in staging's ancestry. If yes,
     no-op — auto-promote-staging keeps them aligned via ff push,
     and the no-op case is the steady state.

  2. If not (manual merge commit on main, or direct main hotfix):
     try `git merge --ff-only origin/main` first. Works when staging
     hasn't diverged with its own commits.

  3. If ff fails (staging has its own in-flight feature work):
     `git merge --no-ff origin/main -m "chore: sync main → staging"`.
     Absorbs main's tip while keeping staging's own history.

  4. Push staging.

Loop safety

Pushing the synced staging triggers auto-promote-staging.yml, which
checks gates on staging's new tip and, if green, ff-pushes staging
to main. Since staging now ⊇ main, the resulting push to main is
either a no-op (no ref change → no push event fires → auto-sync
doesn't re-trigger) or advances main further. In the latter case
auto-sync fires once more, sees main already in staging's ancestry,
no-ops. Bounded.

Conflict handling

If the merge step hits conflicts (staging and main diverged with
incompatible changes), the workflow fails with a clear summary
pointing to manual resolution. This shouldn't happen in practice —
staging is the integration branch; conflicts indicate a direct main
hotfix touching the same code as in-flight staging work.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../workflows/auto-sync-main-to-staging.yml   | 136 ++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 .github/workflows/auto-sync-main-to-staging.yml

diff --git a/.github/workflows/auto-sync-main-to-staging.yml b/.github/workflows/auto-sync-main-to-staging.yml
new file mode 100644
index 00000000..83156254
--- /dev/null
+++ b/.github/workflows/auto-sync-main-to-staging.yml
@@ -0,0 +1,136 @@
+name: Auto-sync main → staging
+
+# Reflects every push to `main` back onto `staging` so the
+# staging-as-superset-of-main invariant holds.
+#
+# Background:
+#
+# `auto-promote-staging.yml` advances main via `git merge --ff-only`
+# + `git push origin main` — that's a clean fast-forward, no merge
+# commit. But manual merges of `staging → main` PRs through the
+# GitHub UI / API create a merge commit on main that staging
+# doesn't have. The next `staging → main` PR then evaluates as
+# "BEHIND" because staging is missing that merge commit, requiring
+# a manual `gh pr update-branch` round-trip.
+#
+# This happened twice on 2026-04-28 (PRs #2202, #2205, both manual
+# bridges). Each time the bridge needed update-branch + a re-CI
+# round before merging. Operationally annoying and avoidable.
+#
+# This workflow closes the gap automatically:
+#
+#   1. Push to main fires (regardless of source: auto-promote, UI
+#      merge, API merge, direct push).
+#   2. Check whether main is already in staging's ancestry — if
+#      yes, no-op (auto-promote-staging already kept them in sync
+#      via fast-forward).
+#   3. If not, try fast-forward staging to main first (works when
+#      staging hasn't diverged with its own commits).
+#   4. If ff fails (staging has commits main doesn't — feature work
+#      in flight), do a real merge with a "chore: sync" commit so
+#      staging absorbs main's tip while keeping its own history.
+#   5. Push staging.
+#
+# Loop safety:
+#
+# Pushing the synced staging triggers `auto-promote-staging.yml`,
+# which checks gates on staging's new tip and, if green, ff-pushes
+# staging to main. Since staging now == main (ff case) or ⊇ main
+# (merge case where promote then advances), the resulting push to
+# main is either a no-op (no actual ref change → no push event) or
+# advances main further. In the latter case auto-sync fires again,
+# sees main already in staging's ancestry, no-ops. No infinite loop.
+
+on:
+  push:
+    branches: [main]
+
+permissions:
+  contents: write
+
+jobs:
+  sync-staging:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout staging
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: staging
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Configure git author
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+
+      - name: Check if staging already contains main
+        id: check
+        run: |
+          set -euo pipefail
+          git fetch origin main
+          if git merge-base --is-ancestor origin/main HEAD; then
+            echo "needs_sync=false" >> "$GITHUB_OUTPUT"
+            {
+              echo "## ✅ No-op"
+              echo
+              echo "staging already contains \`origin/main\` ($(git rev-parse --short=8 origin/main))."
+              echo "auto-promote-staging or a previous auto-sync run already kept them aligned."
+            } >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "needs_sync=true" >> "$GITHUB_OUTPUT"
+            echo "::notice::staging is missing main's tip — sync needed"
+          fi
+
+      - name: Fast-forward staging → main
+        if: steps.check.outputs.needs_sync == 'true'
+        id: ff
+        run: |
+          set -euo pipefail
+          if git merge --ff-only origin/main; then
+            echo "did_ff=true" >> "$GITHUB_OUTPUT"
+            echo "::notice::Fast-forwarded staging to origin/main"
+          else
+            echo "did_ff=false" >> "$GITHUB_OUTPUT"
+            echo "::notice::ff failed — staging has its own commits; will create merge"
+          fi
+
+      - name: Merge main into staging (when ff fails)
+        if: |
+          steps.check.outputs.needs_sync == 'true' &&
+          steps.ff.outputs.did_ff != 'true'
+        run: |
+          set -euo pipefail
+          # ff failed because staging has commits main doesn't — typical
+          # in-flight feature work. Create a merge commit so staging
+          # absorbs main's tip while keeping its own history.
+          if ! git merge --no-ff origin/main -m "chore: sync main → staging (auto)"; then
+            {
+              echo "## ❌ Conflict"
+              echo
+              echo "Auto-merge \`main → staging\` failed with conflicts."
+              echo "A human needs to resolve manually:"
+              echo
+              echo "    git checkout staging"
+              echo "    git merge origin/main"
+              echo "    # resolve, commit, push"
+            } >> "$GITHUB_STEP_SUMMARY"
+            exit 1
+          fi
+
+      - name: Push staging
+        if: steps.check.outputs.needs_sync == 'true'
+        run: |
+          set -euo pipefail
+          git push origin staging
+          {
+            if [ "${{ steps.ff.outputs.did_ff }}" = "true" ]; then
+              echo "## ✅ staging fast-forwarded"
+              echo
+              echo "staging is now at \`$(git rev-parse --short=8 HEAD)\` (== origin/main)."
+            else
+              echo "## ✅ staging absorbed main"
+              echo
+              echo "staging is now at \`$(git rev-parse --short=8 HEAD)\` with a merge commit absorbing main's tip."
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"

From 97d5883e76ece0c99886030168d3b1fb22e203a3 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Tue, 28 Apr 2026 14:59:23 -0700
Subject: [PATCH 6/7] fix(ci): auto-sync concurrency + cleanup follow-ups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three small fixes from the self-review of #2209:

1. **Required: concurrency group.** Two pushes to main in quick
   succession (manual UI merge then auto-promote-staging's ff-push,
   or any back-to-back main pushes) would race two auto-sync runs
   against the same staging branch — second `git push origin staging`
   fails non-fast-forward, surfacing as a red CI alert for what should
   be a no-op. Add `concurrency: { group: auto-sync-main-to-staging,
   cancel-in-progress: false }` so the second run waits for the first
   and sees its result.

2. **Hygiene: `git merge --abort` on conflict.** The conflict-error
   path exits 1 with the work tree in a half-merged state. Doesn't
   affect future runs (each gets a fresh checkout) but is an
   unpleasant artifact for anyone who shells into the runner. Abort
   first, then exit.

3. **Doc accuracy: "Loop safety" comment.** The original said the
   chain terminates because "main is either a no-op or advances
   further." That's true but understates the actual safety: GitHub
   Actions explicitly does NOT trigger downstream workflow runs from
   `GITHUB_TOKEN`-authored pushes. So the loop is impossible by
   construction, not just by happy coincidence of ref state. Updated
   the comment to reflect the actual mechanism.

Plus a step-name nit: "Fast-forward staging → main" reads as if main
is the target. Renamed to "Fast-forward staging to main" for
consistency with the workflow's name (main → staging).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../workflows/auto-sync-main-to-staging.yml   | 35 +++++++++++++------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/auto-sync-main-to-staging.yml b/.github/workflows/auto-sync-main-to-staging.yml
index 83156254..278c3428 100644
--- a/.github/workflows/auto-sync-main-to-staging.yml
+++ b/.github/workflows/auto-sync-main-to-staging.yml
@@ -33,13 +33,19 @@ name: Auto-sync main → staging
 #
 # Loop safety:
 #
-# Pushing the synced staging triggers `auto-promote-staging.yml`,
-# which checks gates on staging's new tip and, if green, ff-pushes
-# staging to main. Since staging now == main (ff case) or ⊇ main
-# (merge case where promote then advances), the resulting push to
-# main is either a no-op (no actual ref change → no push event) or
-# advances main further. In the latter case auto-sync fires again,
-# sees main already in staging's ancestry, no-ops. No infinite loop.
+# `GITHUB_TOKEN`-authored pushes do NOT trigger downstream workflow
+# runs by default (GitHub Actions safety). So when this workflow
+# pushes the synced staging, `auto-promote-staging.yml` is NOT
+# triggered by that push. The next developer push to staging triggers
+# auto-promote normally. No loop is even theoretically possible.
+#
+# Concurrency:
+#
+# Two pushes to main in quick succession (e.g., manual UI merge
+# immediately followed by auto-promote-staging's ff-merge) would
+# otherwise race two auto-sync runs against the same staging branch
+# — second push fails non-fast-forward. The concurrency group
+# serializes them so the second run sees the first's result.
 
 on:
   push:
@@ -48,6 +54,10 @@ on:
 permissions:
   contents: write
 
+concurrency:
+  group: auto-sync-main-to-staging
+  cancel-in-progress: false
+
 jobs:
   sync-staging:
     runs-on: ubuntu-latest
@@ -82,7 +92,7 @@ jobs:
             echo "::notice::staging is missing main's tip — sync needed"
           fi
 
-      - name: Fast-forward staging → main
+      - name: Fast-forward staging to main
         if: steps.check.outputs.needs_sync == 'true'
         id: ff
         run: |
@@ -96,15 +106,18 @@ jobs:
           fi
 
       - name: Merge main into staging (when ff fails)
-        if: |
-          steps.check.outputs.needs_sync == 'true' &&
-          steps.ff.outputs.did_ff != 'true'
+        if: steps.check.outputs.needs_sync == 'true' && steps.ff.outputs.did_ff != 'true'
         run: |
           set -euo pipefail
           # ff failed because staging has commits main doesn't — typical
           # in-flight feature work. Create a merge commit so staging
           # absorbs main's tip while keeping its own history.
           if ! git merge --no-ff origin/main -m "chore: sync main → staging (auto)"; then
+            # Hygiene: leave the work tree clean before failing. Doesn't
+            # affect future runs (each gets a fresh checkout) but a
+            # half-merged tree is an unpleasant artifact to debug if
+            # anyone ever shells into the runner.
+            git merge --abort || true
             {
               echo "## ❌ Conflict"
               echo

From 6638d6e1d7984f5ab057e9b3a423b2592ac807e7 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Tue, 28 Apr 2026 15:29:09 -0700
Subject: [PATCH 7/7] feat(ci): SECRET_PATTERNS drift lint across known
 consumers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a lint that diffs the canonical SECRET_PATTERNS array in
.github/workflows/secret-scan.yml against every known public
consumer mirror, failing on any divergence.

Why: every side that scans for credentials carries its own copy of
the pattern list. They drift — most recently the workspace-runtime
pre-commit hook lagged the canonical by one pattern (sk-cp- /
MiniMax F1088 vector), so a developer's local pre-commit would let
a sk-cp- token through while the org-wide CI scan would refuse it.
Useless friction; automated detection closes the gap.

Implementation:
  .github/scripts/lint_secret_pattern_drift.py — pure stdlib, fetches
    each consumer's RAW file via urllib, extracts the
    SECRET_PATTERNS=( ... ) array via anchored regex (the closing
    `)` is anchored to the start of a line because pattern comments
    like `# GitHub PAT (classic)` contain their own paren mid-line),
    diffs against canonical, fails on missing or extra patterns.
    Fetch failures are warnings, not errors — a consumer whose
    branch was renamed shouldn't fail the lint until someone updates
    the URL list.

  .github/workflows/secret-pattern-drift.yml — daily 05:00 UTC cron
    + on-push gate (when canonical, the workflow, or the script
    changes) + workflow_dispatch. Read-only token, 5-minute timeout.

Initial consumer set: workspace-runtime's bundled pre-commit hook
(the one that drifted on sk-cp-). molecule-controlplane's inlined
copy is private so this workflow can't read it; that's tracked
separately, and until the controlplane monitors its own copy that
mirror remains an uncovered gap.

Verified locally: lint detects drift correctly when the runtime
hook is missing sk-cp-, returns clean when aligned.

Refs: task #139.
---
 .github/scripts/lint_secret_pattern_drift.py | 134 +++++++++++++++++++
 .github/workflows/secret-pattern-drift.yml   |  57 ++++++++
 2 files changed, 191 insertions(+)
 create mode 100644 .github/scripts/lint_secret_pattern_drift.py
 create mode 100644 .github/workflows/secret-pattern-drift.yml

diff --git a/.github/scripts/lint_secret_pattern_drift.py b/.github/scripts/lint_secret_pattern_drift.py
new file mode 100644
index 00000000..6c1b7965
--- /dev/null
+++ b/.github/scripts/lint_secret_pattern_drift.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+"""Lint SECRET_PATTERNS drift across known consumers of molecule-core's canonical.
+
+The canonical SECRET_PATTERNS array in
+.github/workflows/secret-scan.yml is mirrored by every other side
+that scans for credentials: the workspace-runtime's bundled
+pre-commit hook, the molecule-controlplane inlined copy, etc. The
+mirror is enforced socially today — when someone adds a new pattern
+to canonical (e.g. the sk-cp- MiniMax token after F1088), the other
+sides are supposed to be updated in lockstep.
+
+This script automates the check. Diffs the canonical's pattern set
+against each known public consumer and exits non-zero on any
+mismatch. Wired into a daily cron + on-push gate via
+.github/workflows/secret-pattern-drift.yml.
+
+Private-repo consumers (currently molecule-controlplane's inlined
+copy) are out of scope here because the molecule-core workflow's
+GITHUB_TOKEN can't read other private repos in the org. They're
+expected to self-monitor via their own copy of this script — not a
+hard barrier, just a future expansion.
+"""
+
+from __future__ import annotations
+
+import re
+import sys
+import urllib.request
+from pathlib import Path
+
+CANONICAL_FILE = Path(".github/workflows/secret-scan.yml")
+
+# Public consumer mirrors. Each entry is (label, raw_url) — raw_url
+# points at the file's RAW content on the consumer's default branch
+# (or staging where applicable). Add an entry here when a new public
+# repo starts shipping its own SECRET_PATTERNS array.
+CONSUMERS: list[tuple[str, str]] = [
+    (
+        "molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh",
+        "https://raw.githubusercontent.com/Molecule-AI/molecule-ai-workspace-runtime/main/molecule_runtime/scripts/pre-commit-checks.sh",
+    ),
+]
+
+# Matches the SECRET_PATTERNS=( ... ) array in either yaml-indented
+# (the canonical workflow's `run:` block) or shell-flat (runtime
+# hook) format. Patterns inside are single-quoted Bash strings,
+# pulled out via _PATTERN_RE. NOTE: a stray `'` inside an array
+# comment would mis-pair the quotes and corrupt the extraction.
+#
+# Closing `)` is anchored to the start of a line (indent allowed):
+# comments like `# GitHub PAT (classic)` hold a mid-line `)`, where
+# an unanchored lazy match would stop, capturing just one pattern.
+_ARRAY_RE = re.compile(r"SECRET_PATTERNS=\((.*?)^\s*\)", re.DOTALL | re.MULTILINE)
+_PATTERN_RE = re.compile(r"'([^']+)'")
+
+
+def extract_patterns(content: str, source_label: str) -> list[str]:
+    """Return each single-quoted entry of the SECRET_PATTERNS array; SystemExit if absent."""
+    array_match = _ARRAY_RE.search(content)
+    if array_match is None:
+        raise SystemExit(f"::error::{source_label}: SECRET_PATTERNS=(...) array not found")
+    return _PATTERN_RE.findall(array_match.group(1))
+
+
+def fetch(url: str) -> str:
+    """GET url with a custom User-Agent and a 30 s timeout; return the body as UTF-8 text."""
+    headers = {"User-Agent": "secret-pattern-drift-lint/1"}
+    request = urllib.request.Request(url, headers=headers)
+    with urllib.request.urlopen(request, timeout=30) as response:
+        return response.read().decode("utf-8")
+
+
+def diff_patterns(canonical: list[str], consumer: list[str]) -> tuple[list[str], list[str]]:
+    """Compute the sorted (missing_from_consumer, extra_in_consumer) pair."""
+    # Set comparison: ordering and duplicate entries never register as drift.
+    wanted = set(canonical)
+    found = set(consumer)
+    missing_from_consumer = sorted(wanted.difference(found))
+    extra_in_consumer = sorted(found.difference(wanted))
+    return missing_from_consumer, extra_in_consumer
+
+
+def main() -> int:
+    """Diff each consumer against canonical; return 1 on drift, else 0."""
+    if not CANONICAL_FILE.exists():
+        print(f"::error::canonical not found at {CANONICAL_FILE}")
+        return 1
+
+    # Decode as UTF-8 explicitly (as fetch() does), not via the locale.
+    canonical = extract_patterns(CANONICAL_FILE.read_text(encoding="utf-8"), str(CANONICAL_FILE))
+    print(f"canonical ({CANONICAL_FILE}): {len(canonical)} patterns")
+
+    drift, skipped = False, 0
+    for label, url in CONSUMERS:
+        try:
+            content = fetch(url)
+        except Exception as e:
+            # Fetch failures warn rather than fail (a stale URL is not
+            # drift); count them so the summary never overstates coverage.
+            print(f"::warning::{label}: fetch failed ({e}) — skipping")
+            skipped += 1
+            continue
+
+        consumer = extract_patterns(content, label)
+        missing, extra = diff_patterns(canonical, consumer)
+        if not missing and not extra:
+            print(f"  ✓ {label}: aligned ({len(consumer)} patterns)")
+            continue
+
+        drift = True
+        print(f"::error::DRIFT in {label}:")
+        for p in missing:
+            print(f"  -  missing from consumer: {p!r}")
+        for p in extra:
+            print(f"  -  extra in consumer (not in canonical): {p!r}")
+
+    if drift:
+        print()
+        print("::error::SECRET_PATTERNS drift detected. Bring consumer(s) into")
+        print("alignment with the canonical SECRET_PATTERNS array in")
+        print(f"{CANONICAL_FILE} by adding the missing patterns and removing")
+        print("any extras. The two sides must stay byte-aligned on the pattern")
+        print("list — the runtime hook is the developer's local pre-commit,")
+        print("the canonical is the org-wide CI gate, divergence means a token")
+        print("can pass one but get rejected by the other.")
+        return 1
+
+    scope = f"{len(CONSUMERS) - skipped} of {len(CONSUMERS)}" if skipped else "All"
+    print(f"\n✓ {scope} known consumers aligned with canonical SECRET_PATTERNS.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/.github/workflows/secret-pattern-drift.yml b/.github/workflows/secret-pattern-drift.yml
new file mode 100644
index 00000000..554bab35
--- /dev/null
+++ b/.github/workflows/secret-pattern-drift.yml
@@ -0,0 +1,57 @@
+name: SECRET_PATTERNS drift lint
+
+# Detects when the canonical SECRET_PATTERNS array in
+# .github/workflows/secret-scan.yml diverges from known consumer
+# mirrors (workspace-runtime's bundled pre-commit hook today; more
+# can be added as the consumer set grows).
+#
+# Why this exists: every side that scans for credentials has its own
+# copy of the pattern list. They drift — most recently the runtime
+# hook lagged the canonical by one pattern (sk-cp- / MiniMax F1088),
+# so a developer's local pre-commit would let a sk-cp- token through
+# while the org-wide CI scan would refuse it. The cost of that drift
+# is dev confusion + delayed feedback; the fix is automated detection.
+#
+# Triggers:
+#   - schedule: daily 05:00 UTC. Catches drift introduced by edits
+#     to a consumer copy that didn't update canonical here.
+#   - push to main/staging where the canonical or this lint changed:
+#     catches the inverse — canonical updated but consumers not yet
+#     bumped. The run for that push goes red (the push itself still
+#     lands); that's intentional — whoever edits canonical is the
+#     right person to also update the consumer.
+#   - workflow_dispatch: ad-hoc operator runs.
+
+on:
+  schedule:
+    # 05:00 UTC = 22:00 PDT / 01:00 EDT (one hour earlier in winter).
+    # Off-hours on purpose: a failure email is waiting when humans
+    # start their day instead of interrupting it.
+    - cron: "0 5 * * *"
+  push:
+    branches: [main, staging]
+    paths:
+      - ".github/workflows/secret-scan.yml"
+      - ".github/workflows/secret-pattern-drift.yml"
+      - ".github/scripts/lint_secret_pattern_drift.py"
+  workflow_dispatch:
+
+# GITHUB_TOKEN scoped to read-only. The lint only does git checkout
+# + HTTPS GETs to public consumer files; no writes to anything.
+permissions:
+  contents: read
+
+jobs:
+  lint:
+    name: Detect SECRET_PATTERNS drift
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Run drift lint
+        run: python3 .github/scripts/lint_secret_pattern_drift.py