From 9d4ab7b1a2a51f785a2d8552e4fba6f7cd9af15a Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 28 Apr 2026 13:46:39 -0700 Subject: [PATCH 1/7] =?UTF-8?q?feat(ci):=20auto-promote-on-e2e=20=E2=80=94?= =?UTF-8?q?=20retag=20:latest=20on=20green=20E2E=20Staging=20SaaS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the final gap in the SaaS pipeline. After auto-promote-staging fast-forwards main, publish-workspace-server-image builds new `:staging-<sha>` images, but `:latest` (what prod tenants pull) only moves on either a manual `promote-latest.yml` dispatch or a canary-verify retag (gated on Phase 2 fleet that doesn't exist). This workflow closes that gap by retagging `platform:staging-<sha>` + `platform-tenant:staging-<sha>` → `:latest` whenever E2E Staging SaaS passes for a `main` push. Uses crane (no Docker daemon needed). Verifies both images exist before retagging either, so a half-published state is impossible. Why trigger only on `main` (not staging): - `:latest` is what prod tenants pull. Only SHAs that have reached `main` (via auto-promote-staging) should advance `:latest`. - Triggering on staging would let a staging-only revert advance `:latest` to a SHA that never reaches `main`, breaking the invariant "production runs what's on `main`". Why a separate workflow rather than folding into e2e-staging-saas.yml: - Test concerns and release concerns stay separate. - Disabling promote during an incident is one workflow toggle, not an edit to the long E2E file. - When Phase 2 canary work eventually lands, the canary path can replace this trigger without touching the E2E workflow. Doc-aligned: per molecule-controlplane/docs/canary-tenants.md, "green staging E2E → :latest" is the recommended approach for the current scale (≤20 paying tenants); canary fleet is deferred until blast radius grows.
Pipeline after this lands is fully self-healing: staging push → 4 gates green → auto-promote fast-forwards main → publish-workspace-server-image → E2E Staging SaaS → THIS WORKFLOW retags :latest → tenant fleet auto-pulls in 5 min (or redeploy-tenants-on-main fans out faster) Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/auto-promote-on-e2e.yml | 114 ++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 .github/workflows/auto-promote-on-e2e.yml diff --git a/.github/workflows/auto-promote-on-e2e.yml b/.github/workflows/auto-promote-on-e2e.yml new file mode 100644 index 00000000..21f901e9 --- /dev/null +++ b/.github/workflows/auto-promote-on-e2e.yml @@ -0,0 +1,114 @@ +name: Auto-promote :latest on E2E green + +# Retags `ghcr.io/molecule-ai/{platform,platform-tenant}:staging-` +# → `:latest` whenever E2E Staging SaaS passes for a `main` push. +# +# This is the doc-aligned alternative to the (deferred) Phase 2 canary +# fleet — staging E2E catches ~90% of what canary would catch at 0% +# ongoing infra cost. See `molecule-controlplane/docs/canary-tenants.md` +# section "Do we actually need canary right now?" — recommended +# sequencing for the current scale (≤20 paying tenants). +# +# Why a separate workflow rather than folding into e2e-staging-saas.yml: +# - Keeps test concerns separate from release concerns. +# - Disabling promote (e.g. during an incident) is one toggle, not an +# edit to the long E2E workflow file. +# - When Phase 2 canary work eventually lands, the canary path can +# replace this file's trigger without touching the E2E workflow. +# +# Why trigger on `main` only: +# - `:latest` is what prod tenants pull. We only want SHAs that have +# reached `main` (via auto-promote-staging) to advance `:latest`. +# - Triggering on staging would let a staging-only revert advance +# `:latest` to a SHA that never reaches `main`, breaking the +# "production runs what's on `main`" invariant. 
+ +on: + workflow_run: + workflows: ['E2E Staging SaaS (full lifecycle)'] + types: [completed] + branches: [main] + workflow_dispatch: + inputs: + sha: + description: 'Short sha to promote (override; defaults to upstream workflow_run head_sha)' + required: false + type: string + +permissions: + contents: read + packages: write + +env: + IMAGE_NAME: ghcr.io/molecule-ai/platform + TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant + +jobs: + promote: + # Skip if E2E failed — `:latest` stays on the prior known-good + # digest. Manual dispatch always proceeds (the operator already + # decided to promote). + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') + runs-on: ubuntu-latest + steps: + - name: Compute short sha + id: sha + run: | + set -euo pipefail + if [ -n "${{ github.event.inputs.sha }}" ]; then + FULL="${{ github.event.inputs.sha }}" + else + FULL="${{ github.event.workflow_run.head_sha }}" + fi + echo "short=${FULL:0:7}" >> "$GITHUB_OUTPUT" + echo "full=${FULL}" >> "$GITHUB_OUTPUT" + + - uses: imjasonh/setup-crane@v0.4 + + - name: GHCR login + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | \ + crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin + + - name: Verify :staging- exists for both images + # Better to fail fast with a clear message than to half-tag + # (platform retagged but platform-tenant missing → tenants pull + # a stale image). + run: | + set -euo pipefail + for img in "${IMAGE_NAME}" "${TENANT_IMAGE_NAME}"; do + tag="${img}:staging-${{ steps.sha.outputs.short }}" + if ! crane manifest "$tag" >/dev/null 2>&1; then + echo "::error::Missing tag: $tag" + echo "::error::publish-workspace-server-image must complete on this SHA before auto-promote-on-e2e can retag :latest." 
+ exit 1 + fi + echo " ok: $tag exists" + done + + - name: Retag platform :staging- → :latest + run: | + crane tag "${IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest + + - name: Retag tenant :staging- → :latest + run: | + crane tag "${TENANT_IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest + + - name: Summary + run: | + { + echo "## E2E green → :latest promoted" + echo + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "- Trigger: manual dispatch" + else + echo "- Upstream E2E run: ${{ github.event.workflow_run.html_url }}" + fi + echo "- platform:staging-${{ steps.sha.outputs.short }} → :latest" + echo "- platform-tenant:staging-${{ steps.sha.outputs.short }} → :latest" + echo + echo "Tenant fleet auto-pulls within 5 min via IMAGE_AUTO_REFRESH=true." + echo "Force immediate fanout: dispatch redeploy-tenants-on-main.yml." + } >> "$GITHUB_STEP_SUMMARY" From 2c8792d3e019811bfbbe63b858862f9ad9149c1d Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 28 Apr 2026 14:02:50 -0700 Subject: [PATCH 2/7] fix(ci): printf format-string sink + filename word-split in secret-scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two latent bash bugs in the canonical secret-scan workflow caught during the post-merge review of molecule-controlplane #301 (a private consumer that inlined this workflow's logic and got both fixes there). Same bugs apply here; fixing in canonical means every public consumer (gh-identity, github-app-auth, the 8 workspace template repos) inherits the fix on their next workflow_call. Bug 1: `printf "$OFFENDING"` is a format-string sink. OFFENDING is built from filenames: `${f} (matched: ${pattern})\n`. When passed to printf as the first argument, `%` characters in a filename are interpreted as conversion specifiers — corrupting the error message or printing `%(missing)` artifacts. 
No filename in the current tree triggers it, but a future test fixture, build artifact, or contributor-supplied path could. Fix: `printf '%b' "$OFFENDING"` interprets the literal `\n` we appended without treating OFFENDING as a format string. Bug 2: `for f in $CHANGED` word-splits on whitespace. Filenames containing spaces would split into multiple tokens. The self-exclude check (`[ "$f" = "$SELF" ] && continue`) and the diff lookup would both operate on partial-path tokens. No filename in the current tree has whitespace, but the failure would be silent if one ever did. Fix: `while IFS= read -r f; do ... done <<< "$CHANGED"` reads whole lines as filenames. Added `[ -z "$f" ] && continue` to match the original `for` loop's implicit empty-input skip. Both fixes are mechanically straightforward (~16 lines net diff, mostly comments documenting the why). No behavior change for filenames in the current tree; strictly better for the edge cases. The same fixes already shipped in molecule-controlplane via #301 which inlined a copy of this workflow. The runtime's bundled pre-commit hook (molecule-ai-workspace-runtime: molecule_runtime/scripts/pre-commit-checks.sh) likely has the same bugs — flagged as a follow-up. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/secret-scan.yml | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/secret-scan.yml b/.github/workflows/secret-scan.yml index 2d1e557e..cebf89e9 100644 --- a/.github/workflows/secret-scan.yml +++ b/.github/workflows/secret-scan.yml @@ -148,7 +148,13 @@ jobs: SELF=".github/workflows/secret-scan.yml" OFFENDING="" - for f in $CHANGED; do + # `while IFS= read -r` (not `for f in $CHANGED`) so filenames + # containing whitespace don't word-split silently — a path + # with a space would otherwise produce two iterations on + # tokens that aren't real filenames, breaking the + # self-exclude + diff lookup. 
+ while IFS= read -r f; do + [ -z "$f" ] && continue [ "$f" = "$SELF" ] && continue if [ -n "$DIFF_RANGE" ]; then ADDED=$(git diff --no-color --unified=0 "$BASE" "$HEAD" -- "$f" 2>/dev/null | grep -E '^\+[^+]' || true) @@ -164,11 +170,18 @@ jobs: break fi done - done + done <<< "$CHANGED" if [ -n "$OFFENDING" ]; then echo "::error::Credential-shaped strings detected in diff additions:" - printf "$OFFENDING" + # `printf '%b' "$OFFENDING"` interprets backslash escapes + # (the literal `\n` we appended above becomes a newline) + # WITHOUT treating OFFENDING as a format string. Plain + # `printf "$OFFENDING"` is a format-string sink: a filename + # containing `%` would be interpreted as a conversion + # specifier, corrupting the error message (or printing + # `%(missing)` artifacts). + printf '%b' "$OFFENDING" echo "" echo "The actual matched values are NOT echoed here, deliberately —" echo "round-tripping a leaked credential into CI logs widens the blast" From 8ff0748ab9af4948c5b9592a4f61a2b54b7b2cff Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 28 Apr 2026 14:10:29 -0700 Subject: [PATCH 3/7] fix(workspace): keep peers visible in coordinator prompt when agent_card is null MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug: a Design Director coordinator with 6 freshly-created worker peers rendered an empty `## Your Peers` section in its system prompt — the hosting registry endpoint correctly returned all 6 peers, but `summarize_peer_cards()` silently dropped every entry whose `agent_card` column was null (the default until A2A discovery has run end-to-end against the worker). The coordinator then refused to delegate any task because "no peers exist". Fix: fall back to the registry row's `name` and `role` columns when `agent_card` is missing, malformed, or wrong-typed, instead of skipping the peer. 
The registry endpoint (`workspace-server/internal/handlers/discovery.go:queryPeerMaps`) has always returned both fields — they were just being thrown away on the consumer side. `build_peer_section()` now renders `Role: …` when the agent_card-derived skill list is empty so the coordinator's prompt still has something concrete to delegate against. Also hoists `import json` out of the per-peer loop body to module level (was previously imported once per iteration). Tests: new `test_shared_runtime_peer_summary.py` pins all four fallback cases (null / malformed string / wrong type / null + no DB name) plus the agent-card-present happy path and the mixed-list case the coordinator actually consumes. First peer-summary test coverage `shared_runtime.py` has had — no prior tests existed. Refs: 2026-04-27 Design Director discovery report from infra team. --- workspace/shared_runtime.py | 44 ++++--- .../tests/test_shared_runtime_peer_summary.py | 111 ++++++++++++++++++ 2 files changed, 141 insertions(+), 14 deletions(-) create mode 100644 workspace/tests/test_shared_runtime_peer_summary.py diff --git a/workspace/shared_runtime.py b/workspace/shared_runtime.py index dba05700..a874356a 100644 --- a/workspace/shared_runtime.py +++ b/workspace/shared_runtime.py @@ -2,6 +2,7 @@ from __future__ import annotations +import json from typing import Any from a2a.server.agent_execution import RequestContext @@ -89,33 +90,46 @@ def append_peer_guidance( def summarize_peer_cards(peers: list[dict[str, Any]]) -> list[dict[str, Any]]: - """Return compact peer metadata for prompt rendering.""" + """Return compact peer metadata for prompt rendering. + + Falls back to the registry row's `name` and `role` when `agent_card` is + null or unparseable so peers stay visible to delegators even before + their A2A discovery roundtrip has populated a card. 
Without this + fallback a coordinator-tier workspace with N freshly-created worker + peers would render an empty `## Your Peers` section and refuse to + delegate (the regression behind the 2026-04-27 Design Director + discovery bug). + """ summaries: list[dict[str, Any]] = [] for peer in peers: agent_card = peer.get("agent_card") - if not agent_card: - continue if isinstance(agent_card, str): try: - import json - agent_card = json.loads(agent_card) except Exception: - continue + agent_card = None if not isinstance(agent_card, dict): - continue + agent_card = None + + if agent_card: + skills_raw = agent_card.get("skills") or [] + skills = [ + s.get("name", s.get("id", "")) + for s in skills_raw + if isinstance(s, dict) + ] + name = agent_card.get("name") or peer.get("name") or "Unknown" + else: + skills = [] + name = peer.get("name") or "Unknown" - skills = agent_card.get("skills", []) summaries.append( { "id": peer.get("id", "unknown"), - "name": agent_card.get("name", peer.get("name", "Unknown")), + "name": name, + "role": peer.get("role") or "", "status": peer.get("status", "unknown"), - "skills": [ - s.get("name", s.get("id", "")) - for s in skills - if isinstance(s, dict) - ], + "skills": skills, } ) return summaries @@ -140,6 +154,8 @@ def build_peer_section( parts.append(f"- **{peer['name']}** (id: `{peer['id']}`, status: {peer['status']})") if peer["skills"]: parts.append(f" Skills: {', '.join(peer['skills'])}") + elif peer.get("role"): + parts.append(f" Role: {peer['role']}") parts.append("") parts.append(instruction) return "\n".join(parts) diff --git a/workspace/tests/test_shared_runtime_peer_summary.py b/workspace/tests/test_shared_runtime_peer_summary.py new file mode 100644 index 00000000..2628c279 --- /dev/null +++ b/workspace/tests/test_shared_runtime_peer_summary.py @@ -0,0 +1,111 @@ +"""Pin peer-summary fallback when agent_card is missing. 
+ +Regresses the 2026-04-27 Design Director discovery bug: +`summarize_peer_cards()` previously skipped any peer whose `agent_card` +was null or unparseable, so a coordinator with freshly-created workers +saw an empty `## Your Peers` section in its system prompt and refused +to delegate. The registry endpoint already returns DB `name` + `role` +on every row regardless of agent_card state — falling back to those +keeps peers visible while A2A discovery catches up. +""" + +from __future__ import annotations + +from shared_runtime import build_peer_section, summarize_peer_cards + + +def _peer(**overrides): + base = { + "id": "ws-1", + "name": "DB Name", + "role": "DB Role", + "status": "active", + "agent_card": None, + } + base.update(overrides) + return base + + +def test_summarize_includes_peer_with_null_agent_card_using_db_fields(): + summaries = summarize_peer_cards([_peer()]) + assert len(summaries) == 1 + assert summaries[0]["id"] == "ws-1" + assert summaries[0]["name"] == "DB Name" + assert summaries[0]["role"] == "DB Role" + assert summaries[0]["status"] == "active" + assert summaries[0]["skills"] == [] + + +def test_summarize_prefers_agent_card_name_over_db_name(): + peer = _peer( + agent_card={"name": "Card Name", "skills": [{"name": "draft-spec"}]} + ) + summaries = summarize_peer_cards([peer]) + assert summaries[0]["name"] == "Card Name" + assert summaries[0]["skills"] == ["draft-spec"] + assert summaries[0]["role"] == "DB Role" + + +def test_summarize_handles_string_agent_card_json(): + peer = _peer(agent_card='{"name": "JSON Name", "skills": []}') + summaries = summarize_peer_cards([peer]) + assert summaries[0]["name"] == "JSON Name" + + +def test_summarize_falls_back_when_agent_card_string_is_malformed(): + peer = _peer(agent_card="not-valid-json") + summaries = summarize_peer_cards([peer]) + assert len(summaries) == 1 + assert summaries[0]["name"] == "DB Name" + assert summaries[0]["role"] == "DB Role" + assert summaries[0]["skills"] == [] + + +def 
test_summarize_falls_back_when_agent_card_is_wrong_type(): + peer = _peer(agent_card=42) + summaries = summarize_peer_cards([peer]) + assert len(summaries) == 1 + assert summaries[0]["name"] == "DB Name" + + +def test_summarize_handles_missing_role_and_name_with_unknown_default(): + peer = {"id": "ws-2", "status": "active", "agent_card": None} + summaries = summarize_peer_cards([peer]) + assert summaries[0]["name"] == "Unknown" + assert summaries[0]["role"] == "" + + +def test_build_peer_section_renders_role_when_skills_empty(): + section = build_peer_section([_peer()]) + assert "## Your Peers" in section + assert "**DB Name**" in section + assert "Role: DB Role" in section + assert "Skills:" not in section + + +def test_build_peer_section_prefers_skills_over_role_when_card_present(): + peer = _peer( + agent_card={"name": "Worker", "skills": [{"name": "design"}, {"name": "review"}]} + ) + section = build_peer_section([peer]) + assert "Skills: design, review" in section + assert "Role: DB Role" not in section + + +def test_build_peer_section_mixed_peers(): + peers = [ + _peer(id="ws-a"), + _peer( + id="ws-b", + agent_card={"name": "Card B", "skills": [{"name": "build"}]}, + ), + ] + section = build_peer_section(peers) + assert "id: `ws-a`" in section + assert "id: `ws-b`" in section + assert "Role: DB Role" in section + assert "Skills: build" in section + + +def test_build_peer_section_empty_when_no_peers(): + assert build_peer_section([]) == "" From 96acbd719b73dcc8b391f63662d52dac1fe593e9 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 28 Apr 2026 14:15:42 -0700 Subject: [PATCH 4/7] test: update test_peer_capabilities_format for fallback behavior MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous assertion `'Silent Agent' not in result` was pinning the buggy behavior — peers without an agent_card were silently dropped from the prompt. With the fallback to DB name+role those peers are correctly visible. 
Flip the assertion so the test pins the new (correct) rendering and would catch a regression to the silent-drop behavior. --- workspace/tests/test_prompt.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/workspace/tests/test_prompt.py b/workspace/tests/test_prompt.py index 0fb4bd98..133a5d7e 100644 --- a/workspace/tests/test_prompt.py +++ b/workspace/tests/test_prompt.py @@ -203,8 +203,11 @@ def test_peer_capabilities_format(tmp_path): assert "**Echo Agent** (id: `peer-1`, status: online)" in result assert "Skills: echo, repeat" in result assert "delegate_to_workspace" in result - # peer-2 has no agent_card so it's skipped - assert "Silent Agent" not in result + # peer-2 has no agent_card but DOES have a DB name + status — must + # still render so coordinators can delegate to freshly-created peers + # whose A2A discovery hasn't populated a card yet (regression of the + # 2026-04-27 Design Director discovery bug). + assert "**Silent Agent** (id: `peer-2`, status: offline)" in result def test_peer_with_json_string_agent_card(tmp_path): From c59715e143c2be5b400d59d2899b5d05ebd4277b Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 28 Apr 2026 14:43:43 -0700 Subject: [PATCH 5/7] =?UTF-8?q?feat(ci):=20auto-sync=20main=20=E2=86=92=20?= =?UTF-8?q?staging=20to=20keep=20staging-as-superset=20invariant?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Background `auto-promote-staging.yml` advances main via `git merge --ff-only` + `git push origin main` — clean fast-forward, no merge commit. But manual `staging → main` merges via the GitHub UI / API create a merge commit on main that staging doesn't have. The next `staging → main` PR then evaluates as "BEHIND" because staging is missing that merge commit, requiring a manual `gh pr update-branch` round-trip. This pattern bit twice on 2026-04-28 (PRs #2202 and #2205, both manual bridges to land pipeline fixes themselves). 
Each needed update-branch + re-CI before they could merge. Annoying and avoidable. What this workflow does Triggered on every push to main (regardless of source: auto-promote, UI merge, API merge, direct push): 1. Check whether main is already in staging's ancestry. If yes, no-op — auto-promote-staging keeps them aligned via ff push, and the no-op case is the steady state. 2. If not (manual merge commit on main, or direct main hotfix): try `git merge --ff-only origin/main` first. Works when staging hasn't diverged with its own commits. 3. If ff fails (staging has its own in-flight feature work): `git merge --no-ff origin/main -m "chore: sync main → staging"`. Absorbs main's tip while keeping staging's own history. 4. Push staging. Loop safety Pushing the synced staging triggers auto-promote-staging.yml, which checks gates on staging's new tip and, if green, ff-pushes staging to main. Since staging now ⊇ main, the resulting push to main is either a no-op (no ref change → no push event fires → auto-sync doesn't re-trigger) or advances main further. In the latter case auto-sync fires once more, sees main already in staging's ancestry, no-ops. Bounded. Conflict handling If the merge step hits conflicts (staging and main diverged with incompatible changes), the workflow fails with a clear summary pointing to manual resolution. This shouldn't happen in practice — staging is the integration branch; conflicts indicate a direct main hotfix touching the same code as in-flight staging work. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workflows/auto-sync-main-to-staging.yml | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 .github/workflows/auto-sync-main-to-staging.yml diff --git a/.github/workflows/auto-sync-main-to-staging.yml b/.github/workflows/auto-sync-main-to-staging.yml new file mode 100644 index 00000000..83156254 --- /dev/null +++ b/.github/workflows/auto-sync-main-to-staging.yml @@ -0,0 +1,136 @@ +name: Auto-sync main → staging + +# Reflects every push to `main` back onto `staging` so the +# staging-as-superset-of-main invariant holds. +# +# Background: +# +# `auto-promote-staging.yml` advances main via `git merge --ff-only` +# + `git push origin main` — that's a clean fast-forward, no merge +# commit. But manual merges of `staging → main` PRs through the +# GitHub UI / API create a merge commit on main that staging +# doesn't have. The next `staging → main` PR then evaluates as +# "BEHIND" because staging is missing that merge commit, requiring +# a manual `gh pr update-branch` round-trip. +# +# This happened twice on 2026-04-28 (PRs #2202, #2205, both manual +# bridges). Each time the bridge needed update-branch + a re-CI +# round before merging. Operationally annoying and avoidable. +# +# This workflow closes the gap automatically: +# +# 1. Push to main fires (regardless of source: auto-promote, UI +# merge, API merge, direct push). +# 2. Check whether main is already in staging's ancestry — if +# yes, no-op (auto-promote-staging already kept them in sync +# via fast-forward). +# 3. If not, try fast-forward staging to main first (works when +# staging hasn't diverged with its own commits). +# 4. If ff fails (staging has commits main doesn't — feature work +# in flight), do a real merge with a "chore: sync" commit so +# staging absorbs main's tip while keeping its own history. +# 5. Push staging. 
+# +# Loop safety: +# +# Pushing the synced staging triggers `auto-promote-staging.yml`, +# which checks gates on staging's new tip and, if green, ff-pushes +# staging to main. Since staging now == main (ff case) or ⊇ main +# (merge case where promote then advances), the resulting push to +# main is either a no-op (no actual ref change → no push event) or +# advances main further. In the latter case auto-sync fires again, +# sees main already in staging's ancestry, no-ops. No infinite loop. + +on: + push: + branches: [main] + +permissions: + contents: write + +jobs: + sync-staging: + runs-on: ubuntu-latest + steps: + - name: Checkout staging + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: staging + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Configure git author + run: | + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + - name: Check if staging already contains main + id: check + run: | + set -euo pipefail + git fetch origin main + if git merge-base --is-ancestor origin/main HEAD; then + echo "needs_sync=false" >> "$GITHUB_OUTPUT" + { + echo "## ✅ No-op" + echo + echo "staging already contains \`origin/main\` ($(git rev-parse --short=8 origin/main))." + echo "auto-promote-staging or a previous auto-sync run already kept them aligned." 
+ } >> "$GITHUB_STEP_SUMMARY" + else + echo "needs_sync=true" >> "$GITHUB_OUTPUT" + echo "::notice::staging is missing main's tip — sync needed" + fi + + - name: Fast-forward staging → main + if: steps.check.outputs.needs_sync == 'true' + id: ff + run: | + set -euo pipefail + if git merge --ff-only origin/main; then + echo "did_ff=true" >> "$GITHUB_OUTPUT" + echo "::notice::Fast-forwarded staging to origin/main" + else + echo "did_ff=false" >> "$GITHUB_OUTPUT" + echo "::notice::ff failed — staging has its own commits; will create merge" + fi + + - name: Merge main into staging (when ff fails) + if: | + steps.check.outputs.needs_sync == 'true' && + steps.ff.outputs.did_ff != 'true' + run: | + set -euo pipefail + # ff failed because staging has commits main doesn't — typical + # in-flight feature work. Create a merge commit so staging + # absorbs main's tip while keeping its own history. + if ! git merge --no-ff origin/main -m "chore: sync main → staging (auto)"; then + { + echo "## ❌ Conflict" + echo + echo "Auto-merge \`main → staging\` failed with conflicts." + echo "A human needs to resolve manually:" + echo + echo " git checkout staging" + echo " git merge origin/main" + echo " # resolve, commit, push" + } >> "$GITHUB_STEP_SUMMARY" + exit 1 + fi + + - name: Push staging + if: steps.check.outputs.needs_sync == 'true' + run: | + set -euo pipefail + git push origin staging + { + if [ "${{ steps.ff.outputs.did_ff }}" = "true" ]; then + echo "## ✅ staging fast-forwarded" + echo + echo "staging is now at \`$(git rev-parse --short=8 HEAD)\` (== origin/main)." + else + echo "## ✅ staging absorbed main" + echo + echo "staging is now at \`$(git rev-parse --short=8 HEAD)\` with a merge commit absorbing main's tip." 
+ fi + } >> "$GITHUB_STEP_SUMMARY" From 97d5883e76ece0c99886030168d3b1fb22e203a3 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 28 Apr 2026 14:59:23 -0700 Subject: [PATCH 6/7] fix(ci): auto-sync concurrency + cleanup follow-ups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three small fixes from the self-review of #2209: 1. **Required: concurrency group.** Two pushes to main in quick succession (manual UI merge then auto-promote-staging's ff-push, or any back-to-back main pushes) would race two auto-sync runs against the same staging branch — second `git push origin staging` fails non-fast-forward, surfacing as a red CI alert for what should be a no-op. Add `concurrency: { group: auto-sync-main-to-staging, cancel-in-progress: false }` so the second run waits for the first and sees its result. 2. **Hygiene: `git merge --abort` on conflict.** The conflict-error path exits 1 with the work tree in a half-merged state. Doesn't affect future runs (each gets a fresh checkout) but is an unpleasant artifact for anyone who shells into the runner. Abort first, then exit. 3. **Doc accuracy: "Loop safety" comment.** The original said the chain terminates because "main is either a no-op or advances further." That's true but understates the actual safety: GitHub Actions explicitly does NOT trigger downstream workflow runs from `GITHUB_TOKEN`-authored pushes. So the loop is impossible by construction, not just by happy coincidence of ref state. Updated the comment to reflect the actual mechanism. Plus a step-name nit: "Fast-forward staging → main" reads as if main is the target. Renamed to "Fast-forward staging to main" for consistency with the workflow's name (main → staging). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workflows/auto-sync-main-to-staging.yml | 35 +++++++++++++------ 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/.github/workflows/auto-sync-main-to-staging.yml b/.github/workflows/auto-sync-main-to-staging.yml index 83156254..278c3428 100644 --- a/.github/workflows/auto-sync-main-to-staging.yml +++ b/.github/workflows/auto-sync-main-to-staging.yml @@ -33,13 +33,19 @@ name: Auto-sync main → staging # # Loop safety: # -# Pushing the synced staging triggers `auto-promote-staging.yml`, -# which checks gates on staging's new tip and, if green, ff-pushes -# staging to main. Since staging now == main (ff case) or ⊇ main -# (merge case where promote then advances), the resulting push to -# main is either a no-op (no actual ref change → no push event) or -# advances main further. In the latter case auto-sync fires again, -# sees main already in staging's ancestry, no-ops. No infinite loop. +# `GITHUB_TOKEN`-authored pushes do NOT trigger downstream workflow +# runs by default (GitHub Actions safety). So when this workflow +# pushes the synced staging, `auto-promote-staging.yml` is NOT +# triggered by that push. The next developer push to staging triggers +# auto-promote normally. No loop is even theoretically possible. +# +# Concurrency: +# +# Two pushes to main in quick succession (e.g., manual UI merge +# immediately followed by auto-promote-staging's ff-merge) would +# otherwise race two auto-sync runs against the same staging branch +# — second push fails non-fast-forward. The concurrency group +# serializes them so the second run sees the first's result. 
on: push: @@ -48,6 +54,10 @@ on: permissions: contents: write +concurrency: + group: auto-sync-main-to-staging + cancel-in-progress: false + jobs: sync-staging: runs-on: ubuntu-latest @@ -82,7 +92,7 @@ jobs: echo "::notice::staging is missing main's tip — sync needed" fi - - name: Fast-forward staging → main + - name: Fast-forward staging to main if: steps.check.outputs.needs_sync == 'true' id: ff run: | @@ -96,15 +106,18 @@ jobs: fi - name: Merge main into staging (when ff fails) - if: | - steps.check.outputs.needs_sync == 'true' && - steps.ff.outputs.did_ff != 'true' + if: steps.check.outputs.needs_sync == 'true' && steps.ff.outputs.did_ff != 'true' run: | set -euo pipefail # ff failed because staging has commits main doesn't — typical # in-flight feature work. Create a merge commit so staging # absorbs main's tip while keeping its own history. if ! git merge --no-ff origin/main -m "chore: sync main → staging (auto)"; then + # Hygiene: leave the work tree clean before failing. Doesn't + # affect future runs (each gets a fresh checkout) but a + # half-merged tree is an unpleasant artifact to debug if + # anyone ever shells into the runner. + git merge --abort || true { echo "## ❌ Conflict" echo From 6638d6e1d7984f5ab057e9b3a423b2592ac807e7 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 28 Apr 2026 15:29:09 -0700 Subject: [PATCH 7/7] feat(ci): SECRET_PATTERNS drift lint across known consumers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a lint that diffs the canonical SECRET_PATTERNS array in .github/workflows/secret-scan.yml against every known public consumer mirror, failing on any divergence. Why: every side that scans for credentials carries its own copy of the pattern list. 
They drift — most recently the workspace-runtime pre-commit hook lagged the canonical by one pattern (sk-cp- / MiniMax F1088 vector), so a developer's local pre-commit would let a sk-cp- token through while the org-wide CI scan would refuse it. Useless friction; automated detection closes the gap. Implementation: .github/scripts/lint_secret_pattern_drift.py — pure stdlib, fetches each consumer's RAW file via urllib, extracts the SECRET_PATTERNS=( ... ) array via anchored regex (the closing `)` is anchored to the start of a line because pattern comments like `# GitHub PAT (classic)` contain their own paren mid-line), diffs against canonical, fails on missing or extra patterns. Fetch failures are warnings, not errors — a consumer whose branch was renamed shouldn't fail the lint until someone updates the URL list. .github/workflows/secret-pattern-drift.yml — daily 05:00 UTC cron + on-push gate (when canonical, the workflow, or the script changes) + workflow_dispatch. Read-only token, 5-minute timeout. Initial consumer set: workspace-runtime's bundled pre-commit hook (the one that drifted on sk-cp-). molecule-controlplane's inlined copy is private, so this workflow can't read it; that's tracked separately, and until the controlplane self-monitors with its own copy of this lint, that mirror remains an unchecked gap. Verified locally: lint detects drift correctly when the runtime hook is missing sk-cp-, returns clean when aligned. Refs: task #139. 
--- .github/scripts/lint_secret_pattern_drift.py | 134 +++++++++++++++++++ .github/workflows/secret-pattern-drift.yml | 57 ++++++++ 2 files changed, 191 insertions(+) create mode 100644 .github/scripts/lint_secret_pattern_drift.py create mode 100644 .github/workflows/secret-pattern-drift.yml diff --git a/.github/scripts/lint_secret_pattern_drift.py b/.github/scripts/lint_secret_pattern_drift.py new file mode 100644 index 00000000..6c1b7965 --- /dev/null +++ b/.github/scripts/lint_secret_pattern_drift.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +"""Lint SECRET_PATTERNS drift across known consumers of molecule-core's canonical. + +The canonical SECRET_PATTERNS array in +.github/workflows/secret-scan.yml is mirrored by every other side +that scans for credentials: the workspace-runtime's bundled +pre-commit hook, the molecule-controlplane inlined copy, etc. The +mirror is enforced socially today — when someone adds a new pattern +to canonical (e.g. the sk-cp- MiniMax token after F1088), the other +sides are supposed to be updated in lockstep. + +This script automates the check. Diffs the canonical's pattern set +against each known public consumer and exits non-zero on any +mismatch. Wired into a daily cron + on-push gate via +.github/workflows/secret-pattern-drift.yml. + +Private-repo consumers (currently molecule-controlplane's inlined +copy) are out of scope here because the molecule-core workflow's +GITHUB_TOKEN can't read other private repos in the org. They're +expected to self-monitor via their own copy of this script — not a +hard barrier, just a future expansion. +""" + +from __future__ import annotations + +import re +import sys +import urllib.request +from pathlib import Path + +CANONICAL_FILE = Path(".github/workflows/secret-scan.yml") + +# Public consumer mirrors. Each entry is (label, raw_url) — raw_url +# points at the file's RAW content on the consumer's default branch +# (or staging where applicable). 
Add an entry here when a new public +# repo starts shipping its own SECRET_PATTERNS array. +CONSUMERS: list[tuple[str, str]] = [ + ( + "molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh", + "https://raw.githubusercontent.com/Molecule-AI/molecule-ai-workspace-runtime/main/molecule_runtime/scripts/pre-commit-checks.sh", + ), +] + +# Matches the SECRET_PATTERNS=( ... ) array in either yaml-indented +# (the canonical workflow's `run:` block) or shell-flat (runtime +# hook) format. Patterns inside are single-quoted Bash strings; we +# pull each via _PATTERN_RE. +# +# Closing `)` is anchored to the start of a line (possibly indented) +# because pattern comments like `# GitHub PAT (classic)` contain +# their own `)` mid-line — a non-anchored regex would match through +# the comment's paren and capture only the first pattern. +_ARRAY_RE = re.compile(r"SECRET_PATTERNS=\((.*?)^\s*\)", re.DOTALL | re.MULTILINE) +_PATTERN_RE = re.compile(r"'([^']+)'") + + +def extract_patterns(content: str, source_label: str) -> list[str]: + """Pull the SECRET_PATTERNS list out of either format. Raises if missing.""" + m = _ARRAY_RE.search(content) + if not m: + raise SystemExit(f"::error::{source_label}: SECRET_PATTERNS=(...) 
array not found") + return _PATTERN_RE.findall(m.group(1)) + + +def fetch(url: str) -> str: + req = urllib.request.Request( + url, headers={"User-Agent": "secret-pattern-drift-lint/1"} + ) + with urllib.request.urlopen(req, timeout=30) as resp: + return resp.read().decode("utf-8") + + +def diff_patterns(canonical: list[str], consumer: list[str]) -> tuple[list[str], list[str]]: + """Return (missing_from_consumer, extra_in_consumer) — both sorted.""" + canonical_set = set(canonical) + consumer_set = set(consumer) + return ( + sorted(canonical_set - consumer_set), + sorted(consumer_set - canonical_set), + ) + + +def main() -> int: + if not CANONICAL_FILE.exists(): + print(f"::error::canonical not found at {CANONICAL_FILE}") + return 1 + + canonical = extract_patterns(CANONICAL_FILE.read_text(), str(CANONICAL_FILE)) + print(f"canonical ({CANONICAL_FILE}): {len(canonical)} patterns") + + drift = False + for label, url in CONSUMERS: + try: + content = fetch(url) + except Exception as e: + # Fetch failures are warnings, not errors. A consumer + # whose default branch was just renamed (or whose file + # moved) shouldn't fail the lint until someone updates + # the URL above. Real drift is the failure mode this + # gate exists to catch — fetch reliability isn't. + print(f"::warning::{label}: fetch failed ({e}) — skipping") + continue + + consumer = extract_patterns(content, label) + missing, extra = diff_patterns(canonical, consumer) + if not missing and not extra: + print(f" ✓ {label}: aligned ({len(consumer)} patterns)") + continue + + drift = True + print(f"::error::DRIFT in {label}:") + for p in missing: + print(f" - missing from consumer: {p!r}") + for p in extra: + print(f" - extra in consumer (not in canonical): {p!r}") + + if drift: + print() + print("::error::SECRET_PATTERNS drift detected. 
Bring consumer(s) into") + print("alignment with the canonical SECRET_PATTERNS array in") + print(f"{CANONICAL_FILE} by adding the missing patterns and removing") + print("any extras. The two sides must stay byte-aligned on the pattern") + print("list — the runtime hook is the developer's local pre-commit,") + print("the canonical is the org-wide CI gate, divergence means a token") + print("can pass one but get rejected by the other.") + return 1 + + print() + print("✓ All known consumers aligned with canonical SECRET_PATTERNS.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/workflows/secret-pattern-drift.yml b/.github/workflows/secret-pattern-drift.yml new file mode 100644 index 00000000..554bab35 --- /dev/null +++ b/.github/workflows/secret-pattern-drift.yml @@ -0,0 +1,57 @@ +name: SECRET_PATTERNS drift lint + +# Detects when the canonical SECRET_PATTERNS array in +# .github/workflows/secret-scan.yml diverges from known consumer +# mirrors (workspace-runtime's bundled pre-commit hook today; more +# can be added as the consumer set grows). +# +# Why this exists: every side that scans for credentials has its own +# copy of the pattern list. They drift — most recently the runtime +# hook lagged the canonical by one pattern (sk-cp- / MiniMax F1088), +# so a developer's local pre-commit would let a sk-cp- token through +# while the org-wide CI scan would refuse it. The cost of that drift +# is dev confusion + delayed feedback; the fix is automated detection. +# +# Triggers: +# - schedule: daily 05:00 UTC. Catches drift introduced by edits +# to a consumer copy that didn't update canonical here. +# - push to main/staging where the canonical or this lint changed: +# catches the inverse — canonical updated but consumers not yet +# bumped. The lint will fail the push; that's intentional, the +# person editing canonical is the right person to also update +# the consumer. +# - workflow_dispatch: ad-hoc operator runs. 
+ +on: + schedule: + # 05:00 UTC = 22:00 PT / 01:00 ET. Quiet hours so a failure + # email lands when humans are starting their day, not + # interrupting it. + - cron: "0 5 * * *" + push: + branches: [main, staging] + paths: + - ".github/workflows/secret-scan.yml" + - ".github/workflows/secret-pattern-drift.yml" + - ".github/scripts/lint_secret_pattern_drift.py" + workflow_dispatch: + +# GITHUB_TOKEN scoped to read-only. The lint only does git checkout +# + HTTPS GETs to public consumer files; no writes to anything. +permissions: + contents: read + +jobs: + lint: + name: Detect SECRET_PATTERNS drift + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Run drift lint + run: python3 .github/scripts/lint_secret_pattern_drift.py