Merge pull request #2211 from Molecule-AI/staging

staging to main
This commit is contained in:
Hongming Wang 2026-04-28 15:52:20 -07:00 committed by GitHub
commit a3864eaf3d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 616 additions and 19 deletions

View File

@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""Lint SECRET_PATTERNS drift across known consumers of molecule-core's canonical.
The canonical SECRET_PATTERNS array in
.github/workflows/secret-scan.yml is mirrored by every other side
that scans for credentials: the workspace-runtime's bundled
pre-commit hook, the molecule-controlplane inlined copy, etc. The
mirror is enforced socially today: when someone adds a new pattern
to canonical (e.g. the sk-cp- MiniMax token after F1088), the other
sides are supposed to be updated in lockstep.
This script automates the check: it diffs the canonical's pattern
set against each known public consumer and exits non-zero on any
mismatch. Wired into a daily cron + on-push gate via
.github/workflows/secret-pattern-drift.yml.
Private-repo consumers (currently molecule-controlplane's inlined
copy) are out of scope here because the molecule-core workflow's
GITHUB_TOKEN can't read other private repos in the org. They're
expected to self-monitor via their own copy of this script — not a
hard barrier, just a future expansion.
"""
from __future__ import annotations
import re
import sys
import urllib.request
from pathlib import Path
# Repo-relative path to the canonical workflow whose SECRET_PATTERNS
# array every consumer mirrors.
CANONICAL_FILE = Path(".github/workflows/secret-scan.yml")

# Public consumer mirrors. Each entry is (label, raw_url) — raw_url
# points at the file's RAW content on the consumer's default branch
# (or staging where applicable). Add an entry here when a new public
# repo starts shipping its own SECRET_PATTERNS array.
CONSUMERS: list[tuple[str, str]] = [
    (
        "molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh",
        "https://raw.githubusercontent.com/Molecule-AI/molecule-ai-workspace-runtime/main/molecule_runtime/scripts/pre-commit-checks.sh",
    ),
]
# Matches the SECRET_PATTERNS=( ... ) array in either yaml-indented
# (the canonical workflow's `run:` block) or shell-flat (runtime hook)
# format. The closing `)` is anchored to the start of a (possibly
# indented) line: pattern comments such as `# GitHub PAT (classic)`
# carry a `)` mid-line, and an unanchored match would stop at the
# comment's paren and capture only the first pattern.
_ARRAY_RE = re.compile(r"SECRET_PATTERNS=\((.*?)^\s*\)", re.DOTALL | re.MULTILINE)
# Each pattern inside the array body is a single-quoted Bash string.
_PATTERN_RE = re.compile(r"'([^']+)'")


def extract_patterns(content: str, source_label: str) -> list[str]:
    """Return the SECRET_PATTERNS entries found in *content*.

    Handles both the yaml-indented and shell-flat layouts described
    above. Exits via SystemExit with a `::error::` annotation naming
    *source_label* when no array is present.
    """
    match = _ARRAY_RE.search(content)
    if match is None:
        raise SystemExit(f"::error::{source_label}: SECRET_PATTERNS=(...) array not found")
    return _PATTERN_RE.findall(match.group(1))
def fetch(url: str) -> str:
    """GET *url* and return its body decoded as UTF-8.

    Sends an explicit User-Agent identifying this lint and bounds the
    request at 30 seconds so a hung fetch can't stall the whole run.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "secret-pattern-drift-lint/1"},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        body = response.read()
    return body.decode("utf-8")
def diff_patterns(canonical: list[str], consumer: list[str]) -> tuple[list[str], list[str]]:
    """Compare the two pattern lists as sets.

    Returns ``(missing_from_consumer, extra_in_consumer)``, each half
    sorted so output is deterministic. Duplicates and ordering inside
    either input are deliberately ignored — only membership drift
    matters for the lint.
    """
    want = set(canonical)
    have = set(consumer)
    missing = sorted(want - have)
    extra = sorted(have - want)
    return missing, extra
def main() -> int:
    """Run the drift lint. Returns the process exit code (0 = aligned)."""
    if not CANONICAL_FILE.exists():
        print(f"::error::canonical not found at {CANONICAL_FILE}")
        return 1

    canonical_patterns = extract_patterns(CANONICAL_FILE.read_text(), str(CANONICAL_FILE))
    print(f"canonical ({CANONICAL_FILE}): {len(canonical_patterns)} patterns")

    any_drift = False
    for label, url in CONSUMERS:
        try:
            body = fetch(url)
        except Exception as exc:
            # Fetch failures are warnings, not errors. A consumer whose
            # default branch was just renamed (or whose file moved)
            # shouldn't fail the lint until someone updates the URL
            # table above. Real drift is the failure mode this gate
            # exists to catch — fetch reliability isn't.
            print(f"::warning::{label}: fetch failed ({exc}) — skipping")
            continue

        consumer_patterns = extract_patterns(body, label)
        missing, extra = diff_patterns(canonical_patterns, consumer_patterns)
        if not missing and not extra:
            print(f"{label}: aligned ({len(consumer_patterns)} patterns)")
            continue

        any_drift = True
        print(f"::error::DRIFT in {label}:")
        for pattern in missing:
            print(f" - missing from consumer: {pattern!r}")
        for pattern in extra:
            print(f" - extra in consumer (not in canonical): {pattern!r}")

    if any_drift:
        print()
        print("::error::SECRET_PATTERNS drift detected. Bring consumer(s) into")
        print("alignment with the canonical SECRET_PATTERNS array in")
        print(f"{CANONICAL_FILE} by adding the missing patterns and removing")
        print("any extras. The two sides must stay byte-aligned on the pattern")
        print("list — the runtime hook is the developer's local pre-commit,")
        print("the canonical is the org-wide CI gate, divergence means a token")
        print("can pass one but get rejected by the other.")
        return 1

    print()
    print("✓ All known consumers aligned with canonical SECRET_PATTERNS.")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@ -0,0 +1,114 @@
name: Auto-promote :latest on E2E green

# Retags `ghcr.io/molecule-ai/{platform,platform-tenant}:staging-<sha>`
# → `:latest` whenever E2E Staging SaaS passes for a `main` push.
#
# This is the doc-aligned alternative to the (deferred) Phase 2 canary
# fleet — staging E2E catches ~90% of what canary would catch at 0%
# ongoing infra cost. See `molecule-controlplane/docs/canary-tenants.md`
# section "Do we actually need canary right now?" — recommended
# sequencing for the current scale (≤20 paying tenants).
#
# Why a separate workflow rather than folding into e2e-staging-saas.yml:
#   - Keeps test concerns separate from release concerns.
#   - Disabling promote (e.g. during an incident) is one toggle, not an
#     edit to the long E2E workflow file.
#   - When Phase 2 canary work eventually lands, the canary path can
#     replace this file's trigger without touching the E2E workflow.
#
# Why trigger on `main` only:
#   - `:latest` is what prod tenants pull. We only want SHAs that have
#     reached `main` (via auto-promote-staging) to advance `:latest`.
#   - Triggering on staging would let a staging-only revert advance
#     `:latest` to a SHA that never reaches `main`, breaking the
#     "production runs what's on `main`" invariant.

on:
  workflow_run:
    workflows: ['E2E Staging SaaS (full lifecycle)']
    types: [completed]
    branches: [main]
  workflow_dispatch:
    inputs:
      sha:
        # FIX: this was `required: false` with a description claiming
        # the value "defaults to upstream workflow_run head_sha" — but
        # a manual dispatch carries no workflow_run payload, so the
        # fallback resolved to "" and the job went looking for the
        # bogus tag ":staging-". Manual dispatch must name its SHA.
        description: 'Short sha to promote (required for manual dispatch)'
        required: true
        type: string

permissions:
  contents: read
  packages: write

env:
  IMAGE_NAME: ghcr.io/molecule-ai/platform
  TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant

jobs:
  promote:
    # Skip if E2E failed — `:latest` stays on the prior known-good
    # digest. Manual dispatch always proceeds (the operator already
    # decided to promote).
    if: |
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
    runs-on: ubuntu-latest
    steps:
      - name: Compute short sha
        id: sha
        run: |
          set -euo pipefail
          if [ -n "${{ github.event.inputs.sha }}" ]; then
            FULL="${{ github.event.inputs.sha }}"
          else
            FULL="${{ github.event.workflow_run.head_sha }}"
          fi
          # Defense-in-depth for the empty-SHA case (e.g. an API
          # dispatch that sidestepped input validation): fail fast
          # with an actionable message rather than letting later
          # steps operate on the malformed tag ":staging-".
          if [ -z "${FULL}" ]; then
            echo "::error::No SHA to promote — pass the 'sha' input when dispatching manually."
            exit 1
          fi
          echo "short=${FULL:0:7}" >> "$GITHUB_OUTPUT"
          echo "full=${FULL}" >> "$GITHUB_OUTPUT"
      - uses: imjasonh/setup-crane@v0.4
      - name: GHCR login
        run: |
          echo "${{ secrets.GITHUB_TOKEN }}" | \
            crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin
      - name: Verify :staging-<sha> exists for both images
        # Better to fail fast with a clear message than to half-tag
        # (platform retagged but platform-tenant missing → tenants pull
        # a stale image).
        run: |
          set -euo pipefail
          for img in "${IMAGE_NAME}" "${TENANT_IMAGE_NAME}"; do
            tag="${img}:staging-${{ steps.sha.outputs.short }}"
            if ! crane manifest "$tag" >/dev/null 2>&1; then
              echo "::error::Missing tag: $tag"
              echo "::error::publish-workspace-server-image must complete on this SHA before auto-promote-on-e2e can retag :latest."
              exit 1
            fi
            echo " ok: $tag exists"
          done
      - name: Retag platform :staging-<sha> → :latest
        run: |
          crane tag "${IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
      - name: Retag tenant :staging-<sha> → :latest
        run: |
          crane tag "${TENANT_IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
      - name: Summary
        run: |
          {
            echo "## E2E green → :latest promoted"
            echo
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
              echo "- Trigger: manual dispatch"
            else
              echo "- Upstream E2E run: ${{ github.event.workflow_run.html_url }}"
            fi
            echo "- platform:staging-${{ steps.sha.outputs.short }} → :latest"
            echo "- platform-tenant:staging-${{ steps.sha.outputs.short }} → :latest"
            echo
            echo "Tenant fleet auto-pulls within 5 min via IMAGE_AUTO_REFRESH=true."
            echo "Force immediate fanout: dispatch redeploy-tenants-on-main.yml."
          } >> "$GITHUB_STEP_SUMMARY"

View File

@ -0,0 +1,149 @@
name: Auto-sync main → staging

# Reflects every push to `main` back onto `staging` so the
# staging-as-superset-of-main invariant holds.
#
# Background:
#
# `auto-promote-staging.yml` advances main via `git merge --ff-only`
# + `git push origin main` — that's a clean fast-forward, no merge
# commit. But manual merges of `staging → main` PRs through the
# GitHub UI / API create a merge commit on main that staging
# doesn't have. The next `staging → main` PR then evaluates as
# "BEHIND" because staging is missing that merge commit, requiring
# a manual `gh pr update-branch` round-trip.
#
# This happened twice on 2026-04-28 (PRs #2202, #2205, both manual
# bridges). Each time the bridge needed update-branch + a re-CI
# round before merging. Operationally annoying and avoidable.
#
# This workflow closes the gap automatically:
#
#   1. Push to main fires (regardless of source: auto-promote, UI
#      merge, API merge, direct push).
#   2. Check whether main is already in staging's ancestry — if
#      yes, no-op (auto-promote-staging already kept them in sync
#      via fast-forward).
#   3. If not, try fast-forward staging to main first (works when
#      staging hasn't diverged with its own commits).
#   4. If ff fails (staging has commits main doesn't — feature work
#      in flight), do a real merge with a "chore: sync" commit so
#      staging absorbs main's tip while keeping its own history.
#   5. Push staging.
#
# Loop safety:
#
# `GITHUB_TOKEN`-authored pushes do NOT trigger downstream workflow
# runs by default (GitHub Actions safety). So when this workflow
# pushes the synced staging, `auto-promote-staging.yml` is NOT
# triggered by that push. The next developer push to staging triggers
# auto-promote normally. No loop is even theoretically possible.
#
# Concurrency:
#
# Two pushes to main in quick succession (e.g., manual UI merge
# immediately followed by auto-promote-staging's ff-merge) would
# otherwise race two auto-sync runs against the same staging branch
# — second push fails non-fast-forward. The concurrency group
# serializes them so the second run sees the first's result.

on:
  push:
    branches: [main]

permissions:
  contents: write

concurrency:
  group: auto-sync-main-to-staging
  cancel-in-progress: false

jobs:
  sync-staging:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout staging
        uses: actions/checkout@v4
        with:
          # Full history: the ancestry check and merge below need more
          # than a shallow clone.
          fetch-depth: 0
          ref: staging
          token: ${{ secrets.GITHUB_TOKEN }}
      - name: Configure git author
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
      - name: Check if staging already contains main
        id: check
        run: |
          set -euo pipefail
          git fetch origin main
          # HEAD is staging (see checkout ref above). If origin/main is
          # already an ancestor, there is nothing to sync.
          if git merge-base --is-ancestor origin/main HEAD; then
            echo "needs_sync=false" >> "$GITHUB_OUTPUT"
            {
              echo "## ✅ No-op"
              echo
              echo "staging already contains \`origin/main\` ($(git rev-parse --short=8 origin/main))."
              echo "auto-promote-staging or a previous auto-sync run already kept them aligned."
            } >> "$GITHUB_STEP_SUMMARY"
          else
            echo "needs_sync=true" >> "$GITHUB_OUTPUT"
            echo "::notice::staging is missing main's tip — sync needed"
          fi
      - name: Fast-forward staging to main
        if: steps.check.outputs.needs_sync == 'true'
        id: ff
        run: |
          set -euo pipefail
          if git merge --ff-only origin/main; then
            echo "did_ff=true" >> "$GITHUB_OUTPUT"
            echo "::notice::Fast-forwarded staging to origin/main"
          else
            echo "did_ff=false" >> "$GITHUB_OUTPUT"
            echo "::notice::ff failed — staging has its own commits; will create merge"
          fi
      - name: Merge main into staging (when ff fails)
        if: steps.check.outputs.needs_sync == 'true' && steps.ff.outputs.did_ff != 'true'
        run: |
          set -euo pipefail
          # ff failed because staging has commits main doesn't — typical
          # in-flight feature work. Create a merge commit so staging
          # absorbs main's tip while keeping its own history.
          if ! git merge --no-ff origin/main -m "chore: sync main → staging (auto)"; then
            # Hygiene: leave the work tree clean before failing. Doesn't
            # affect future runs (each gets a fresh checkout) but a
            # half-merged tree is an unpleasant artifact to debug if
            # anyone ever shells into the runner.
            git merge --abort || true
            {
              echo "## ❌ Conflict"
              echo
              echo "Auto-merge \`main → staging\` failed with conflicts."
              echo "A human needs to resolve manually:"
              echo
              echo " git checkout staging"
              echo " git merge origin/main"
              echo " # resolve, commit, push"
            } >> "$GITHUB_STEP_SUMMARY"
            exit 1
          fi
      - name: Push staging
        if: steps.check.outputs.needs_sync == 'true'
        run: |
          set -euo pipefail
          git push origin staging
          {
            if [ "${{ steps.ff.outputs.did_ff }}" = "true" ]; then
              echo "## ✅ staging fast-forwarded"
              echo
              echo "staging is now at \`$(git rev-parse --short=8 HEAD)\` (== origin/main)."
            else
              echo "## ✅ staging absorbed main"
              echo
              echo "staging is now at \`$(git rev-parse --short=8 HEAD)\` with a merge commit absorbing main's tip."
            fi
          } >> "$GITHUB_STEP_SUMMARY"

View File

@ -0,0 +1,57 @@
name: SECRET_PATTERNS drift lint

# Detects when the canonical SECRET_PATTERNS array in
# .github/workflows/secret-scan.yml diverges from known consumer
# mirrors (workspace-runtime's bundled pre-commit hook today; more
# can be added as the consumer set grows).
#
# Why this exists: every side that scans for credentials has its own
# copy of the pattern list. They drift — most recently the runtime
# hook lagged the canonical by one pattern (sk-cp- / MiniMax F1088),
# so a developer's local pre-commit would let a sk-cp- token through
# while the org-wide CI scan would refuse it. The cost of that drift
# is dev confusion + delayed feedback; the fix is automated detection.
#
# Triggers:
#   - schedule: daily 05:00 UTC. Catches drift introduced by edits
#     to a consumer copy that didn't update canonical here.
#   - push to main/staging where the canonical or this lint changed:
#     catches the inverse — canonical updated but consumers not yet
#     bumped. The lint will fail the push; that's intentional, the
#     person editing canonical is the right person to also update
#     the consumer.
#   - workflow_dispatch: ad-hoc operator runs.

on:
  schedule:
    # 05:00 UTC = 22:00 PT / 01:00 ET. Quiet hours so a failure
    # email lands when humans are starting their day, not
    # interrupting it.
    - cron: "0 5 * * *"
  push:
    branches: [main, staging]
    paths:
      # Canonical pattern array, this workflow, and the lint script —
      # the three files whose edits can introduce or fix drift.
      - ".github/workflows/secret-scan.yml"
      - ".github/workflows/secret-pattern-drift.yml"
      - ".github/scripts/lint_secret_pattern_drift.py"
  workflow_dispatch:

# GITHUB_TOKEN scoped to read-only. The lint only does git checkout
# + HTTPS GETs to public consumer files; no writes to anything.
permissions:
  contents: read

jobs:
  lint:
    name: Detect SECRET_PATTERNS drift
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Run drift lint
        run: python3 .github/scripts/lint_secret_pattern_drift.py

View File

@ -148,7 +148,13 @@ jobs:
SELF=".github/workflows/secret-scan.yml"
OFFENDING=""
for f in $CHANGED; do
# `while IFS= read -r` (not `for f in $CHANGED`) so filenames
# containing whitespace don't word-split silently — a path
# with a space would otherwise produce two iterations on
# tokens that aren't real filenames, breaking the
# self-exclude + diff lookup.
while IFS= read -r f; do
[ -z "$f" ] && continue
[ "$f" = "$SELF" ] && continue
if [ -n "$DIFF_RANGE" ]; then
ADDED=$(git diff --no-color --unified=0 "$BASE" "$HEAD" -- "$f" 2>/dev/null | grep -E '^\+[^+]' || true)
@ -164,11 +170,18 @@ jobs:
break
fi
done
done
done <<< "$CHANGED"
if [ -n "$OFFENDING" ]; then
echo "::error::Credential-shaped strings detected in diff additions:"
printf "$OFFENDING"
# `printf '%b' "$OFFENDING"` interprets backslash escapes
# (the literal `\n` we appended above becomes a newline)
# WITHOUT treating OFFENDING as a format string. Plain
# `printf "$OFFENDING"` is a format-string sink: a filename
# containing `%` would be interpreted as a conversion
# specifier, corrupting the error message (or printing
# `%(missing)` artifacts).
printf '%b' "$OFFENDING"
echo ""
echo "The actual matched values are NOT echoed here, deliberately —"
echo "round-tripping a leaked credential into CI logs widens the blast"

View File

@ -2,6 +2,7 @@
from __future__ import annotations
import json
from typing import Any
from a2a.server.agent_execution import RequestContext
@ -89,33 +90,46 @@ def append_peer_guidance(
def summarize_peer_cards(peers: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""Return compact peer metadata for prompt rendering."""
"""Return compact peer metadata for prompt rendering.
Falls back to the registry row's `name` and `role` when `agent_card` is
null or unparseable so peers stay visible to delegators even before
their A2A discovery roundtrip has populated a card. Without this
fallback a coordinator-tier workspace with N freshly-created worker
peers would render an empty `## Your Peers` section and refuse to
delegate (the regression behind the 2026-04-27 Design Director
discovery bug).
"""
summaries: list[dict[str, Any]] = []
for peer in peers:
agent_card = peer.get("agent_card")
if not agent_card:
continue
if isinstance(agent_card, str):
try:
import json
agent_card = json.loads(agent_card)
except Exception:
continue
agent_card = None
if not isinstance(agent_card, dict):
continue
agent_card = None
if agent_card:
skills_raw = agent_card.get("skills") or []
skills = [
s.get("name", s.get("id", ""))
for s in skills_raw
if isinstance(s, dict)
]
name = agent_card.get("name") or peer.get("name") or "Unknown"
else:
skills = []
name = peer.get("name") or "Unknown"
skills = agent_card.get("skills", [])
summaries.append(
{
"id": peer.get("id", "unknown"),
"name": agent_card.get("name", peer.get("name", "Unknown")),
"name": name,
"role": peer.get("role") or "",
"status": peer.get("status", "unknown"),
"skills": [
s.get("name", s.get("id", ""))
for s in skills
if isinstance(s, dict)
],
"skills": skills,
}
)
return summaries
@ -140,6 +154,8 @@ def build_peer_section(
parts.append(f"- **{peer['name']}** (id: `{peer['id']}`, status: {peer['status']})")
if peer["skills"]:
parts.append(f" Skills: {', '.join(peer['skills'])}")
elif peer.get("role"):
parts.append(f" Role: {peer['role']}")
parts.append("")
parts.append(instruction)
return "\n".join(parts)

View File

@ -203,8 +203,11 @@ def test_peer_capabilities_format(tmp_path):
assert "**Echo Agent** (id: `peer-1`, status: online)" in result
assert "Skills: echo, repeat" in result
assert "delegate_to_workspace" in result
# peer-2 has no agent_card so it's skipped
assert "Silent Agent" not in result
# peer-2 has no agent_card but DOES have a DB name + status — must
# still render so coordinators can delegate to freshly-created peers
# whose A2A discovery hasn't populated a card yet (regression of the
# 2026-04-27 Design Director discovery bug).
assert "**Silent Agent** (id: `peer-2`, status: offline)" in result
def test_peer_with_json_string_agent_card(tmp_path):

View File

@ -0,0 +1,111 @@
"""Pin peer-summary fallback when agent_card is missing.
Regresses the 2026-04-27 Design Director discovery bug:
`summarize_peer_cards()` previously skipped any peer whose `agent_card`
was null or unparseable, so a coordinator with freshly-created workers
saw an empty `## Your Peers` section in its system prompt and refused
to delegate. The registry endpoint already returns DB `name` + `role`
on every row regardless of agent_card state falling back to those
keeps peers visible while A2A discovery catches up.
"""
from __future__ import annotations
from shared_runtime import build_peer_section, summarize_peer_cards
def _peer(**overrides):
base = {
"id": "ws-1",
"name": "DB Name",
"role": "DB Role",
"status": "active",
"agent_card": None,
}
base.update(overrides)
return base
def test_summarize_includes_peer_with_null_agent_card_using_db_fields():
    """A null agent_card must not hide the peer — DB name/role fill in."""
    results = summarize_peer_cards([_peer()])
    assert len(results) == 1
    summary = results[0]
    assert summary["id"] == "ws-1"
    assert summary["name"] == "DB Name"
    assert summary["role"] == "DB Role"
    assert summary["status"] == "active"
    assert summary["skills"] == []
def test_summarize_prefers_agent_card_name_over_db_name():
    """A populated card wins for name/skills; role still comes from the row."""
    card = {"name": "Card Name", "skills": [{"name": "draft-spec"}]}
    summary = summarize_peer_cards([_peer(agent_card=card)])[0]
    assert summary["name"] == "Card Name"
    assert summary["skills"] == ["draft-spec"]
    assert summary["role"] == "DB Role"
def test_summarize_handles_string_agent_card_json():
    """An agent_card stored as a JSON string is parsed before use."""
    raw_card = '{"name": "JSON Name", "skills": []}'
    summary = summarize_peer_cards([_peer(agent_card=raw_card)])[0]
    assert summary["name"] == "JSON Name"
def test_summarize_falls_back_when_agent_card_string_is_malformed():
    """Unparseable card JSON degrades to DB fields instead of dropping the peer."""
    results = summarize_peer_cards([_peer(agent_card="not-valid-json")])
    assert len(results) == 1
    fallback = results[0]
    assert fallback["name"] == "DB Name"
    assert fallback["role"] == "DB Role"
    assert fallback["skills"] == []
def test_summarize_falls_back_when_agent_card_is_wrong_type():
    """A card of a non-dict, non-string type (here: int) also falls back."""
    results = summarize_peer_cards([_peer(agent_card=42)])
    assert len(results) == 1
    assert results[0]["name"] == "DB Name"
def test_summarize_handles_missing_role_and_name_with_unknown_default():
    """A row with neither card nor DB name renders as Unknown / empty role."""
    bare_row = {"id": "ws-2", "status": "active", "agent_card": None}
    summary = summarize_peer_cards([bare_row])[0]
    assert summary["name"] == "Unknown"
    assert summary["role"] == ""
def test_build_peer_section_renders_role_when_skills_empty():
    """Cardless peers render a Role line instead of a Skills line."""
    rendered = build_peer_section([_peer()])
    assert "## Your Peers" in rendered
    assert "**DB Name**" in rendered
    assert "Role: DB Role" in rendered
    assert "Skills:" not in rendered
def test_build_peer_section_prefers_skills_over_role_when_card_present():
    """When the card supplies skills, the Role line is suppressed."""
    card = {"name": "Worker", "skills": [{"name": "design"}, {"name": "review"}]}
    rendered = build_peer_section([_peer(agent_card=card)])
    assert "Skills: design, review" in rendered
    assert "Role: DB Role" not in rendered
def test_build_peer_section_mixed_peers():
    """Card-bearing and cardless peers render side by side."""
    cardless = _peer(id="ws-a")
    carded = _peer(
        id="ws-b",
        agent_card={"name": "Card B", "skills": [{"name": "build"}]},
    )
    rendered = build_peer_section([cardless, carded])
    assert "id: `ws-a`" in rendered
    assert "id: `ws-b`" in rendered
    assert "Role: DB Role" in rendered
    assert "Skills: build" in rendered
def test_build_peer_section_empty_when_no_peers():
    """No peers → empty string, not a bare header."""
    rendered = build_peer_section([])
    assert rendered == ""