ci(e2e-staging): promote E2E Staging Platform Boot to merge-blocking (fail-closed) — #48 #3116

Merged
core-devops merged 1 commit from harden/platform-boot-merge-blocking into main 2026-06-21 09:32:37 +00:00
3 changed files with 81 additions and 18 deletions
+7
View File
@@ -14,3 +14,10 @@ E2E Peer Visibility (literal MCP list_peers) / E2E Peer Visibility
Secret scan / Scan diff for credential-shaped strings
template-delivery-e2e / Template-asset delivery (fresh seo-agent — config+prompts via asset channel, seo-all via plugin reconcile)
E2E Staging SaaS (full lifecycle) / E2E Staging Concierge Creates Workspace
# #48 (RCA molecule-controlplane #878→#885): real platform-managed boot is now
# merge-blocking. SSOT updated here in the same PR that removed continue-on-error
# (lint-no-coe-on-required forbids CoE on any listed context). REMAINING OWNER
# ACTION: add "E2E Staging SaaS (full lifecycle) / E2E Staging Platform Boot
# (pull_request)" to branch_protections/main.status_check_contexts AFTER this PR
# merges (allowlist-superset-of-BP is lint-clean; BP-superset-of-allowlist is not).
E2E Staging SaaS (full lifecycle) / E2E Staging Platform Boot
+43 -17
View File
@@ -369,22 +369,32 @@ jobs:
# honest (BYOK requires a key; platform requires its ABSENCE not to matter) and
# gives the regression its own named commit-status for branch protection.
#
# Add `E2E Staging Platform Boot` to branch protection after 3 consecutive
# green runs on main (de-flake window; this path shares the cp#245
# boot-timeout flake surface the BYOK job has, so it must prove stable before
# it can BLOCK — see the gate-making plan in the PR body).
# bp-required: pending #2187
# GATING (no continue-on-error), FALSE-GREEN-PROOF via E2E_REQUIRE_LIVE
# (0 on pull_request → PR-mode self-check; 1 on push/dispatch/cron → real
# staging boot, HARD FAILs on missing infra). Promoted to merge-blocking
# per #48 — RCA: molecule-controlplane PR #878 rendered the tenant
# `docker run` env block with a blank line that broke shell `\`-continuation,
# orphaning the image arg → `docker run exit=127` → no tenant container →
# prod onboarding outage 06:04–08:09 UTC 2026-06-21 (fixed by CP #885). The
# real-boot e2e being advisory + PR-skipped is why that class escaped
# pre-merge. This job now exercises a real platform-managed boot on every
# push-to-main and is the merge gate.
#
# bp-required: now required — added to .gitea/required-contexts.txt by #48
#
# core#3081 / #48: NO `if:` guard on this job (mirrors
# e2e-staging-concierge-creates-workspace). The job IS a required status
# context (see .gitea/required-contexts.txt); a required context that never
# fires on pull_request degrades the merge gate to a silent indefinite
# pending (the exact failure mode lint-required-no-paths exists to prevent;
# see feedback_path_filtered_workflow_cant_be_required). The job runs on
# every PR with E2E_REQUIRE_LIVE=0 (the harness detects the missing-creds
# case and exit 0s after a bash -n self-check), and on push/dispatch/cron
# with E2E_REQUIRE_LIVE=1 (the real staging boot runs and HARD FAILs on
# missing infra).
e2e-staging-platform-boot:
name: E2E Staging Platform Boot
runs-on: ubuntu-latest
# core#3081: gate the slow platform-boot job to push/dispatch/cron now
# that the workflow's `paths:` filter has been removed (lint-required-no-paths
# compliance). Matches the pattern of the other slow jobs in this workflow.
if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
# Phase 3 (RFC #219 §1): surface without blocking until the de-flake window
# closes. mc#2654: do NOT renew this mask silently — the gate-making plan
# tracks the flip to false under #2187.
continue-on-error: true
timeout-minutes: 45
permissions:
contents: read
@@ -410,17 +420,33 @@ jobs:
E2E_MODE: smoke
E2E_RUN_ID: "platform-${{ github.run_id }}-${{ github.run_attempt }}"
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
# Fail-closed-on-skip (see BYOK job). smoke mode still runs steps 2/4/7/8b,
# so all four required milestones (provisioned/tenant_online/
# workspace_online/a2a_roundtrip) fire — the guard is valid for this lane too.
E2E_REQUIRE_LIVE: '1'
# Fail-closed-on-skip (see BYOK job + e2e-staging-concierge-creates-workspace).
# smoke mode still runs steps 2/4/7/8b, so all four required milestones
# (provisioned/tenant_online/workspace_online/a2a_roundtrip) fire — the guard
# is valid for this lane too.
# pull_request: 0 → PRs have no staging creds; the harness's PR-mode
# self-check (bash -n) is the gate, then exit 0.
# push / dispatch / schedule: 1 → the real staging boot runs and HARD
# FAILs (exit 5) on a run that proves no live lifecycle.
E2E_REQUIRE_LIVE: ${{ github.event_name == 'pull_request' && '0' || '1' }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify admin token present
env:
E2E_REQUIRE_LIVE: ${{ github.event_name == 'pull_request' && '0' || '1' }}
run: |
# PR-mode (#48): on pull_request the job runs with E2E_REQUIRE_LIVE=0
# and PRs carry no staging creds. Don't hard-fail here — the harness
# detects the missing-creds case, runs a bash -n self-check, and
# exit 0s. On push/dispatch/cron (E2E_REQUIRE_LIVE=1) the creds MUST
# be present and a missing token/AWS-cred is a hard error.
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
if [ "${E2E_REQUIRE_LIVE}" = "0" ]; then
echo "PR-mode: no MOLECULE_ADMIN_TOKEN (E2E_REQUIRE_LIVE=0) — harness will self-check and skip the live boot ✓"
exit 0
fi
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
exit 2
fi
+31 -1
View File
@@ -123,7 +123,11 @@
set -euo pipefail
CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN requiredRailway staging CP_ADMIN_API_TOKEN}"
# #48: tolerate an absent admin token here — the PR-mode early-exit below
# (E2E_REQUIRE_LIVE=0 + no token) handles the pull_request lane cleanly. On a
# real run (push/dispatch/cron, E2E_REQUIRE_LIVE=1) the missing-token case is
# caught as a HARD FAIL just past the PR-mode block, with a clear message.
ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:-}"
RUNTIME="${E2E_RUNTIME:-hermes}"
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-3600}"
@@ -216,6 +220,32 @@ require_live_or_die() {
fi
}
# ─── PR-mode early-exit (#48 — mirrors test_staging_concierge_creates_workspace_e2e.sh) ──
# This harness is invoked by TWO jobs in e2e-staging-saas.yml:
# - e2e-staging-saas (push/dispatch/cron only; always has creds + REQUIRE_LIVE=1)
# - e2e-staging-platform-boot (now ALSO pull_request; #48 made it merge-blocking)
# E2E_REQUIRE_LIVE=0 on pull_request runs because PRs do not have staging creds
# wired; without this block the script would hard-fail at the first admin-auth
# call and red-X every PR (a false-red, not a real regression). The PR-mode gate
# is a self-check: bash -n on the script's own syntax (catches PR-merge
# regressions that would break the real run on push-to-main). On push / dispatch
# / cron, E2E_REQUIRE_LIVE=1 and the real staging boot runs and HARD FAILs
# (exit 5 via require_live_or_die) on a run that validated no live lifecycle.
# PR-mode gate: taken only when REQUIRE_LIVE is "0" AND ADMIN_TOKEN is empty.
# Any other combination (REQUIRE_LIVE != 0, or a token is present) falls
# through to the code below this block.
if [ "${REQUIRE_LIVE}" = "0" ] && [ -z "${ADMIN_TOKEN}" ]; then
log "PR-mode: E2E_REQUIRE_LIVE=0 and no MOLECULE_ADMIN_TOKEN — skipping live staging boot."
log "(the real staging boot runs on push-to-main / dispatch / cron with E2E_REQUIRE_LIVE=1)"
# `bash -n` parses the file without executing any command, so this check only
# detects syntax errors in this script ($0), nothing about runtime behavior.
if ! bash -n "$0"; then
# NOTE(review): presumably fail() exits non-zero so the `ok` line below is
# never reached on a syntax error — fail() is defined outside this view; confirm.
fail "PR-mode self-check FAILED: bash -n on $0 returned non-zero — script has a syntax error"
fi
ok "PR-mode self-check PASSED: $(basename "$0") is bash-clean (real staging boot runs on push-to-main with E2E_REQUIRE_LIVE=1)"
# Exit with success before any staging call is attempted.
exit 0
fi
# Beyond here we are running for real: REQUIRE_LIVE=1 OR ADMIN_TOKEN is set.
# A real run with no admin token is a HARD FAIL (was the `:?` default before #48).
# Missing-token guard for every run that did not take the PR-mode exit: an
# empty ADMIN_TOKEN is reported through fail() with the REQUIRE_LIVE value
# included in the message for diagnosis.
# NOTE(review): this only stops the run if fail() terminates the script
# (non-zero exit) — fail() is defined outside this view; confirm.
if [ -z "${ADMIN_TOKEN}" ]; then
fail "MOLECULE_ADMIN_TOKEN required (Railway staging CP_ADMIN_API_TOKEN) — a non-PR run (E2E_REQUIRE_LIVE=${REQUIRE_LIVE}) needs staging creds"
fi
# Per-runtime model slug dispatch — see lib/model_slug.sh for the rationale.
# Extracted so unit tests (tests/e2e/test_model_slug.sh) can pin every branch
# without booting the full 11-step lifecycle.