ci(e2e-staging): promote E2E Staging Platform Boot to merge-blocking (fail-closed) — #48 #3116
@@ -14,3 +14,10 @@ E2E Peer Visibility (literal MCP list_peers) / E2E Peer Visibility
|
||||
Secret scan / Scan diff for credential-shaped strings
|
||||
template-delivery-e2e / Template-asset delivery (fresh seo-agent — config+prompts via asset channel, seo-all via plugin reconcile)
|
||||
E2E Staging SaaS (full lifecycle) / E2E Staging Concierge Creates Workspace
|
||||
# #48 (RCA molecule-controlplane #878→#885): real platform-managed boot is now
|
||||
# merge-blocking. SSOT updated here in the same PR that removed continue-on-error
|
||||
# (lint-no-coe-on-required forbids CoE on any listed context). REMAINING OWNER
|
||||
# ACTION: add "E2E Staging SaaS (full lifecycle) / E2E Staging Platform Boot
|
||||
# (pull_request)" to branch_protections/main.status_check_contexts AFTER this PR
|
||||
# merges (allowlist-superset-of-BP is lint-clean; BP-superset-of-allowlist is not).
|
||||
E2E Staging SaaS (full lifecycle) / E2E Staging Platform Boot
|
||||
|
||||
@@ -369,22 +369,32 @@ jobs:
|
||||
# honest (BYOK requires a key; platform requires its ABSENCE not to matter) and
|
||||
# gives the regression its own named commit-status for branch protection.
|
||||
#
|
||||
# Add `E2E Staging Platform Boot` to branch protection after 3 consecutive
|
||||
# green runs on main (de-flake window; this path shares the cp#245
|
||||
# boot-timeout flake surface the BYOK job has, so it must prove stable before
|
||||
# it can BLOCK — see the gate-making plan in the PR body).
|
||||
# bp-required: pending #2187
|
||||
# GATING (no continue-on-error), FALSE-GREEN-PROOF via E2E_REQUIRE_LIVE
|
||||
# (0 on pull_request → PR-mode self-check; 1 on push/dispatch/cron → real
|
||||
# staging boot, HARD FAILs on missing infra). Promoted to merge-blocking
|
||||
# per #48 — RCA: molecule-controlplane PR #878 rendered the tenant
|
||||
# `docker run` env block with a blank line that broke shell `\`-continuation,
|
||||
# orphaning the image arg → `docker run exit=127` → no tenant container →
|
||||
# prod onboarding outage 06:04–08:09 UTC 2026-06-21 (fixed by CP #885). The
|
||||
# real-boot e2e being advisory + PR-skipped is why that class escaped
|
||||
# pre-merge. This job now exercises a real platform-managed boot on every
|
||||
# push-to-main and is the merge gate.
|
||||
#
|
||||
# bp-required: now required — added to .gitea/required-contexts.txt by #48
|
||||
#
|
||||
# core#3081 / #48: NO `if:` guard on this job (mirrors
|
||||
# e2e-staging-concierge-creates-workspace). The job IS a required status
|
||||
# context (see .gitea/required-contexts.txt); a required context that never
|
||||
# fires on pull_request degrades the merge gate to a silent indefinite
|
||||
# pending (the exact failure mode lint-required-no-paths exists to prevent;
|
||||
# see feedback_path_filtered_workflow_cant_be_required). The job runs on
|
||||
# every PR with E2E_REQUIRE_LIVE=0 (the harness detects the missing-creds
|
||||
# case and exits 0 after a `bash -n` self-check), and on push/dispatch/cron
|
||||
# with E2E_REQUIRE_LIVE=1 (the real staging boot runs and HARD FAILs on
|
||||
# missing infra).
|
||||
e2e-staging-platform-boot:
|
||||
name: E2E Staging Platform Boot
|
||||
runs-on: ubuntu-latest
|
||||
# core#3081: gate the slow platform-boot job to push/dispatch/cron now
|
||||
# that the workflow's `paths:` filter has been removed (lint-required-no-paths
|
||||
# compliance). Matches the pattern of the other slow jobs in this workflow.
|
||||
if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
|
||||
# Phase 3 (RFC #219 §1): surface without blocking until the de-flake window
|
||||
# closes. mc#2654: do NOT renew this mask silently — the gate-making plan
|
||||
# tracks the flip to false under #2187.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 45
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -410,17 +420,33 @@ jobs:
|
||||
E2E_MODE: smoke
|
||||
E2E_RUN_ID: "platform-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
|
||||
# Fail-closed-on-skip (see BYOK job). smoke mode still runs steps 2/4/7/8b,
|
||||
# so all four required milestones (provisioned/tenant_online/
|
||||
# workspace_online/a2a_roundtrip) fire — the guard is valid for this lane too.
|
||||
E2E_REQUIRE_LIVE: '1'
|
||||
# Fail-closed-on-skip (see BYOK job + e2e-staging-concierge-creates-workspace).
|
||||
# smoke mode still runs steps 2/4/7/8b, so all four required milestones
|
||||
# (provisioned/tenant_online/workspace_online/a2a_roundtrip) fire — the guard
|
||||
# is valid for this lane too.
|
||||
# pull_request: 0 → PRs have no staging creds; the harness's PR-mode
|
||||
# self-check (bash -n) is the gate, then exit 0.
|
||||
# push / dispatch / schedule: 1 → the real staging boot runs and HARD
|
||||
# FAILs (exit 5) on a run that proves no live lifecycle.
|
||||
E2E_REQUIRE_LIVE: ${{ github.event_name == 'pull_request' && '0' || '1' }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify admin token present
|
||||
env:
|
||||
E2E_REQUIRE_LIVE: ${{ github.event_name == 'pull_request' && '0' || '1' }}
|
||||
run: |
|
||||
# PR-mode (#48): on pull_request the job runs with E2E_REQUIRE_LIVE=0
|
||||
# and PRs carry no staging creds. Don't hard-fail here — the harness
|
||||
# detects the missing-creds case, runs a bash -n self-check, and
|
||||
# exits 0. On push/dispatch/cron (E2E_REQUIRE_LIVE=1) the creds MUST
|
||||
# be present and a missing token/AWS-cred is a hard error.
|
||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
||||
if [ "${E2E_REQUIRE_LIVE}" = "0" ]; then
|
||||
echo "PR-mode: no MOLECULE_ADMIN_TOKEN (E2E_REQUIRE_LIVE=0) — harness will self-check and skip the live boot ✓"
|
||||
exit 0
|
||||
fi
|
||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
@@ -123,7 +123,11 @@
|
||||
set -euo pipefail
|
||||
|
||||
CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
|
||||
ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
|
||||
# #48: tolerate an absent admin token here — the PR-mode early-exit below
|
||||
# (E2E_REQUIRE_LIVE=0 + no token) handles the pull_request lane cleanly. On a
|
||||
# real run (push/dispatch/cron, E2E_REQUIRE_LIVE=1) the missing-token case is
|
||||
# caught as a HARD FAIL just past the PR-mode block, with a clear message.
|
||||
ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:-}"
|
||||
RUNTIME="${E2E_RUNTIME:-hermes}"
|
||||
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
|
||||
WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-3600}"
|
||||
@@ -216,6 +220,32 @@ require_live_or_die() {
|
||||
fi
|
||||
}
|
||||
|
||||
# ─── PR-mode early-exit (#48 — mirrors test_staging_concierge_creates_workspace_e2e.sh) ──
|
||||
# This harness is invoked by TWO jobs in e2e-staging-saas.yml:
|
||||
# - e2e-staging-saas (push/dispatch/cron only; always has creds + REQUIRE_LIVE=1)
|
||||
# - e2e-staging-platform-boot (now ALSO pull_request; #48 made it merge-blocking)
|
||||
# E2E_REQUIRE_LIVE=0 on pull_request runs because PRs do not have staging creds
|
||||
# wired; without this block the script would hard-fail at the first admin-auth
|
||||
# call and red-X every PR (a false-red, not a real regression). The PR-mode gate
|
||||
# is a self-check: `bash -n` on this script (catches only syntax-level PR-merge
|
||||
# regressions that would break the real run on push-to-main). On push / dispatch
|
||||
# / cron, E2E_REQUIRE_LIVE=1 and the real staging boot runs and HARD FAILs
|
||||
# (exit 5 via require_live_or_die) on a run that validated no live lifecycle.
|
||||
if [ "${REQUIRE_LIVE}" = "0" ] && [ -z "${ADMIN_TOKEN}" ]; then
|
||||
log "PR-mode: E2E_REQUIRE_LIVE=0 and no MOLECULE_ADMIN_TOKEN — skipping live staging boot."
|
||||
log "(the real staging boot runs on push-to-main / dispatch / cron with E2E_REQUIRE_LIVE=1)"
|
||||
if ! bash -n "$0"; then
|
||||
fail "PR-mode self-check FAILED: bash -n on $0 returned non-zero — script has a syntax error"
|
||||
fi
|
||||
ok "PR-mode self-check PASSED: $(basename "$0") is bash-clean (real staging boot runs on push-to-main with E2E_REQUIRE_LIVE=1)"
|
||||
exit 0
|
||||
fi
|
||||
# Beyond here we are running for real: REQUIRE_LIVE is not 0 OR ADMIN_TOKEN is set.
|
||||
# A real run with no admin token is a HARD FAIL (was the `:?` default before #48).
|
||||
if [ -z "${ADMIN_TOKEN}" ]; then
|
||||
fail "MOLECULE_ADMIN_TOKEN required (Railway staging CP_ADMIN_API_TOKEN) — a non-PR run (E2E_REQUIRE_LIVE=${REQUIRE_LIVE}) needs staging creds"
|
||||
fi
|
||||
|
||||
# Per-runtime model slug dispatch — see lib/model_slug.sh for the rationale.
|
||||
# Extracted so unit tests (tests/e2e/test_model_slug.sh) can pin every branch
|
||||
# without booting the full 11-step lifecycle.
|
||||
|
||||
Reference in New Issue
Block a user