From 8f1d24f33f8945d7760eead046ee5c9552ac613d Mon Sep 17 00:00:00 2001 From: claude-ceo-assistant Date: Mon, 11 May 2026 04:33:56 -0700 Subject: [PATCH] fix(ci): canonicalize MOLECULE_STAGING_ADMIN_TOKEN -> CP_STAGING_ADMIN_API_TOKEN (post-#443 rebase) + drop staging-smoke continue-on-error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-applies PR#462 on current main (PR#443 merged first and renamed canary-staging.yml -> staging-smoke.yml, conflicting #462). Swept 6 files (15 secret-ref flips): - .gitea/workflows/staging-smoke.yml (3 refs + drop continue-on-error + add notify-on-failure step) - .gitea/workflows/e2e-staging-saas.yml (3 refs) - .gitea/workflows/e2e-staging-sanity.yml (3 refs) - .gitea/workflows/e2e-staging-canvas.yml (3 refs) - .gitea/workflows/e2e-staging-external.yml (3 refs) - tests/e2e/STAGING_SAAS_E2E.md (1 heading flip + 1 historical-rename breadcrumb) Each workflow keeps one inline breadcrumb comment pointing back to the old name and internal#322. staging-smoke is the 30-min canary cadence for the entire staging SaaS stack; silent failure (continue-on-error: true) masked exactly the regressions the smoke exists to surface, same class as PR#461 (`sweep-stale-e2e-orgs`). Dropped continue-on-error from the smoke job + added a fail-loud `if: failure()` Notify step mirroring PR#461. The four other `e2e-staging-*` workflows KEEP continue-on-error: true per RFC #219 §1 — they are advisory. Excluded from this PR: - .gitea/workflows/sweep-stale-e2e-orgs.yml (PR#461 owns) - .gitea/workflows/staging-verify.yml (only references the plural MOLECULE_STAGING_ADMIN_TOKENS canary-fleet secret, out of scope) - scripts/staging-smoke.sh (same — plural only) - docs/architecture/canary-release.md (same — plural only) - .github/ mirror tree (separate scope per reference_molecule_core_actions_gitea_only) Verified locally: yaml.safe_load clean on all 5 workflows; grep returns ZERO non-breadcrumb references in the swept files; the plural MOLECULE_STAGING_ADMIN_TOKENS references in staging-verify.yml / scripts/staging-smoke.sh / canary-release.md are intentionally untouched. Refs: internal#322, PR#461, feedback_rename_pr_and_edit_pr_conflict_sequence --- .gitea/workflows/e2e-staging-canvas.yml | 9 ++++-- .gitea/workflows/e2e-staging-external.yml | 9 ++++-- .gitea/workflows/e2e-staging-saas.yml | 9 ++++-- .gitea/workflows/e2e-staging-sanity.yml | 9 ++++-- .gitea/workflows/staging-smoke.yml | 38 ++++++++++++++++++++--- tests/e2e/STAGING_SAAS_E2E.md | 10 +++++- 6 files changed, 66 insertions(+), 18 deletions(-) diff --git a/.gitea/workflows/e2e-staging-canvas.yml b/.gitea/workflows/e2e-staging-canvas.yml index 93eb685e..9b4f1475 100644 --- a/.gitea/workflows/e2e-staging-canvas.yml +++ b/.gitea/workflows/e2e-staging-canvas.yml @@ -124,7 +124,10 @@ jobs: env: CANVAS_E2E_STAGING: '1' MOLECULE_CP_URL: https://staging-api.moleculesai.app - MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + # 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN + # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per + # internal#322 — see this PR for the cross-workflow sweep. + MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} defaults: run: @@ -145,7 +148,7 @@ jobs: if: needs.detect-changes.outputs.canvas == 'true' run: | if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then - echo "::error::Missing MOLECULE_STAGING_ADMIN_TOKEN" + echo "::error::Missing CP_STAGING_ADMIN_API_TOKEN" exit 2 fi @@ -207,7 +210,7 @@ jobs: - name: Teardown safety net if: always() && needs.detect-changes.outputs.canvas == 'true' env: - ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} run: | set +e STATE_FILE=".playwright-staging-state.json" diff --git a/.gitea/workflows/e2e-staging-external.yml b/.gitea/workflows/e2e-staging-external.yml index 7479d8da..6c4e4b91 100644 --- a/.gitea/workflows/e2e-staging-external.yml +++ b/.gitea/workflows/e2e-staging-external.yml @@ -89,7 +89,10 @@ jobs: env: MOLECULE_CP_URL: https://staging-api.moleculesai.app - MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + # 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN + # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per + # internal#322 — see this PR for the cross-workflow sweep. + MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}" E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }} E2E_STALE_WAIT_SECS: ${{ github.event.inputs.stale_wait_secs || '180' }} @@ -104,7 +107,7 @@ jobs: # missing — silent skip would mask infra rot. Manual dispatch # gets the same hard-fail; an operator running this on a fork # without secrets configured needs to know up-front. - echo "::error::MOLECULE_STAGING_ADMIN_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)" + echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)" exit 2 fi echo "Admin token present ✓" @@ -129,7 +132,7 @@ jobs: - name: Teardown safety net (runs on cancel/failure) if: always() env: - ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} run: | set +e orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \ diff --git a/.gitea/workflows/e2e-staging-saas.yml b/.gitea/workflows/e2e-staging-saas.yml index a1e8911b..bfc83b82 100644 --- a/.gitea/workflows/e2e-staging-saas.yml +++ b/.gitea/workflows/e2e-staging-saas.yml @@ -86,7 +86,10 @@ jobs: # Single admin-bearer secret drives provision + tenant-token # retrieval + teardown. Configure in # Settings → Secrets and variables → Actions → Repository secrets. - MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + # 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN + # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per + # internal#322 — see this PR for the cross-workflow sweep. + MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} # MiniMax is the PRIMARY LLM auth path post-2026-05-04. Switched # from hermes+OpenAI default after #2578 (the staging OpenAI key # account went over quota and stayed dead for 36+ hours, taking @@ -122,7 +125,7 @@ jobs: - name: Verify admin token present run: | if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then - echo "::error::MOLECULE_STAGING_ADMIN_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)" + echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)" exit 2 fi echo "Admin token present ✓" @@ -189,7 +192,7 @@ jobs: - name: Teardown safety net (runs on cancel/failure) if: always() env: - ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} run: | # Best-effort: find any e2e-YYYYMMDD-* orgs matching this run and # nuke them. Catches the case where the script died before diff --git a/.gitea/workflows/e2e-staging-sanity.yml b/.gitea/workflows/e2e-staging-sanity.yml index b1a9ddfe..bf878a88 100644 --- a/.gitea/workflows/e2e-staging-sanity.yml +++ b/.gitea/workflows/e2e-staging-sanity.yml @@ -42,7 +42,10 @@ jobs: env: MOLECULE_CP_URL: https://staging-api.moleculesai.app - MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + # 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN + # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per + # internal#322 — see this PR for the cross-workflow sweep. + MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} E2E_MODE: smoke E2E_RUNTIME: hermes E2E_RUN_ID: "sanity-${{ github.run_id }}" @@ -54,7 +57,7 @@ jobs: - name: Verify admin token present run: | if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then - echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set" + echo "::error::CP_STAGING_ADMIN_API_TOKEN not set" exit 2 fi @@ -118,7 +121,7 @@ jobs: - name: Teardown safety net if: always() env: - ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} run: | set +e orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \ diff --git a/.gitea/workflows/staging-smoke.yml b/.gitea/workflows/staging-smoke.yml index 4a7972d8..623c47ff 100644 --- a/.gitea/workflows/staging-smoke.yml +++ b/.gitea/workflows/staging-smoke.yml @@ -52,8 +52,20 @@ jobs: smoke: name: Staging SaaS smoke runs-on: ubuntu-latest - # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + # NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed + # 2026-05-11. The "surface broken workflows without blocking" + # rationale was correctly applied to advisory/lint workflows but + # wrong for this smoke — it is the 30-min canary cadence for the + # entire staging SaaS stack, and silent failure here masks the + # exact regressions the smoke exists to surface (AMI rot, CF cert + # drift, WorkOS session breakage, secret rotations). Same class of + # failure as PR#461 (`sweep-stale-e2e-orgs`) where Phase-3 silent + # failure leaked EC2. The four other `e2e-staging-*` workflows + # KEEP `continue-on-error: true` per RFC #219 §1 — they are + # advisory and matrix-style; this one is the canary. A follow-up + # `notify-failure` step below also surfaces breakage to ops even + # if branch-protection wiring is adjusted to keep this off the + # required-checks list. # 25 min headroom over the 15-min TLS-readiness deadline in # tests/e2e/test_staging_full_saas.sh (#2107). Without the buffer # the job is killed at the wall-clock 15:00 mark BEFORE the bash @@ -65,7 +77,10 @@ jobs: env: MOLECULE_CP_URL: https://staging-api.moleculesai.app - MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + # 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN + # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per + # internal#322 — see this PR for the cross-workflow sweep. + MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} # MiniMax is the smoke's PRIMARY LLM auth path post-2026-05-04. # Switched from hermes+OpenAI after #2578 (the staging OpenAI key # account went over quota and stayed dead for 36+ hours, taking @@ -111,7 +126,7 @@ jobs: - name: Verify admin token present run: | if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then - echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set" + echo "::error::CP_STAGING_ADMIN_API_TOKEN not set" exit 2 fi @@ -241,7 +256,7 @@ jobs: - name: Teardown safety net if: always() env: - ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} run: | set +e # Slug prefix matches what test_staging_full_saas.sh emits @@ -316,3 +331,16 @@ jobs: echo "::warning::smoke teardown left ${#leaks[@]} leak(s): ${leaks[*]}" fi exit 0 + + - name: Notify on smoke failure + # Fail-loud companion to dropping `continue-on-error: true`. + # The Open-issue-on-failure step above handles the human-facing + # alert; this step emits a clearly-tagged ::error:: line that + # log-tail consumers (Loki SOPRefireRule, orchestrator triage + # loop) can grep on. Mirrors PR#461's sweep-stale-e2e-orgs + # pattern. Runs AFTER the teardown safety net (which is + # if: always()) so failures don't suppress cleanup. + if: failure() + run: | + echo "::error::staging-smoke FAILED — staging SaaS canary is red. See prior step logs + the auto-filed alert issue. Common causes: (a) CP_STAGING_ADMIN_API_TOKEN secret missing/rotated, (b) staging-api.moleculesai.app 5xx, (c) MiniMax/Anthropic LLM key dead, (d) AMI/CF/WorkOS drift. The 30-min cron will retry, but a chronic red here indicates the staging SaaS stack is broken end-to-end." + exit 1 diff --git a/tests/e2e/STAGING_SAAS_E2E.md b/tests/e2e/STAGING_SAAS_E2E.md index b31a7cec..cbfc1f10 100644 --- a/tests/e2e/STAGING_SAAS_E2E.md +++ b/tests/e2e/STAGING_SAAS_E2E.md @@ -49,7 +49,15 @@ Runs the harness with `E2E_INTENTIONAL_FAILURE=1`, which poisons the tenant admi Set in **Settings → Secrets and variables → Actions → Repository secrets**: -### `MOLECULE_STAGING_ADMIN_TOKEN` +### `CP_STAGING_ADMIN_API_TOKEN` + +> **Historical-rename note (2026-05-11):** previously named +> `MOLECULE_STAGING_ADMIN_TOKEN`. Canonicalised to +> `CP_STAGING_ADMIN_API_TOKEN` per internal#322 (the Railway staging +> service exposes it as `CP_ADMIN_API_TOKEN`; the `CP_*` repo-secret +> prefix matches the upstream env name + makes the service it talks +> to obvious in workflow YAMLs). See the original PR for the +> cross-workflow sweep. The `CP_ADMIN_API_TOKEN` env currently set on the Railway staging molecule-platform → controlplane service.