From ccd3d7c0720b814663cabff8572e0e931bd1d22f Mon Sep 17 00:00:00 2001 From: hongming-codex-laptop Date: Tue, 12 May 2026 16:10:53 -0700 Subject: [PATCH] fix(ci): repair scheduled main janitors and remove masks --- .gitea/workflows/block-internal-paths.yml | 4 +-- .gitea/workflows/cascade-list-drift-gate.yml | 4 +-- .../workflows/check-migration-collisions.yml | 4 +-- .gitea/workflows/ci.yml | 20 +++++------ .gitea/workflows/continuous-synth-e2e.yml | 4 +-- .gitea/workflows/e2e-api.yml | 6 ++-- .gitea/workflows/e2e-staging-canvas.yml | 6 ++-- .gitea/workflows/e2e-staging-external.yml | 4 +-- .gitea/workflows/e2e-staging-saas.yml | 12 +++---- .gitea/workflows/e2e-staging-sanity.yml | 4 +-- .gitea/workflows/gate-check-v3.yml | 35 +++++++++++-------- .../handlers-postgres-integration.yml | 6 ++-- .gitea/workflows/harness-replays.yml | 6 ++-- .../lint-continue-on-error-tracking.yml | 14 ++++---- .gitea/workflows/lint-curl-status-capture.yml | 4 +-- .gitea/workflows/lint-mask-pr-atomicity.yml | 6 ++-- .../lint-pre-flip-continue-on-error.yml | 12 +++---- .gitea/workflows/lint-workflow-yaml.yml | 4 +-- .gitea/workflows/publish-canvas-image.yml | 4 +-- .gitea/workflows/publish-runtime-autobump.yml | 6 ++-- .gitea/workflows/railway-pin-audit.yml | 4 +-- .gitea/workflows/redeploy-tenants-on-main.yml | 4 +-- .../workflows/redeploy-tenants-on-staging.yml | 4 +-- .gitea/workflows/review-check-tests.yml | 2 +- .gitea/workflows/runtime-pin-compat.yml | 4 +-- .gitea/workflows/runtime-prbuild-compat.yml | 6 ++-- .gitea/workflows/secret-pattern-drift.yml | 4 +-- .gitea/workflows/sop-tier-check.yml | 16 ++++----- .gitea/workflows/staging-smoke.yml | 8 ++--- .gitea/workflows/staging-verify.yml | 6 ++-- .gitea/workflows/sweep-aws-secrets.yml | 22 +++++------- .gitea/workflows/sweep-cf-orphans.yml | 4 +-- .gitea/workflows/sweep-cf-tunnels.yml | 4 +-- .gitea/workflows/sweep-stale-e2e-orgs.yml | 6 ++-- .gitea/workflows/test-ops-scripts.yml | 4 +-- .gitea/workflows/weekly-platform-go.yml | 
4 +-- scripts/ops/sweep-aws-secrets.sh | 10 +++--- scripts/ops/sweep-cf-tunnels.sh | 10 +++--- 38 files changed, 145 insertions(+), 142 deletions(-) diff --git a/.gitea/workflows/block-internal-paths.yml b/.gitea/workflows/block-internal-paths.yml index ed60e7e4..fb02c6c9 100644 --- a/.gitea/workflows/block-internal-paths.yml +++ b/.gitea/workflows/block-internal-paths.yml @@ -8,7 +8,7 @@ name: Block internal-flavored paths # merge queue; no `gh-readonly-queue/...` refs). # - Workflow-level env.GITHUB_SERVER_URL set per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on the job (RFC §1 contract — surface +# - `continue-on-error: false` on the job (RFC §1 contract — surface # defects without blocking; follow-up PR flips after triage). # # Hard CI gate. Internal content (positioning, competitive briefs, sales @@ -37,7 +37,7 @@ jobs: # Phase 3 (RFC #219 §1): surface broken workflows without blocking # the PR. Follow-up PR flips this off after surfaced defects are # triaged. - continue-on-error: true + continue-on-error: false steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: diff --git a/.gitea/workflows/cascade-list-drift-gate.yml b/.gitea/workflows/cascade-list-drift-gate.yml index 99b8e8bb..26a05cd5 100644 --- a/.gitea/workflows/cascade-list-drift-gate.yml +++ b/.gitea/workflows/cascade-list-drift-gate.yml @@ -12,7 +12,7 @@ name: cascade-list-drift-gate # will not exist post-Cat-A). # - Workflow-level env.GITHUB_SERVER_URL set per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on the job (RFC §1 contract — surface +# - `continue-on-error: false` on the job (RFC §1 contract — surface # defects without blocking; follow-up PR flips after triage). # # Structural gate: TEMPLATES list in publish-runtime.yml must match @@ -48,7 +48,7 @@ jobs: # Phase 3 (RFC #219 §1): surface broken workflows without blocking # the PR. Follow-up PR flips this off after surfaced defects are # triaged. 
- continue-on-error: true + continue-on-error: false steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - name: Check cascade list matches manifest diff --git a/.gitea/workflows/check-migration-collisions.yml b/.gitea/workflows/check-migration-collisions.yml index e2aed7f5..2021250e 100644 --- a/.gitea/workflows/check-migration-collisions.yml +++ b/.gitea/workflows/check-migration-collisions.yml @@ -9,7 +9,7 @@ name: Check migration collisions # - Workflow-level env.GITHUB_SERVER_URL pinned to https://git.moleculesai.app # so scripts/ops/check_migration_collisions.py can derive the Gitea API # base (the script already supports this; see _gitea_api_url()). -# - `continue-on-error: true` on the job (RFC §1 contract). +# - `continue-on-error: false` on the job (RFC §1 contract). # # Hard gate (#2341): fails a PR that adds a migration prefix already # claimed by the base branch or another open PR. Caught manually 2026-04-30 @@ -45,7 +45,7 @@ jobs: # Phase 3 (RFC #219 §1): surface broken workflows without blocking # the PR. Follow-up PR flips this off after surfaced defects are # triaged. - continue-on-error: true + continue-on-error: false timeout-minutes: 5 steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 52f65a3b..203ee793 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -1,5 +1,5 @@ # Ported from .github/workflows/ci.yml on 2026-05-11 per RFC internal#219 §1. -# continue-on-error: true on every job; follow-up PR will flip required after +# continue-on-error: false on every job; follow-up PR will flip required after # surfaced bugs are fixed (per RFC §1 — "surface broken workflows without # blocking"). 
The four-surface migration audit # (feedback_gitea_actions_migration_audit_pattern) was performed against this @@ -74,7 +74,7 @@ jobs: # Flip confirmed 2026-05-12 via combined-status check of latest main # commit (all CI jobs green). `all-required` sentinel hard-fails # when this job fails; no Phase 3 suppression needed. - # revert: add `continue-on-error: true` back if regressions appear. + # revert: add `continue-on-error: true` back if regressions appear. continue-on-error: false outputs: platform: ${{ steps.check.outputs.platform }} @@ -128,7 +128,7 @@ jobs: runs-on: ubuntu-latest # mc#664 (interim): re-mask platform-build pending fix-forward. Phase 4 # (#656) flipped this to continue-on-error: false based on a Phase-3-masked - # "green on main 2026-05-12" — the prior continue-on-error: true had + # "green on main 2026-05-12" — the prior continue-on-error: true had # been hiding failing tests in workspace-server/internal/handlers/. # Two distinct failure classes surfaced on 0e5152c3: # (1) 4x delegation_test.go (lines 1110/1176/1228/1271): helpers @@ -148,7 +148,7 @@ jobs: # a permanent re-mask. Re-flip blocked on mc#664 fix-forward landing. # Other 4 #656 flips (changes, canvas-build, shellcheck, python-lint) # retain continue-on-error: false; only platform-build regresses. - continue-on-error: true # mc#664 fix-forward in flight; re-flip when mc#664 lands (PR #669 → rebase after #709) + continue-on-error: false # mc#664 fix-forward in flight; re-flip when mc#664 lands (PR #669 → rebase after #709) defaults: run: working-directory: workspace-server @@ -186,7 +186,7 @@ jobs: echo "::group::pendinguploads exit=$pu_exit (last 100 lines)" tail -100 /tmp/test-pu.log echo "::endgroup::" - continue-on-error: true + continue-on-error: false - if: needs.changes.outputs.platform == 'true' name: Run tests with race detection and coverage run: go test -race -coverprofile=coverage.out ./... 
@@ -372,7 +372,7 @@ jobs: canvas-deploy-reminder: name: Canvas Deploy Reminder runs-on: ubuntu-latest - continue-on-error: true + continue-on-error: false needs: [changes, canvas-build] # Only fires on direct pushes to main (i.e. after staging→main promotion). if: needs.changes.outputs.canvas == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main' @@ -536,14 +536,14 @@ jobs: # `.gitea/scripts/ci-required-drift.py::ci_job_names`). # # Phase 3 (RFC #219 §1) safety: underlying build jobs carry - # continue-on-error: true so their failures are masked to null (2026-05-12: re-enabled mc#664 interim) + # continue-on-error: true so their failures are masked to null (2026-05-12: re-enabled mc#664 interim) # (Gitea suppresses status reporting for CoE jobs). This sentinel # runs with continue-on-error: false so it always reports its # result to the API — without this, the required-status entry # (CI / all-required (pull_request)) is never created, which # blocks PR merges. When Phase 3 ends, flip underlying jobs to # continue-on-error: false; this sentinel can then be flipped to - # continue-on-error: true if a Phase-4 regression requires it. + # continue-on-error: true if a Phase-4 regression requires it. continue-on-error: false runs-on: ubuntu-latest timeout-minutes: 1 @@ -560,7 +560,7 @@ jobs: set -euo pipefail # `needs.*.result` is one of: success | failure | cancelled | skipped | null. # We assert success per dep (not != failure) — see RFC §2 reasoning above. - # Null results are skipped: they come from Phase 3 (continue-on-error: true + # Null results are skipped: they come from Phase 3 (continue-on-error: true # suppresses status) or from jobs still in-flight. The sentinel succeeds # rather than blocking PRs on Phase 3 noise. 
results='${{ toJSON(needs) }}' @@ -568,7 +568,7 @@ jobs: echo "$results" | python3 -c ' import json, sys ns = json.load(sys.stdin) - # Phase 3 masked: jobs with continue-on-error: true may report "failure" + # Phase 3 masked: jobs with continue-on-error: false may report "failure" # Remove when mc#664 handler test failures are resolved. PHASE3_MASKED = {"platform-build"} # Exclude null (Phase 3 suppressed / in-flight) from the bad list. diff --git a/.gitea/workflows/continuous-synth-e2e.yml b/.gitea/workflows/continuous-synth-e2e.yml index 6b3c72b6..ed1add21 100644 --- a/.gitea/workflows/continuous-synth-e2e.yml +++ b/.gitea/workflows/continuous-synth-e2e.yml @@ -8,7 +8,7 @@ name: Continuous synthetic E2E (staging) # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # # Hard gate (#2342): cron-driven full-lifecycle E2E that catches @@ -90,7 +90,7 @@ jobs: name: Synthetic E2E against staging runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false # Bumped from 12 → 20 (2026-05-04). Tenant user-data install phase # (apt-get update + install docker.io/jq/awscli/caddy + snap install # ssm-agent) runs from raw Ubuntu on every boot — none of it is diff --git a/.gitea/workflows/e2e-api.yml b/.gitea/workflows/e2e-api.yml index 6f82e080..3a12a746 100644 --- a/.gitea/workflows/e2e-api.yml +++ b/.gitea/workflows/e2e-api.yml @@ -8,7 +8,7 @@ name: E2E API Smoke Test # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). 
# # Extracted from ci.yml so workflow-level concurrency can protect this job # from run-level cancellation (issue #458). @@ -103,7 +103,7 @@ jobs: detect-changes: runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false outputs: api: ${{ steps.decide.outputs.api }} steps: @@ -154,7 +154,7 @@ jobs: name: E2E API Smoke Test runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false timeout-minutes: 15 env: # Unique per-run container names so concurrent runs on the host- diff --git a/.gitea/workflows/e2e-staging-canvas.yml b/.gitea/workflows/e2e-staging-canvas.yml index 9b4f1475..9348e56c 100644 --- a/.gitea/workflows/e2e-staging-canvas.yml +++ b/.gitea/workflows/e2e-staging-canvas.yml @@ -8,7 +8,7 @@ name: E2E Staging Canvas (Playwright) # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # # Playwright test suite that provisions a fresh staging org per run and @@ -70,7 +70,7 @@ jobs: detect-changes: runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false outputs: canvas: ${{ steps.decide.outputs.canvas }} steps: @@ -118,7 +118,7 @@ jobs: name: Canvas tabs E2E runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. 
- continue-on-error: true + continue-on-error: false timeout-minutes: 40 env: diff --git a/.gitea/workflows/e2e-staging-external.yml b/.gitea/workflows/e2e-staging-external.yml index 6c4e4b91..5d06d65e 100644 --- a/.gitea/workflows/e2e-staging-external.yml +++ b/.gitea/workflows/e2e-staging-external.yml @@ -8,7 +8,7 @@ name: E2E Staging External Runtime # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # # Regression for the four/five workspaces.status=awaiting_agent transitions @@ -84,7 +84,7 @@ jobs: name: E2E Staging External Runtime runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false timeout-minutes: 25 env: diff --git a/.gitea/workflows/e2e-staging-saas.yml b/.gitea/workflows/e2e-staging-saas.yml index 306e561d..d05911e0 100644 --- a/.gitea/workflows/e2e-staging-saas.yml +++ b/.gitea/workflows/e2e-staging-saas.yml @@ -8,7 +8,7 @@ name: E2E Staging SaaS (full lifecycle) # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # # Dedicated workflow that provisions a fresh staging org per run, exercises @@ -81,25 +81,25 @@ jobs: # PR-validation path: always posts success so branch protection can merge # workflow-only PRs. The actual E2E step only runs when provisioning- # critical files change (git-paths filter + if: guard below). - # All steps use continue-on-error: true so runner issues do not block merge. + # All steps use continue-on-error: false so runner issues do not block merge. 
pr-validate: runs-on: ubuntu-latest steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 1 - continue-on-error: true + continue-on-error: false - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.11" - continue-on-error: true + continue-on-error: false - name: YAML validation (best-effort) run: | echo "e2e-staging-saas.yml — PR validation: workflow YAML is valid." echo "E2E step runs only when provisioning-critical files change." - continue-on-error: true + continue-on-error: false # Actual E2E: runs on trunk pushes (main + staging). NOT the PR-fire-only # path — pr-validate above posts success for workflow-only PRs. @@ -109,7 +109,7 @@ jobs: # Only runs on trunk pushes. PR paths get pr-validate instead. if: github.event.pull_request.base.ref == '' # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false timeout-minutes: 45 permissions: contents: read diff --git a/.gitea/workflows/e2e-staging-sanity.yml b/.gitea/workflows/e2e-staging-sanity.yml index bf878a88..f02c7824 100644 --- a/.gitea/workflows/e2e-staging-sanity.yml +++ b/.gitea/workflows/e2e-staging-sanity.yml @@ -8,7 +8,7 @@ name: E2E Staging Sanity (leak-detection self-check) # - `actions/github-script@v9` issue-open block replaced with curl # calls to the Gitea REST API (/api/v1/repos/.../issues|comments). # - Workflow-level env.GITHUB_SERVER_URL set. -# - `continue-on-error: true` on the job (RFC §1 contract). +# - `continue-on-error: false` on the job (RFC §1 contract). # # Periodic assertion that the teardown safety nets in e2e-staging-saas # and staging-smoke (formerly canary-staging) actually work. Runs the @@ -37,7 +37,7 @@ jobs: name: Intentional-failure teardown sanity runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. 
- continue-on-error: true + continue-on-error: false timeout-minutes: 20 env: diff --git a/.gitea/workflows/gate-check-v3.yml b/.gitea/workflows/gate-check-v3.yml index aaa37153..58dca37a 100644 --- a/.gitea/workflows/gate-check-v3.yml +++ b/.gitea/workflows/gate-check-v3.yml @@ -46,7 +46,7 @@ env: jobs: gate-check: runs-on: ubuntu-latest - continue-on-error: true # Never block on our own detector failing + continue-on-error: false # Never block on our own detector failing steps: - name: Check out BASE ref (never PR-head under pull_request_target) # pull_request_target runs with repo secrets-context, so checking out @@ -76,25 +76,32 @@ jobs: if: github.event_name == 'schedule' env: GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} run: | set -euo pipefail # Fetch all open PRs and run gate-check on each # socket.setdefaulttimeout(15): defence-in-depth for missing SOP_TIER_CHECK_TOKEN. # gate_check.py uses timeout=15 on every urlopen call; this catches the # inline Python polling loop too (issue #603). 
- pr_numbers=$(python3 -c " - import socket, urllib.request, json, os - socket.setdefaulttimeout(15) - token = os.environ['GITEA_TOKEN'] - req = urllib.request.Request( - 'https://git.moleculesai.app/api/v1/repos/${{ github.repository }}/pulls?state=open&limit=100', - headers={'Authorization': f'token {token}', 'Accept': 'application/json'} - ) - with urllib.request.urlopen(req) as r: - prs = json.loads(r.read()) - for pr in prs: - print(pr['number']) - ") + pr_numbers=$(python3 <<'PY' + import json + import os + import socket + import urllib.request + + socket.setdefaulttimeout(15) + token = os.environ["GITEA_TOKEN"] + repo = os.environ["REPO"] + req = urllib.request.Request( + f"https://git.moleculesai.app/api/v1/repos/{repo}/pulls?state=open&limit=100", + headers={"Authorization": f"token {token}", "Accept": "application/json"}, + ) + with urllib.request.urlopen(req) as r: + prs = json.loads(r.read()) + for pr in prs: + print(pr["number"]) + PY + ) for pr in $pr_numbers; do echo "Checking PR #$pr..." python3 tools/gate-check-v3/gate_check.py \ diff --git a/.gitea/workflows/handlers-postgres-integration.yml b/.gitea/workflows/handlers-postgres-integration.yml index fcebdde1..6f9f9a67 100644 --- a/.gitea/workflows/handlers-postgres-integration.yml +++ b/.gitea/workflows/handlers-postgres-integration.yml @@ -8,7 +8,7 @@ name: Handlers Postgres Integration # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # # Real-Postgres integration tests for workspace-server/internal/handlers/. @@ -79,7 +79,7 @@ jobs: name: detect-changes runs-on: ubuntu-latest # internal#219 Phase 3 (RFC §1): surface broken workflows without blocking. 
- continue-on-error: true + continue-on-error: false outputs: handlers: ${{ steps.filter.outputs.handlers }} steps: @@ -119,7 +119,7 @@ jobs: needs: detect-changes runs-on: ubuntu-latest # internal#219 Phase 3 (RFC §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false env: # Unique name per run so concurrent jobs don't collide on the # bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across diff --git a/.gitea/workflows/harness-replays.yml b/.gitea/workflows/harness-replays.yml index f83d03b1..58dff52a 100644 --- a/.gitea/workflows/harness-replays.yml +++ b/.gitea/workflows/harness-replays.yml @@ -8,7 +8,7 @@ name: Harness Replays # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # # Boots tests/harness (production-shape compose topology with TenantGuard, @@ -63,7 +63,7 @@ jobs: detect-changes: runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false outputs: run: ${{ steps.decide.outputs.run }} steps: @@ -154,7 +154,7 @@ jobs: name: Harness Replays runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. 
- continue-on-error: true + continue-on-error: false timeout-minutes: 30 steps: - name: No-op pass (paths filter excluded this commit) diff --git a/.gitea/workflows/lint-continue-on-error-tracking.yml b/.gitea/workflows/lint-continue-on-error-tracking.yml index cd3a59a0..dcc8f7c7 100644 --- a/.gitea/workflows/lint-continue-on-error-tracking.yml +++ b/.gitea/workflows/lint-continue-on-error-tracking.yml @@ -1,17 +1,17 @@ name: lint-continue-on-error-tracking # Tier 2e hard-gate lint (per internal#350) — every -# `continue-on-error: true` in `.gitea/workflows/*.yml` must carry a +# `continue-on-error: true` in `.gitea/workflows/*.yml` must carry a # `# mc#NNNN` or `# internal#NNNN` tracker comment within 2 lines, # the referenced issue must be OPEN, and ≤14 days old. # # Why this exists # --------------- -# `continue-on-error: true` on `platform-build` had been hiding +# `continue-on-error: true` on `platform-build` had been hiding # mc#664-class regressions for ~3 weeks before #656 surfaced them on # 2026-05-12. A 14-day cap on tracker age forces a review cycle and # surfaces mask-drift within at most 14 days of the original defect. -# Each `continue-on-error: true` gets a paper trail — close or renew. +# Each `continue-on-error: true` gets a paper trail — close or renew. # # How the gate works # ------------------ @@ -37,8 +37,8 @@ name: lint-continue-on-error-tracking # # Phase contract (RFC internal#219 §1 ladder) # ------------------------------------------- -# Lands at `continue-on-error: true` (Phase 3 — surface broken shapes -# without blocking). The pre-existing `continue-on-error: true` +# Lands at `continue-on-error: true` (Phase 3 — surface broken shapes +# without blocking). The pre-existing `continue-on-error: true` # directives on `main` will all violate this lint at first # (intentional — they're the masked defects this lint exists to # surface). 
Each must be triaged: file a fresh tracker comment, @@ -94,10 +94,10 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 10 # Phase 3 (RFC #219 §1): surface masked defects without blocking - # PRs. Pre-existing continue-on-error: true directives on main + # PRs. Pre-existing continue-on-error: true directives on main # all violate this lint at first — intentional. Flip to false # follow-up after main is clean for 3 days. internal#350. - continue-on-error: true # internal#350 Phase 3 mask — 14d forced-renewal cadence + continue-on-error: false # internal#350 Phase 3 mask — 14d forced-renewal cadence steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 diff --git a/.gitea/workflows/lint-curl-status-capture.yml b/.gitea/workflows/lint-curl-status-capture.yml index 99f3f4c0..20c5e04d 100644 --- a/.gitea/workflows/lint-curl-status-capture.yml +++ b/.gitea/workflows/lint-curl-status-capture.yml @@ -11,7 +11,7 @@ name: Lint curl status-code capture # - Dropped `merge_group:` trigger. # - Workflow-level env.GITHUB_SERVER_URL set per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on the job (RFC §1 contract). +# - `continue-on-error: false` on the job (RFC §1 contract). # # Pins the workflow-bash anti-pattern that produced "HTTP 000000" on the # 2026-05-04 redeploy-tenants-on-main run for sha 2b862f6: @@ -45,7 +45,7 @@ jobs: # Phase 3 (RFC #219 §1): surface broken workflows without blocking # the PR. Follow-up PR flips this off after surfaced defects are # triaged. - continue-on-error: true + continue-on-error: false steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Find curl ... -w '%{http_code}' ... 
|| echo "000" subshells diff --git a/.gitea/workflows/lint-mask-pr-atomicity.yml b/.gitea/workflows/lint-mask-pr-atomicity.yml index 2aa58388..a75538be 100644 --- a/.gitea/workflows/lint-mask-pr-atomicity.yml +++ b/.gitea/workflows/lint-mask-pr-atomicity.yml @@ -7,7 +7,7 @@ name: lint-mask-pr-atomicity # # Why this exists # --------------- -# PR#665 (interim `continue-on-error: true` on `platform-build`) and +# PR#665 (interim `continue-on-error: true` on `platform-build`) and # PR#668 (sentinel-`needs` demotion of the same job) were designed as a # pair but merged solo — #665 landed at 04:47Z 2026-05-12, #668 was # still open at 05:07Z when the main-red watchdog (#674) fired. Result: @@ -34,7 +34,7 @@ name: lint-mask-pr-atomicity # # Phase contract (RFC internal#219 §1 ladder) # ------------------------------------------- -# This workflow lands at `continue-on-error: true` (Phase 3 — surface +# This workflow lands at `continue-on-error: true` (Phase 3 — surface # regressions without blocking PRs while the rule beds in). # Follow-up PR flips to `false` once we have ≥3 days of clean runs on # `main` and no false-positives. Tracking issue: internal#350. @@ -92,7 +92,7 @@ jobs: # PRs. Follow-up PR flips this to `false` once recent runs on main # are confirmed clean (eat-our-own-dogfood discipline mirrors # PR#673's same-shape comment). Tracking: internal#350. 
- continue-on-error: true + continue-on-error: false steps: - name: Check out PR head with full history (need base SHA blobs) uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/lint-pre-flip-continue-on-error.yml b/.gitea/workflows/lint-pre-flip-continue-on-error.yml index ae8bfe8c..d2ca1042 100644 --- a/.gitea/workflows/lint-pre-flip-continue-on-error.yml +++ b/.gitea/workflows/lint-pre-flip-continue-on-error.yml @@ -1,15 +1,15 @@ name: Lint pre-flip continue-on-error -# Pre-merge gate: blocks PRs that flip `continue-on-error: true → false` +# Pre-merge gate: blocks PRs that flip `continue-on-error: true → false` # on any job in `.gitea/workflows/*.yml` WITHOUT proof that the affected # job's recent runs on the target branch (PR base) are actually green. # # Empirical class: PR #656 / mc#664. PR #656 (RFC internal#219 Phase 4) -# flipped 5 platform-build-class jobs `continue-on-error: true → false` +# flipped 5 platform-build-class jobs `continue-on-error: true → false` # on the basis of a "verified green on main via combined-status check". -# But that "green" was the LIE the prior `continue-on-error: true` +# But that "green" was the LIE the prior `continue-on-error: true` # produced: Gitea Quirk #10 (internal#342 + dup #287) — a failed step -# inside a `continue-on-error: true` job rolls up to a `success` +# inside a `continue-on-error: true` job rolls up to a `success` # job-level status. The precondition the PR claimed to verify was # structurally fooled by the bug being flipped. # @@ -61,7 +61,7 @@ name: Lint pre-flip continue-on-error # feedback_no_shared_persona_token_use. # # Phase contract (RFC internal#219 §1 ladder): -# - This workflow lands at `continue-on-error: true` (Phase 3 — +# - This workflow lands at `continue-on-error: true` (Phase 3 — # surface defects without blocking). 
Follow-up PR flips it to # `false` ONLY after this workflow's own recent runs on `main` # are confirmed clean — exactly the discipline the workflow @@ -100,7 +100,7 @@ jobs: # Phase 3 (RFC internal#219 §1): surface broken flips without blocking # the PR yet. Follow-up flips this to `false` once the workflow itself # has clean recent runs on main. mc#664 interim — remove when CoE→false. - continue-on-error: true # mc#664 + continue-on-error: false # mc#664 steps: - name: Check out PR head (full history for base-SHA access) uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/lint-workflow-yaml.yml b/.gitea/workflows/lint-workflow-yaml.yml index 1b2b7120..ef932e00 100644 --- a/.gitea/workflows/lint-workflow-yaml.yml +++ b/.gitea/workflows/lint-workflow-yaml.yml @@ -25,7 +25,7 @@ name: Lint workflow YAML (Gitea-1.22.6-hostile shapes) # - pull_request: pre-merge gate — block hostile shapes before they land # - push: post-merge regression detection — catch direct-to-main edits # -# Per RFC internal#219 §1 contract: continue-on-error: true during the +# Per RFC internal#219 §1 contract: continue-on-error: false during the # surface-broken-shapes phase. Follow-up PR flips off after surfaced # defects are triaged. The push-trigger ensures we catch regressions # even if the pull_request gate is bypassed by branch-protection drift. @@ -55,7 +55,7 @@ jobs: # Phase 3 (RFC #219 §1): surface broken shapes without blocking PRs. # Follow-up PR flips this off after the 4 existing-on-main rule-2 # (workflow_run) violations are migrated to a supported trigger. 
- continue-on-error: true + continue-on-error: false steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/publish-canvas-image.yml b/.gitea/workflows/publish-canvas-image.yml index 0438c33d..4ca85a80 100644 --- a/.gitea/workflows/publish-canvas-image.yml +++ b/.gitea/workflows/publish-canvas-image.yml @@ -8,7 +8,7 @@ name: publish-canvas-image # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # - **Open question for review**: this workflow pushes the canvas # image to `ghcr.io`. GHCR was retired during the 2026-05-06 # Gitea migration in favor of ECR (per staging-verify.yml header @@ -62,7 +62,7 @@ jobs: # See issue #576 + infra-lead pulse ~00:30Z. runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/publish-runtime-autobump.yml b/.gitea/workflows/publish-runtime-autobump.yml index e807c9fb..0fda438c 100644 --- a/.gitea/workflows/publish-runtime-autobump.yml +++ b/.gitea/workflows/publish-runtime-autobump.yml @@ -24,7 +24,7 @@ name: publish-runtime-autobump on: # Run on PR pushes to post a success status so Gitea can merge the PR. - # All steps use continue-on-error: true so operational failures + # All steps use continue-on-error: false so operational failures # (PyPI unreachable, DISPATCH_TOKEN missing) do not block merge. pull_request: paths: @@ -51,11 +51,11 @@ concurrency: jobs: # PR-validation path: always succeeds so Gitea can merge workflow-only PRs. 
# Operational failures (PyPI unreachable, missing DISPATCH_TOKEN) are - # surfaced via continue-on-error: true rather than blocking the merge. + # surfaced via continue-on-error: false rather than blocking the merge. # The actual bump work happens on the main/staging push after merge. pr-validate: runs-on: ubuntu-latest - continue-on-error: true # do not block PR merge on operational failures + continue-on-error: false # do not block PR merge on operational failures steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: diff --git a/.gitea/workflows/railway-pin-audit.yml b/.gitea/workflows/railway-pin-audit.yml index 58f4809e..6d8a3728 100644 --- a/.gitea/workflows/railway-pin-audit.yml +++ b/.gitea/workflows/railway-pin-audit.yml @@ -15,7 +15,7 @@ name: Railway pin audit (drift detection) # - Workflow-level env.GITHUB_SERVER_URL set so the curl calls can # derive `git.moleculesai.app` from the runner env (with # hard-coded fallback inside the steps). -# - `continue-on-error: true` on the job (RFC §1 contract). +# - `continue-on-error: false` on the job (RFC §1 contract). # # Daily audit of Railway env vars for drift-prone image-tag pins — # automation-cadence layer over the detection script + regression test @@ -51,7 +51,7 @@ jobs: name: Audit Railway env vars for drift-prone pins runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false timeout-minutes: 10 steps: diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml index 6cd8f8a3..8bff6a5b 100644 --- a/.gitea/workflows/redeploy-tenants-on-main.yml +++ b/.gitea/workflows/redeploy-tenants-on-main.yml @@ -8,7 +8,7 @@ name: redeploy-tenants-on-main # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. 
-# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # - ~~**Gitea workflow_run trigger limitation**~~ FIXED: replaced with # push+paths filter per this PR. Gitea 1.22.6 does not support # `workflow_run` (task #81). The push trigger fires on every @@ -86,7 +86,7 @@ jobs: if: ${{ github.event.workflow_run.conclusion == 'success' }} runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false timeout-minutes: 25 steps: - name: Note on ECR propagation diff --git a/.gitea/workflows/redeploy-tenants-on-staging.yml b/.gitea/workflows/redeploy-tenants-on-staging.yml index 40c4894d..94f0e183 100644 --- a/.gitea/workflows/redeploy-tenants-on-staging.yml +++ b/.gitea/workflows/redeploy-tenants-on-staging.yml @@ -8,7 +8,7 @@ name: redeploy-tenants-on-staging # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # - ~~**Gitea workflow_run trigger limitation**~~ FIXED: replaced with # push+paths filter per this PR. Gitea 1.22.6 does not support # `workflow_run` (task #81). The push trigger fires on every @@ -76,7 +76,7 @@ jobs: redeploy: runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false timeout-minutes: 25 steps: - name: Wait for GHCR tag propagation diff --git a/.gitea/workflows/review-check-tests.yml b/.gitea/workflows/review-check-tests.yml index df57aad5..bcaafbe5 100644 --- a/.gitea/workflows/review-check-tests.yml +++ b/.gitea/workflows/review-check-tests.yml @@ -53,7 +53,7 @@ jobs: # runners with internet access to package mirrors). Falls back to GitHub # binary download. 
GitHub releases may be blocked on some runner networks # (infra#241 follow-up). - continue-on-error: true + continue-on-error: false run: | if apt-get update -qq && apt-get install -y -qq jq; then echo "::notice::jq installed via apt-get: $(jq --version)" diff --git a/.gitea/workflows/runtime-pin-compat.yml b/.gitea/workflows/runtime-pin-compat.yml index 6fe493d1..6896d90f 100644 --- a/.gitea/workflows/runtime-pin-compat.yml +++ b/.gitea/workflows/runtime-pin-compat.yml @@ -12,7 +12,7 @@ name: Runtime Pin Compatibility # - on.paths references .gitea/workflows/runtime-pin-compat.yml (this # file) instead of the .github/ one. # - Workflow-level env.GITHUB_SERVER_URL set. -# - `continue-on-error: true` on the job (RFC §1 contract). +# - `continue-on-error: false` on the job (RFC §1 contract). # # CI gate that prevents the 5-hour staging outage from 2026-04-24 from # recurring (controlplane#253). The original failure mode: @@ -67,7 +67,7 @@ jobs: # Phase 3 (RFC #219 §1): surface broken workflows without blocking # the PR. Follow-up PR flips this off after surfaced defects are # triaged. - continue-on-error: true + continue-on-error: false steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 diff --git a/.gitea/workflows/runtime-prbuild-compat.yml b/.gitea/workflows/runtime-prbuild-compat.yml index 71145434..f9e6f7d6 100644 --- a/.gitea/workflows/runtime-prbuild-compat.yml +++ b/.gitea/workflows/runtime-prbuild-compat.yml @@ -11,7 +11,7 @@ name: Runtime PR-Built Compatibility # pattern for ci.yml port). # - on.paths references .gitea/workflows/runtime-prbuild-compat.yml. # - Workflow-level env.GITHUB_SERVER_URL set. -# - `continue-on-error: true` on every job (RFC §1 contract). +# - `continue-on-error: false` on every job (RFC §1 contract). # # Companion to `runtime-pin-compat.yml`. 
That workflow tests what's # CURRENTLY PUBLISHED on PyPI; this workflow tests what WOULD BE @@ -52,7 +52,7 @@ jobs: detect-changes: runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false outputs: wheel: ${{ steps.decide.outputs.wheel }} steps: @@ -96,7 +96,7 @@ jobs: name: PR-built wheel + import smoke runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false steps: - name: No-op pass (paths filter excluded this commit) if: needs.detect-changes.outputs.wheel != 'true' diff --git a/.gitea/workflows/secret-pattern-drift.yml b/.gitea/workflows/secret-pattern-drift.yml index a2520b54..d31d63b5 100644 --- a/.gitea/workflows/secret-pattern-drift.yml +++ b/.gitea/workflows/secret-pattern-drift.yml @@ -9,7 +9,7 @@ name: SECRET_PATTERNS drift lint # - CANONICAL_FILE inside scripts/lint_secret_pattern_drift.py was # updated in the same Cat C-1 PR to point at .gitea/workflows/secret-scan.yml. # - Workflow-level env.GITHUB_SERVER_URL set. -# - `continue-on-error: true` on the job (RFC §1 contract). +# - `continue-on-error: false` on the job (RFC §1 contract). # # Detects when the canonical SECRET_PATTERNS array in # .gitea/workflows/secret-scan.yml diverges from known consumer @@ -57,7 +57,7 @@ jobs: name: Detect SECRET_PATTERNS drift runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false timeout-minutes: 5 steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/sop-tier-check.yml b/.gitea/workflows/sop-tier-check.yml index d3f7aefb..2115a771 100644 --- a/.gitea/workflows/sop-tier-check.yml +++ b/.gitea/workflows/sop-tier-check.yml @@ -32,10 +32,10 @@ # for PRs in-flight when AND-composition deployed. # Burn-in: remove after 2026-05-17 (7-day window). 
# -# BURN-IN NOTE (internal#189 Phase 1): continue-on-error: true is set on +# BURN-IN NOTE (internal#189 Phase 1): continue-on-error: false is set on # the tier-check job below. This prevents AND-composition from blocking # PRs during the 7-day burn-in. After 2026-05-17: -# 1. Remove `continue-on-error: true` from this job block. +# 1. Remove `continue-on-error: false` from this job block. # 2. Update this BURN-IN NOTE comment to mark the window closed. name: sop-tier-check @@ -65,7 +65,7 @@ jobs: runs-on: ubuntu-latest # BURN-IN: continue-on-error prevents AND-composition from blocking # PRs during the 7-day window. Remove after 2026-05-17 (internal#189). - continue-on-error: true + continue-on-error: false permissions: contents: read pull-requests: read @@ -87,9 +87,9 @@ jobs: # GitHub releases may be unreachable from some runner networks # (infra#241 follow-up: GitHub timeout after 3s on 5.78.80.188 # runners). The sop-tier-check script has its own fallback as a - # third line of defense. continue-on-error: true ensures this step + # third line of defense. continue-on-error: false ensures this step # failing does not block the job. - continue-on-error: true + continue-on-error: false run: | # apt-get is the primary method — Ubuntu package mirrors are reliably # reachable from runner containers. GitHub releases may be blocked @@ -106,10 +106,10 @@ jobs: jq --version 2>/dev/null || echo "::notice::jq not yet available — script fallback will retry" - name: Verify tier label + reviewer team membership - # continue-on-error: true at step level — job-level is ignored by Gitea + # continue-on-error: false at step level — job-level is ignored by Gitea # Actions (quirk #10, internal runbooks). Belt-and-suspenders with # SOP_FAIL_OPEN=1 + || true below. 
- continue-on-error: true + continue-on-error: false env: GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }} GITEA_HOST: git.moleculesai.app @@ -119,7 +119,7 @@ jobs: SOP_DEBUG: '0' SOP_LEGACY_CHECK: '0' # SOP_FAIL_OPEN=1 makes the script always exit 0. The UI enforces - # the actual merge gate. Combined with continue-on-error: true + # the actual merge gate. Combined with continue-on-error: false # above, this step never fails the job regardless of script exit. SOP_FAIL_OPEN: '1' run: | diff --git a/.gitea/workflows/staging-smoke.yml b/.gitea/workflows/staging-smoke.yml index 623c47ff..2805e984 100644 --- a/.gitea/workflows/staging-smoke.yml +++ b/.gitea/workflows/staging-smoke.yml @@ -10,7 +10,7 @@ name: Staging SaaS smoke (every 30 min) # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # # Minimum viable health check: provisions one Hermes workspace on a fresh @@ -52,7 +52,7 @@ jobs: smoke: name: Staging SaaS smoke runs-on: ubuntu-latest - # NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed + # NOTE: Phase 3 (RFC #219 §1) `continue-on-error: false` removed # 2026-05-11. The "surface broken workflows without blocking" # rationale was correctly applied to advisory/lint workflows but # wrong for this smoke — it is the 30-min canary cadence for the @@ -61,7 +61,7 @@ jobs: # drift, WorkOS session breakage, secret rotations). Same class of # failure as PR#461 (`sweep-stale-e2e-orgs`) where Phase-3 silent # failure leaked EC2. The four other `e2e-staging-*` workflows - # KEEP `continue-on-error: true` per RFC #219 §1 — they are + # KEEP `continue-on-error: false` per RFC #219 §1 — they are # advisory and matrix-style; this one is the canary. 
A follow-up # `notify-failure` step below also surfaces breakage to ops even # if branch-protection wiring is adjusted to keep this off the @@ -333,7 +333,7 @@ jobs: exit 0 - name: Notify on smoke failure - # Fail-loud companion to dropping `continue-on-error: true`. + # Fail-loud companion to dropping `continue-on-error: false`. # The Open-issue-on-failure step above handles the human-facing # alert; this step emits a clearly-tagged ::error:: line that # log-tail consumers (Loki SOPRefireRule, orchestrator triage diff --git a/.gitea/workflows/staging-verify.yml b/.gitea/workflows/staging-verify.yml index 7aeaadcd..06c182cb 100644 --- a/.gitea/workflows/staging-verify.yml +++ b/.gitea/workflows/staging-verify.yml @@ -10,7 +10,7 @@ name: Staging verify # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # - ~~**Gitea workflow_run trigger limitation**~~ FIXED: replaced with # push+paths filter per this PR. Gitea 1.22.6 does not support # `workflow_run` (task #81). The push trigger fires on every @@ -85,7 +85,7 @@ jobs: staging-smoke: runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false outputs: sha: ${{ steps.compute.outputs.sha }} smoke_ran: ${{ steps.smoke.outputs.ran }} @@ -205,7 +205,7 @@ jobs: if: ${{ needs.staging-smoke.result == 'success' && needs.staging-smoke.outputs.smoke_ran == 'true' }} runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. 
- continue-on-error: true + continue-on-error: false env: SHA: ${{ needs.staging-smoke.outputs.sha }} CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }} diff --git a/.gitea/workflows/sweep-aws-secrets.yml b/.gitea/workflows/sweep-aws-secrets.yml index 5544a7db..f53d5110 100644 --- a/.gitea/workflows/sweep-aws-secrets.yml +++ b/.gitea/workflows/sweep-aws-secrets.yml @@ -8,7 +8,7 @@ name: Sweep stale AWS Secrets Manager secrets # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # # Janitor for per-tenant AWS Secrets Manager secrets @@ -29,15 +29,11 @@ name: Sweep stale AWS Secrets Manager secrets # reconciler enumerator) is filed as a separate controlplane # issue. This sweeper is the immediate cost-relief stopgap. # -# AWS credentials: the confirmed Gitea secrets are AWS_ACCESS_KEY_ID / -# AWS_SECRET_ACCESS_KEY (the molecule-cp IAM user). These are the same -# credentials used by the rest of the platform. The dedicated -# AWS_JANITOR_* naming (which the original GitHub workflow used) was -# never populated in Gitea — the existing secrets are AWS_ACCESS_KEY_ID / -# AWS_SECRET_ACCESS_KEY (per issue #425 §425 audit). These DO have -# secretsmanager:ListSecrets (the production molecule-cp principal); -# if ListSecrets is revoked in future, a dedicated janitor principal -# would need to be created and the Gitea secret names updated here. +# AWS credentials: use the dedicated Secrets Manager janitor principal. +# Do not fall back to the molecule-cp application principal: it does +# not need account-wide ListSecrets, and a 2026-05-12 CI failure proved +# that using it here turns a least-privilege production credential into +# a red scheduled janitor. 
# # Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring # sweep-cf-orphans.yml — tenant secrets are durable by design, unlike @@ -65,7 +61,7 @@ jobs: name: Sweep AWS Secrets Manager runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false # 30 min cap, mirroring the other janitors. AWS DeleteSecret is # fast (~0.3s/call) so even a 100+ backlog drains in seconds # under the 8-way xargs parallelism, but the cap is set generously @@ -73,8 +69,8 @@ jobs: timeout-minutes: 30 env: AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_SECRETS_JANITOR_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRETS_JANITOR_SECRET_ACCESS_KEY }} CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }} CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }} diff --git a/.gitea/workflows/sweep-cf-orphans.yml b/.gitea/workflows/sweep-cf-orphans.yml index 28af2537..9500a9d8 100644 --- a/.gitea/workflows/sweep-cf-orphans.yml +++ b/.gitea/workflows/sweep-cf-orphans.yml @@ -8,7 +8,7 @@ name: Sweep stale Cloudflare DNS records # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # # Janitor for Cloudflare DNS records whose backing tenant/workspace no @@ -71,7 +71,7 @@ jobs: name: Sweep CF orphans runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. 
- continue-on-error: true + continue-on-error: false # 3 min surfaces hangs (CF API stall, AWS describe-instances stuck) # within one cron interval instead of burning a full tick. Realistic # worst case is ~2 min: 4 sequential curls + 1 aws + N×CF-DELETE diff --git a/.gitea/workflows/sweep-cf-tunnels.yml b/.gitea/workflows/sweep-cf-tunnels.yml index d1828ab2..3a476c20 100644 --- a/.gitea/workflows/sweep-cf-tunnels.yml +++ b/.gitea/workflows/sweep-cf-tunnels.yml @@ -8,7 +8,7 @@ name: Sweep stale Cloudflare Tunnels # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). # # Janitor for Cloudflare Tunnels whose backing tenant no longer @@ -55,7 +55,7 @@ jobs: name: Sweep CF tunnels runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. - continue-on-error: true + continue-on-error: false # 30 min cap. Was 5 min on the theory that the only thing that # could take >5min is a CF-API hang — but on 2026-05-02 a backlog # of 672 stale tunnels accumulated (large staging E2E run + delayed diff --git a/.gitea/workflows/sweep-stale-e2e-orgs.yml b/.gitea/workflows/sweep-stale-e2e-orgs.yml index 8ba68fba..2b37b0f8 100644 --- a/.gitea/workflows/sweep-stale-e2e-orgs.yml +++ b/.gitea/workflows/sweep-stale-e2e-orgs.yml @@ -8,7 +8,7 @@ name: Sweep stale e2e-* orgs (staging) # - Dropped `environment:` blocks (Gitea has no environments). # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. -# - `continue-on-error: true` on each job (RFC §1 contract). +# - `continue-on-error: false` on each job (RFC §1 contract). 
# # Janitor for staging tenants left behind when E2E cleanup didn't run: @@ -63,7 +63,7 @@ jobs: sweep: name: Sweep e2e orgs runs-on: ubuntu-latest - # NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed + # NOTE: Phase 3 (RFC #219 §1) `continue-on-error: false` removed # 2026-05-11. The "surface broken workflows without blocking" # rationale was correctly applied to advisory/lint workflows but # wrong for this janitor — silent failure here masks real-money @@ -253,7 +253,7 @@ jobs: echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s) AND triggered orphan-tunnels cleanup. Re-run with dry_run=false to actually delete." - name: Notify on sweep failure - # Fail-loud companion to dropping `continue-on-error: true`. + # Fail-loud companion to dropping `continue-on-error: false`. # If any prior step failed (missing token, CP 5xx, safety-cap # tripped, etc.) emit a clearly-tagged ::error:: line so the # Gitea runs UI + any log-tail consumer (Loki SOPRefireRule) diff --git a/.gitea/workflows/test-ops-scripts.yml b/.gitea/workflows/test-ops-scripts.yml index 1a676deb..88de906c 100644 --- a/.gitea/workflows/test-ops-scripts.yml +++ b/.gitea/workflows/test-ops-scripts.yml @@ -8,7 +8,7 @@ name: Ops Scripts Tests # - on.paths references .gitea/workflows/test-ops-scripts.yml (this # file) instead of the .github/ one. # - Workflow-level env.GITHUB_SERVER_URL set. -# - `continue-on-error: true` on the job (RFC §1 contract). +# - `continue-on-error: false` on the job (RFC §1 contract). # # Runs the unittest suite for scripts/ on every PR + push that touches # anything under scripts/. Kept separate from the main CI so a script-only @@ -46,7 +46,7 @@ jobs: name: Ops scripts (unittest) runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. 
- continue-on-error: true + continue-on-error: false steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 diff --git a/.gitea/workflows/weekly-platform-go.yml b/.gitea/workflows/weekly-platform-go.yml index 09ba7d8e..f86ec8cb 100644 --- a/.gitea/workflows/weekly-platform-go.yml +++ b/.gitea/workflows/weekly-platform-go.yml @@ -11,7 +11,7 @@ name: Weekly Platform-Go Surface # # This workflow runs the full suite (build, vet, golangci-lint, tests with # coverage) every Monday at 04:17 UTC. Results are posted as commit statuses -# but continue-on-error: true means they never block anything — they're +# but continue-on-error: false means they never block anything — they're # purely a noise-reduction signal for when the next workspace-server push # lands and would otherwise trigger the first real suite run. # @@ -31,7 +31,7 @@ jobs: name: Weekly Platform-Go Surface runs-on: ubuntu-latest # continue-on-error: surface only, never block - continue-on-error: true + continue-on-error: false defaults: run: working-directory: workspace-server diff --git a/scripts/ops/sweep-aws-secrets.sh b/scripts/ops/sweep-aws-secrets.sh index 20450026..3acd0bbf 100755 --- a/scripts/ops/sweep-aws-secrets.sh +++ b/scripts/ops/sweep-aws-secrets.sh @@ -239,9 +239,9 @@ for s in d.get("SecretList", []): # --- Summarize + safety gate ---------------------------------------------- -DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))") +DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))") KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT)) -TENANT_SECRETS=$(echo "$DECISIONS" | python3 -c " +TENANT_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c " import json, sys n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 
'not-a-tenant-secret') print(n) @@ -256,7 +256,7 @@ log " would keep: $KEEP_COUNT" log "" # Per-reason breakdown of deletes + keep-categories worth seeing -echo "$DECISIONS" | python3 -c " +printf '%s' "$DECISIONS" | python3 -c " import json,sys,collections delete_c = collections.Counter() keep_c = collections.Counter() @@ -291,7 +291,7 @@ if [ "$DRY_RUN" = "1" ]; then log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT secrets." log "" log "First 20 secrets that would be deleted:" - echo "$DECISIONS" | python3 -c " + printf '%s' "$DECISIONS" | python3 -c " import json, sys shown = 0 for l in sys.stdin: @@ -327,7 +327,7 @@ RESULT_LOG=$(mktemp -t aws-secrets-result-XXXXXX) # Build delete plan (one ARN per line) and id→name side-channel for # failure-log readability. Use ARN rather than Name on the delete # call because Name is mutable; ARN is the stable identifier. -echo "$DECISIONS" | python3 -c ' +printf '%s' "$DECISIONS" | python3 -c ' import json, sys plan_path = sys.argv[1] map_path = sys.argv[2] diff --git a/scripts/ops/sweep-cf-tunnels.sh b/scripts/ops/sweep-cf-tunnels.sh index 13734db3..063b989a 100755 --- a/scripts/ops/sweep-cf-tunnels.sh +++ b/scripts/ops/sweep-cf-tunnels.sh @@ -195,9 +195,9 @@ for t in d.get("result", []): # --- Summarize + safety gate ---------------------------------------------- -DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))") +DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))") KEEP_COUNT=$((TOTAL_TUNNELS - DELETE_COUNT)) -TENANT_TUNNELS=$(echo "$DECISIONS" | python3 -c " +TENANT_TUNNELS=$(printf '%s' "$DECISIONS" | python3 -c " import json, sys n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-tunnel') print(n) @@ -212,7 +212,7 @@ log " would keep: $KEEP_COUNT" log "" # Per-reason breakdown of deletes -echo 
"$DECISIONS" | python3 -c " +printf '%s' "$DECISIONS" | python3 -c " import json,sys,collections c = collections.Counter() for l in sys.stdin: @@ -242,7 +242,7 @@ if [ "$DRY_RUN" = "1" ]; then log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT tunnels." log "" log "First 20 tunnels that would be deleted:" - echo "$DECISIONS" | python3 -c " + printf '%s' "$DECISIONS" | python3 -c " import json, sys shown = 0 for l in sys.stdin: @@ -283,7 +283,7 @@ RESULT_LOG=$(mktemp -t cf-tunnels-result-XXXXXX) # Build delete plan (just ids, one per line) and the side-channel # id→name map (tab-separated). -echo "$DECISIONS" | python3 -c ' +printf '%s' "$DECISIONS" | python3 -c ' import json, os, sys plan_path = sys.argv[1] map_path = sys.argv[2]