From 3ec707aecee0d50fd6c97db07fa46ec3cdb07c4e Mon Sep 17 00:00:00 2001 From: hongming-codex-laptop Date: Tue, 12 May 2026 16:10:53 -0700 Subject: [PATCH] fix(ci): repair scheduled main janitors and track masks --- .gitea/workflows/block-internal-paths.yml | 1 + .gitea/workflows/cascade-list-drift-gate.yml | 1 + .../workflows/check-migration-collisions.yml | 1 + .gitea/workflows/ci.yml | 3 ++ .gitea/workflows/continuous-synth-e2e.yml | 1 + .gitea/workflows/e2e-api.yml | 2 ++ .gitea/workflows/e2e-staging-canvas.yml | 2 ++ .gitea/workflows/e2e-staging-external.yml | 1 + .gitea/workflows/e2e-staging-saas.yml | 4 +++ .gitea/workflows/e2e-staging-sanity.yml | 1 + .gitea/workflows/gate-check-v3.yml | 34 ++++++++++++------- .../handlers-postgres-integration.yml | 2 ++ .gitea/workflows/harness-replays.yml | 2 ++ .../lint-continue-on-error-tracking.yml | 1 + .gitea/workflows/lint-curl-status-capture.yml | 1 + .gitea/workflows/lint-mask-pr-atomicity.yml | 1 + .gitea/workflows/lint-workflow-yaml.yml | 1 + .gitea/workflows/publish-canvas-image.yml | 1 + .gitea/workflows/publish-runtime-autobump.yml | 1 + .gitea/workflows/railway-pin-audit.yml | 1 + .gitea/workflows/redeploy-tenants-on-main.yml | 1 + .../workflows/redeploy-tenants-on-staging.yml | 1 + .gitea/workflows/review-check-tests.yml | 1 + .gitea/workflows/runtime-pin-compat.yml | 1 + .gitea/workflows/runtime-prbuild-compat.yml | 2 ++ .gitea/workflows/secret-pattern-drift.yml | 1 + .gitea/workflows/sop-tier-check.yml | 3 ++ .gitea/workflows/staging-verify.yml | 2 ++ .gitea/workflows/sweep-aws-secrets.yml | 19 +++++------ .gitea/workflows/sweep-cf-orphans.yml | 1 + .gitea/workflows/sweep-cf-tunnels.yml | 1 + .gitea/workflows/test-ops-scripts.yml | 1 + .gitea/workflows/weekly-platform-go.yml | 1 + scripts/ops/sweep-aws-secrets.sh | 10 +++--- scripts/ops/sweep-cf-tunnels.sh | 10 +++--- 35 files changed, 83 insertions(+), 34 deletions(-) diff --git a/.gitea/workflows/block-internal-paths.yml b/.gitea/workflows/block-internal-paths.yml index ed60e7e4..80ffdc41 100644 --- a/.gitea/workflows/block-internal-paths.yml +++ b/.gitea/workflows/block-internal-paths.yml @@ -37,6 +37,7 @@ jobs: # Phase 3 (RFC #219 §1): surface broken workflows without blocking # the PR. Follow-up PR flips this off after surfaced defects are # triaged. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/cascade-list-drift-gate.yml b/.gitea/workflows/cascade-list-drift-gate.yml index 99b8e8bb..929ae121 100644 --- a/.gitea/workflows/cascade-list-drift-gate.yml +++ b/.gitea/workflows/cascade-list-drift-gate.yml @@ -48,6 +48,7 @@ jobs: # Phase 3 (RFC #219 §1): surface broken workflows without blocking # the PR. Follow-up PR flips this off after surfaced defects are # triaged. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 diff --git a/.gitea/workflows/check-migration-collisions.yml b/.gitea/workflows/check-migration-collisions.yml index e2aed7f5..dc9970cc 100644 --- a/.gitea/workflows/check-migration-collisions.yml +++ b/.gitea/workflows/check-migration-collisions.yml @@ -45,6 +45,7 @@ jobs: # Phase 3 (RFC #219 §1): surface broken workflows without blocking # the PR. Follow-up PR flips this off after surfaced defects are # triaged. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 5 steps: diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 52f65a3b..41b8ceb6 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -148,6 +148,7 @@ jobs: # a permanent re-mask. Re-flip blocked on mc#664 fix-forward landing. # Other 4 #656 flips (changes, canvas-build, shellcheck, python-lint) # retain continue-on-error: false; only platform-build regresses. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true # mc#664 fix-forward in flight; re-flip when mc#664 lands (PR #669 → rebase after #709) defaults: run: @@ -186,6 +187,7 @@ jobs: echo "::group::pendinguploads exit=$pu_exit (last 100 lines)" tail -100 /tmp/test-pu.log echo "::endgroup::" + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true - if: needs.changes.outputs.platform == 'true' name: Run tests with race detection and coverage @@ -372,6 +374,7 @@ jobs: canvas-deploy-reminder: name: Canvas Deploy Reminder runs-on: ubuntu-latest + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true needs: [changes, canvas-build] # Only fires on direct pushes to main (i.e. after staging→main promotion). diff --git a/.gitea/workflows/continuous-synth-e2e.yml b/.gitea/workflows/continuous-synth-e2e.yml index 6b3c72b6..37b9a78d 100644 --- a/.gitea/workflows/continuous-synth-e2e.yml +++ b/.gitea/workflows/continuous-synth-e2e.yml @@ -90,6 +90,7 @@ jobs: name: Synthetic E2E against staging runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true # Bumped from 12 → 20 (2026-05-04). Tenant user-data install phase # (apt-get update + install docker.io/jq/awscli/caddy + snap install diff --git a/.gitea/workflows/e2e-api.yml b/.gitea/workflows/e2e-api.yml index 6f82e080..2d25a91a 100644 --- a/.gitea/workflows/e2e-api.yml +++ b/.gitea/workflows/e2e-api.yml @@ -103,6 +103,7 @@ jobs: detect-changes: runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true outputs: api: ${{ steps.decide.outputs.api }} @@ -154,6 +155,7 @@ jobs: name: E2E API Smoke Test runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 15 env: diff --git a/.gitea/workflows/e2e-staging-canvas.yml b/.gitea/workflows/e2e-staging-canvas.yml index 9b4f1475..02bad3b1 100644 --- a/.gitea/workflows/e2e-staging-canvas.yml +++ b/.gitea/workflows/e2e-staging-canvas.yml @@ -70,6 +70,7 @@ jobs: detect-changes: runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true outputs: canvas: ${{ steps.decide.outputs.canvas }} @@ -118,6 +119,7 @@ jobs: name: Canvas tabs E2E runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 40 diff --git a/.gitea/workflows/e2e-staging-external.yml b/.gitea/workflows/e2e-staging-external.yml index 6c4e4b91..1e28be30 100644 --- a/.gitea/workflows/e2e-staging-external.yml +++ b/.gitea/workflows/e2e-staging-external.yml @@ -84,6 +84,7 @@ jobs: name: E2E Staging External Runtime runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 25 diff --git a/.gitea/workflows/e2e-staging-saas.yml b/.gitea/workflows/e2e-staging-saas.yml index 306e561d..b180d167 100644 --- a/.gitea/workflows/e2e-staging-saas.yml +++ b/.gitea/workflows/e2e-staging-saas.yml @@ -88,17 +88,20 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 1 + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.11" + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true - name: YAML validation (best-effort) run: | echo "e2e-staging-saas.yml — PR validation: workflow YAML is valid." echo "E2E step runs only when provisioning-critical files change." + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true # Actual E2E: runs on trunk pushes (main + staging). NOT the PR-fire-only @@ -109,6 +112,7 @@ jobs: # Only runs on trunk pushes. PR paths get pr-validate instead. if: github.event.pull_request.base.ref == '' # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 45 permissions: diff --git a/.gitea/workflows/e2e-staging-sanity.yml b/.gitea/workflows/e2e-staging-sanity.yml index bf878a88..8077da76 100644 --- a/.gitea/workflows/e2e-staging-sanity.yml +++ b/.gitea/workflows/e2e-staging-sanity.yml @@ -37,6 +37,7 @@ jobs: name: Intentional-failure teardown sanity runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 20 diff --git a/.gitea/workflows/gate-check-v3.yml b/.gitea/workflows/gate-check-v3.yml index aaa37153..f2e2c959 100644 --- a/.gitea/workflows/gate-check-v3.yml +++ b/.gitea/workflows/gate-check-v3.yml @@ -46,6 +46,7 @@ env: jobs: gate-check: runs-on: ubuntu-latest + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true # Never block on our own detector failing steps: - name: Check out BASE ref (never PR-head under pull_request_target) @@ -76,25 +77,32 @@ jobs: if: github.event_name == 'schedule' env: GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} run: | set -euo pipefail # Fetch all open PRs and run gate-check on each # socket.setdefaulttimeout(15): defence-in-depth for missing SOP_TIER_CHECK_TOKEN. # gate_check.py uses timeout=15 on every urlopen call; this catches the # inline Python polling loop too (issue #603). - pr_numbers=$(python3 -c " - import socket, urllib.request, json, os - socket.setdefaulttimeout(15) - token = os.environ['GITEA_TOKEN'] - req = urllib.request.Request( - 'https://git.moleculesai.app/api/v1/repos/${{ github.repository }}/pulls?state=open&limit=100', - headers={'Authorization': f'token {token}', 'Accept': 'application/json'} - ) - with urllib.request.urlopen(req) as r: - prs = json.loads(r.read()) - for pr in prs: - print(pr['number']) - ") + pr_numbers=$(python3 <<'PY' + import json + import os + import socket + import urllib.request + + socket.setdefaulttimeout(15) + token = os.environ["GITEA_TOKEN"] + repo = os.environ["REPO"] + req = urllib.request.Request( + f"https://git.moleculesai.app/api/v1/repos/{repo}/pulls?state=open&limit=100", + headers={"Authorization": f"token {token}", "Accept": "application/json"}, + ) + with urllib.request.urlopen(req) as r: + prs = json.loads(r.read()) + for pr in prs: + print(pr["number"]) + PY + ) for pr in $pr_numbers; do echo "Checking PR #$pr..." python3 tools/gate-check-v3/gate_check.py \ diff --git a/.gitea/workflows/handlers-postgres-integration.yml b/.gitea/workflows/handlers-postgres-integration.yml index fcebdde1..db772fe9 100644 --- a/.gitea/workflows/handlers-postgres-integration.yml +++ b/.gitea/workflows/handlers-postgres-integration.yml @@ -79,6 +79,7 @@ jobs: name: detect-changes runs-on: ubuntu-latest # internal#219 Phase 3 (RFC §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true outputs: handlers: ${{ steps.filter.outputs.handlers }} @@ -119,6 +120,7 @@ jobs: needs: detect-changes runs-on: ubuntu-latest # internal#219 Phase 3 (RFC §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true env: # Unique name per run so concurrent jobs don't collide on the diff --git a/.gitea/workflows/harness-replays.yml b/.gitea/workflows/harness-replays.yml index f83d03b1..5925adb5 100644 --- a/.gitea/workflows/harness-replays.yml +++ b/.gitea/workflows/harness-replays.yml @@ -63,6 +63,7 @@ jobs: detect-changes: runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true outputs: run: ${{ steps.decide.outputs.run }} @@ -154,6 +155,7 @@ jobs: name: Harness Replays runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 30 steps: diff --git a/.gitea/workflows/lint-continue-on-error-tracking.yml b/.gitea/workflows/lint-continue-on-error-tracking.yml index cd3a59a0..f20b7f4f 100644 --- a/.gitea/workflows/lint-continue-on-error-tracking.yml +++ b/.gitea/workflows/lint-continue-on-error-tracking.yml @@ -97,6 +97,7 @@ jobs: # PRs. Pre-existing continue-on-error: true directives on main # all violate this lint at first — intentional. Flip to false # follow-up after main is clean for 3 days. internal#350. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true # internal#350 Phase 3 mask — 14d forced-renewal cadence steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/lint-curl-status-capture.yml b/.gitea/workflows/lint-curl-status-capture.yml index 99f3f4c0..620fbfd1 100644 --- a/.gitea/workflows/lint-curl-status-capture.yml +++ b/.gitea/workflows/lint-curl-status-capture.yml @@ -45,6 +45,7 @@ jobs: # Phase 3 (RFC #219 §1): surface broken workflows without blocking # the PR. Follow-up PR flips this off after surfaced defects are # triaged. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/lint-mask-pr-atomicity.yml b/.gitea/workflows/lint-mask-pr-atomicity.yml index 2aa58388..c2ab0dd0 100644 --- a/.gitea/workflows/lint-mask-pr-atomicity.yml +++ b/.gitea/workflows/lint-mask-pr-atomicity.yml @@ -92,6 +92,7 @@ jobs: # PRs. Follow-up PR flips this to `false` once recent runs on main # are confirmed clean (eat-our-own-dogfood discipline mirrors # PR#673's same-shape comment). Tracking: internal#350. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true steps: - name: Check out PR head with full history (need base SHA blobs) diff --git a/.gitea/workflows/lint-workflow-yaml.yml b/.gitea/workflows/lint-workflow-yaml.yml index 1b2b7120..3d71875b 100644 --- a/.gitea/workflows/lint-workflow-yaml.yml +++ b/.gitea/workflows/lint-workflow-yaml.yml @@ -55,6 +55,7 @@ jobs: # Phase 3 (RFC #219 §1): surface broken shapes without blocking PRs. # Follow-up PR flips this off after the 4 existing-on-main rule-2 # (workflow_run) violations are migrated to a supported trigger. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/publish-canvas-image.yml b/.gitea/workflows/publish-canvas-image.yml index 0438c33d..e9b30803 100644 --- a/.gitea/workflows/publish-canvas-image.yml +++ b/.gitea/workflows/publish-canvas-image.yml @@ -62,6 +62,7 @@ jobs: # See issue #576 + infra-lead pulse ~00:30Z. runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true steps: - name: Checkout diff --git a/.gitea/workflows/publish-runtime-autobump.yml b/.gitea/workflows/publish-runtime-autobump.yml index e807c9fb..1452fd81 100644 --- a/.gitea/workflows/publish-runtime-autobump.yml +++ b/.gitea/workflows/publish-runtime-autobump.yml @@ -55,6 +55,7 @@ jobs: # The actual bump work happens on the main/staging push after merge. pr-validate: runs-on: ubuntu-latest + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true # do not block PR merge on operational failures steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/railway-pin-audit.yml b/.gitea/workflows/railway-pin-audit.yml index 58f4809e..cb1c56c4 100644 --- a/.gitea/workflows/railway-pin-audit.yml +++ b/.gitea/workflows/railway-pin-audit.yml @@ -51,6 +51,7 @@ jobs: name: Audit Railway env vars for drift-prone pins runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 10 diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml index 6cd8f8a3..1dcfced5 100644 --- a/.gitea/workflows/redeploy-tenants-on-main.yml +++ b/.gitea/workflows/redeploy-tenants-on-main.yml @@ -86,6 +86,7 @@ jobs: if: ${{ github.event.workflow_run.conclusion == 'success' }} runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 25 steps: diff --git a/.gitea/workflows/redeploy-tenants-on-staging.yml b/.gitea/workflows/redeploy-tenants-on-staging.yml index 40c4894d..35c1a979 100644 --- a/.gitea/workflows/redeploy-tenants-on-staging.yml +++ b/.gitea/workflows/redeploy-tenants-on-staging.yml @@ -76,6 +76,7 @@ jobs: redeploy: runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 25 steps: diff --git a/.gitea/workflows/review-check-tests.yml b/.gitea/workflows/review-check-tests.yml index df57aad5..1030a2c5 100644 --- a/.gitea/workflows/review-check-tests.yml +++ b/.gitea/workflows/review-check-tests.yml @@ -53,6 +53,7 @@ jobs: # runners with internet access to package mirrors). Falls back to GitHub # binary download. GitHub releases may be blocked on some runner networks # (infra#241 follow-up). + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true run: | if apt-get update -qq && apt-get install -y -qq jq; then diff --git a/.gitea/workflows/runtime-pin-compat.yml b/.gitea/workflows/runtime-pin-compat.yml index 6fe493d1..00ab6bc0 100644 --- a/.gitea/workflows/runtime-pin-compat.yml +++ b/.gitea/workflows/runtime-pin-compat.yml @@ -67,6 +67,7 @@ jobs: # Phase 3 (RFC #219 §1): surface broken workflows without blocking # the PR. Follow-up PR flips this off after surfaced defects are # triaged. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/runtime-prbuild-compat.yml b/.gitea/workflows/runtime-prbuild-compat.yml index 71145434..6df67131 100644 --- a/.gitea/workflows/runtime-prbuild-compat.yml +++ b/.gitea/workflows/runtime-prbuild-compat.yml @@ -52,6 +52,7 @@ jobs: detect-changes: runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true outputs: wheel: ${{ steps.decide.outputs.wheel }} @@ -96,6 +97,7 @@ jobs: name: PR-built wheel + import smoke runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true steps: - name: No-op pass (paths filter excluded this commit) diff --git a/.gitea/workflows/secret-pattern-drift.yml b/.gitea/workflows/secret-pattern-drift.yml index a2520b54..b3430785 100644 --- a/.gitea/workflows/secret-pattern-drift.yml +++ b/.gitea/workflows/secret-pattern-drift.yml @@ -57,6 +57,7 @@ jobs: name: Detect SECRET_PATTERNS drift runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 5 steps: diff --git a/.gitea/workflows/sop-tier-check.yml b/.gitea/workflows/sop-tier-check.yml index d3f7aefb..427e479a 100644 --- a/.gitea/workflows/sop-tier-check.yml +++ b/.gitea/workflows/sop-tier-check.yml @@ -65,6 +65,7 @@ jobs: runs-on: ubuntu-latest # BURN-IN: continue-on-error prevents AND-composition from blocking # PRs during the 7-day window. Remove after 2026-05-17 (internal#189). + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true permissions: contents: read @@ -89,6 +90,7 @@ jobs: # runners). The sop-tier-check script has its own fallback as a # third line of defense. continue-on-error: true ensures this step # failing does not block the job. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true run: | # apt-get is the primary method — Ubuntu package mirrors are reliably @@ -109,6 +111,7 @@ jobs: # continue-on-error: true at step level — job-level is ignored by Gitea # Actions (quirk #10, internal runbooks). Belt-and-suspenders with # SOP_FAIL_OPEN=1 + || true below. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true env: GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }} diff --git a/.gitea/workflows/staging-verify.yml b/.gitea/workflows/staging-verify.yml index 7aeaadcd..42ea3e84 100644 --- a/.gitea/workflows/staging-verify.yml +++ b/.gitea/workflows/staging-verify.yml @@ -85,6 +85,7 @@ jobs: staging-smoke: runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true outputs: sha: ${{ steps.compute.outputs.sha }} @@ -205,6 +206,7 @@ jobs: if: ${{ needs.staging-smoke.result == 'success' && needs.staging-smoke.outputs.smoke_ran == 'true' }} runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true env: SHA: ${{ needs.staging-smoke.outputs.sha }} diff --git a/.gitea/workflows/sweep-aws-secrets.yml b/.gitea/workflows/sweep-aws-secrets.yml index 5544a7db..ebdf626f 100644 --- a/.gitea/workflows/sweep-aws-secrets.yml +++ b/.gitea/workflows/sweep-aws-secrets.yml @@ -29,15 +29,11 @@ name: Sweep stale AWS Secrets Manager secrets # reconciler enumerator) is filed as a separate controlplane # issue. This sweeper is the immediate cost-relief stopgap. # -# AWS credentials: the confirmed Gitea secrets are AWS_ACCESS_KEY_ID / -# AWS_SECRET_ACCESS_KEY (the molecule-cp IAM user). These are the same -# credentials used by the rest of the platform. The dedicated -# AWS_JANITOR_* naming (which the original GitHub workflow used) was -# never populated in Gitea — the existing secrets are AWS_ACCESS_KEY_ID / -# AWS_SECRET_ACCESS_KEY (per issue #425 §425 audit). These DO have -# secretsmanager:ListSecrets (the production molecule-cp principal); -# if ListSecrets is revoked in future, a dedicated janitor principal -# would need to be created and the Gitea secret names updated here. +# AWS credentials: use the dedicated Secrets Manager janitor principal. +# Do not fall back to the molecule-cp application principal: it does +# not need account-wide ListSecrets, and a 2026-05-12 CI failure proved +# that using it here turns a least-privilege production credential into +# a red scheduled janitor. # # Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring # sweep-cf-orphans.yml — tenant secrets are durable by design, unlike @@ -65,6 +61,7 @@ jobs: name: Sweep AWS Secrets Manager runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true # 30 min cap, mirroring the other janitors. AWS DeleteSecret is # fast (~0.3s/call) so even a 100+ backlog drains in seconds @@ -73,8 +70,8 @@ jobs: timeout-minutes: 30 env: AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_SECRETS_JANITOR_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRETS_JANITOR_SECRET_ACCESS_KEY }} CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }} CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }} diff --git a/.gitea/workflows/sweep-cf-orphans.yml b/.gitea/workflows/sweep-cf-orphans.yml index 28af2537..5d4e7ef6 100644 --- a/.gitea/workflows/sweep-cf-orphans.yml +++ b/.gitea/workflows/sweep-cf-orphans.yml @@ -71,6 +71,7 @@ jobs: name: Sweep CF orphans runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true # 3 min surfaces hangs (CF API stall, AWS describe-instances stuck) # within one cron interval instead of burning a full tick. Realistic diff --git a/.gitea/workflows/sweep-cf-tunnels.yml b/.gitea/workflows/sweep-cf-tunnels.yml index d1828ab2..fcc34ad9 100644 --- a/.gitea/workflows/sweep-cf-tunnels.yml +++ b/.gitea/workflows/sweep-cf-tunnels.yml @@ -55,6 +55,7 @@ jobs: name: Sweep CF tunnels runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true # 30 min cap. Was 5 min on the theory that the only thing that # could take >5min is a CF-API hang — but on 2026-05-02 a backlog diff --git a/.gitea/workflows/test-ops-scripts.yml b/.gitea/workflows/test-ops-scripts.yml index 1a676deb..af4699d4 100644 --- a/.gitea/workflows/test-ops-scripts.yml +++ b/.gitea/workflows/test-ops-scripts.yml @@ -46,6 +46,7 @@ jobs: name: Ops scripts (unittest) runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.gitea/workflows/weekly-platform-go.yml b/.gitea/workflows/weekly-platform-go.yml index 09ba7d8e..22507e38 100644 --- a/.gitea/workflows/weekly-platform-go.yml +++ b/.gitea/workflows/weekly-platform-go.yml @@ -31,6 +31,7 @@ jobs: name: Weekly Platform-Go Surface runs-on: ubuntu-latest # continue-on-error: surface only, never block + # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true defaults: run: diff --git a/scripts/ops/sweep-aws-secrets.sh b/scripts/ops/sweep-aws-secrets.sh index 20450026..3acd0bbf 100755 --- a/scripts/ops/sweep-aws-secrets.sh +++ b/scripts/ops/sweep-aws-secrets.sh @@ -239,9 +239,9 @@ for s in d.get("SecretList", []): # --- Summarize + safety gate ---------------------------------------------- -DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))") +DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))") KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT)) -TENANT_SECRETS=$(echo "$DECISIONS" | python3 -c " +TENANT_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c " import json, sys n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-secret') print(n) @@ -256,7 +256,7 @@ log " would keep: $KEEP_COUNT" log "" # Per-reason breakdown of deletes + keep-categories worth seeing -echo "$DECISIONS" | python3 -c " +printf '%s' "$DECISIONS" | python3 -c " import json,sys,collections delete_c = collections.Counter() keep_c = collections.Counter() @@ -291,7 +291,7 @@ if [ "$DRY_RUN" = "1" ]; then log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT secrets." log "" log "First 20 secrets that would be deleted:" - echo "$DECISIONS" | python3 -c " + printf '%s' "$DECISIONS" | python3 -c " import json, sys shown = 0 for l in sys.stdin: @@ -327,7 +327,7 @@ RESULT_LOG=$(mktemp -t aws-secrets-result-XXXXXX) # Build delete plan (one ARN per line) and id→name side-channel for # failure-log readability. Use ARN rather than Name on the delete # call because Name is mutable; ARN is the stable identifier. -echo "$DECISIONS" | python3 -c ' +printf '%s' "$DECISIONS" | python3 -c ' import json, sys plan_path = sys.argv[1] map_path = sys.argv[2] diff --git a/scripts/ops/sweep-cf-tunnels.sh b/scripts/ops/sweep-cf-tunnels.sh index 13734db3..063b989a 100755 --- a/scripts/ops/sweep-cf-tunnels.sh +++ b/scripts/ops/sweep-cf-tunnels.sh @@ -195,9 +195,9 @@ for t in d.get("result", []): # --- Summarize + safety gate ---------------------------------------------- -DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))") +DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))") KEEP_COUNT=$((TOTAL_TUNNELS - DELETE_COUNT)) -TENANT_TUNNELS=$(echo "$DECISIONS" | python3 -c " +TENANT_TUNNELS=$(printf '%s' "$DECISIONS" | python3 -c " import json, sys n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-tunnel') print(n) @@ -212,7 +212,7 @@ log " would keep: $KEEP_COUNT" log "" # Per-reason breakdown of deletes -echo "$DECISIONS" | python3 -c " +printf '%s' "$DECISIONS" | python3 -c " import json,sys,collections c = collections.Counter() for l in sys.stdin: @@ -242,7 +242,7 @@ if [ "$DRY_RUN" = "1" ]; then log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT tunnels." log "" log "First 20 tunnels that would be deleted:" - echo "$DECISIONS" | python3 -c " + printf '%s' "$DECISIONS" | python3 -c " import json, sys shown = 0 for l in sys.stdin: @@ -283,7 +283,7 @@ RESULT_LOG=$(mktemp -t cf-tunnels-result-XXXXXX) # Build delete plan (just ids, one per line) and the side-channel # id→name map (tab-separated). -echo "$DECISIONS" | python3 -c ' +printf '%s' "$DECISIONS" | python3 -c ' import json, os, sys plan_path = sys.argv[1] map_path = sys.argv[2]