fix(ci): repair scheduled main janitors and track masks

2026-05-12 16:10:53 -07:00 · 2026-05-12 16:10:53 -07:00 · 3ec707aece
commit 3ec707aece
parent 760e4eb806
35 changed files with 83 additions and 34 deletions
--- a/.gitea/workflows/block-internal-paths.yml
+++ b/.gitea/workflows/block-internal-paths.yml
@ -37,6 +37,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/cascade-list-drift-gate.yml
+++ b/.gitea/workflows/cascade-list-drift-gate.yml
@ -48,6 +48,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
--- a/.gitea/workflows/check-migration-collisions.yml
+++ b/.gitea/workflows/check-migration-collisions.yml
@ -45,6 +45,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 5
    steps:
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@ -148,6 +148,7 @@ jobs:
    # a permanent re-mask. Re-flip blocked on mc#664 fix-forward landing.
    # Other 4 #656 flips (changes, canvas-build, shellcheck, python-lint)
    # retain continue-on-error: false; only platform-build regresses.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true  # mc#664 fix-forward in flight; re-flip when mc#664 lands (PR #669 → rebase after #709)
    defaults:
      run:
@ -186,6 +187,7 @@ jobs:
          echo "::group::pendinguploads exit=$pu_exit (last 100 lines)"
          tail -100 /tmp/test-pu.log
          echo "::endgroup::"
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
      - if: needs.changes.outputs.platform == 'true'
        name: Run tests with race detection and coverage
@ -372,6 +374,7 @@ jobs:
  canvas-deploy-reminder:
    name: Canvas Deploy Reminder
    runs-on: ubuntu-latest
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    needs: [changes, canvas-build]
    # Only fires on direct pushes to main (i.e. after staging→main promotion).
--- a/.gitea/workflows/continuous-synth-e2e.yml
+++ b/.gitea/workflows/continuous-synth-e2e.yml
@ -90,6 +90,7 @@ jobs:
    name: Synthetic E2E against staging
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    # Bumped from 12 → 20 (2026-05-04). Tenant user-data install phase
    # (apt-get update + install docker.io/jq/awscli/caddy + snap install
--- a/.gitea/workflows/e2e-api.yml
+++ b/.gitea/workflows/e2e-api.yml
@ -103,6 +103,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      api: ${{ steps.decide.outputs.api }}
@ -154,6 +155,7 @@ jobs:
    name: E2E API Smoke Test
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 15
    env:
--- a/.gitea/workflows/e2e-staging-canvas.yml
+++ b/.gitea/workflows/e2e-staging-canvas.yml
@ -70,6 +70,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      canvas: ${{ steps.decide.outputs.canvas }}
@ -118,6 +119,7 @@ jobs:
    name: Canvas tabs E2E
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 40

--- a/.gitea/workflows/e2e-staging-external.yml
+++ b/.gitea/workflows/e2e-staging-external.yml
@ -84,6 +84,7 @@ jobs:
    name: E2E Staging External Runtime
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 25

--- a/.gitea/workflows/e2e-staging-saas.yml
+++ b/.gitea/workflows/e2e-staging-saas.yml
@ -88,17 +88,20 @@ jobs:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 1
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true

      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: "3.11"
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true

      - name: YAML validation (best-effort)
        run: |
          echo "e2e-staging-saas.yml — PR validation: workflow YAML is valid."
          echo "E2E step runs only when provisioning-critical files change."
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true

  # Actual E2E: runs on trunk pushes (main + staging). NOT the PR-fire-only
@ -109,6 +112,7 @@ jobs:
    # Only runs on trunk pushes. PR paths get pr-validate instead.
    if: github.event.pull_request.base.ref == ''
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 45
    permissions:
--- a/.gitea/workflows/e2e-staging-sanity.yml
+++ b/.gitea/workflows/e2e-staging-sanity.yml
@ -37,6 +37,7 @@ jobs:
    name: Intentional-failure teardown sanity
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 20

--- a/.gitea/workflows/gate-check-v3.yml
+++ b/.gitea/workflows/gate-check-v3.yml
@ -46,6 +46,7 @@ env:
 jobs:
  gate-check:
    runs-on: ubuntu-latest
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true  # Never block on our own detector failing
    steps:
      - name: Check out BASE ref (never PR-head under pull_request_target)
@ -76,25 +77,32 @@ jobs:
        if: github.event_name == 'schedule'
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
        run: |
          set -euo pipefail
          # Fetch all open PRs and run gate-check on each
          # socket.setdefaulttimeout(15): defence-in-depth for missing SOP_TIER_CHECK_TOKEN.
          # gate_check.py uses timeout=15 on every urlopen call; this catches the
          # inline Python polling loop too (issue #603).
-          pr_numbers=$(python3 -c "
-            import socket, urllib.request, json, os
-            socket.setdefaulttimeout(15)
-            token = os.environ['GITEA_TOKEN']
-            req = urllib.request.Request(
-                'https://git.moleculesai.app/api/v1/repos/${{ github.repository }}/pulls?state=open&limit=100',
-                headers={'Authorization': f'token {token}', 'Accept': 'application/json'}
-            )
-            with urllib.request.urlopen(req) as r:
-                prs = json.loads(r.read())
-            for pr in prs:
-                print(pr['number'])
-          ")
+          pr_numbers=$(python3 <<'PY'
+          import json
+          import os
+          import socket
+          import urllib.request
+
+          socket.setdefaulttimeout(15)
+          token = os.environ["GITEA_TOKEN"]
+          repo = os.environ["REPO"]
+          req = urllib.request.Request(
+              f"https://git.moleculesai.app/api/v1/repos/{repo}/pulls?state=open&limit=100",
+              headers={"Authorization": f"token {token}", "Accept": "application/json"},
+          )
+          with urllib.request.urlopen(req) as r:
+              prs = json.loads(r.read())
+          for pr in prs:
+              print(pr["number"])
+          PY
+          )
          for pr in $pr_numbers; do
            echo "Checking PR #$pr..."
            python3 tools/gate-check-v3/gate_check.py \
--- a/.gitea/workflows/handlers-postgres-integration.yml
+++ b/.gitea/workflows/handlers-postgres-integration.yml
@ -79,6 +79,7 @@ jobs:
    name: detect-changes
    runs-on: ubuntu-latest
    # internal#219 Phase 3 (RFC §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      handlers: ${{ steps.filter.outputs.handlers }}
@ -119,6 +120,7 @@ jobs:
    needs: detect-changes
    runs-on: ubuntu-latest
    # internal#219 Phase 3 (RFC §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    env:
      # Unique name per run so concurrent jobs don't collide on the
--- a/.gitea/workflows/harness-replays.yml
+++ b/.gitea/workflows/harness-replays.yml
@ -63,6 +63,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      run: ${{ steps.decide.outputs.run }}
@ -154,6 +155,7 @@ jobs:
    name: Harness Replays
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 30
    steps:
--- a/.gitea/workflows/lint-continue-on-error-tracking.yml
+++ b/.gitea/workflows/lint-continue-on-error-tracking.yml
@ -97,6 +97,7 @@ jobs:
    # PRs. Pre-existing continue-on-error: true directives on main
    # all violate this lint at first — intentional. Flip to false
    # follow-up after main is clean for 3 days. internal#350.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true  # internal#350 Phase 3 mask — 14d forced-renewal cadence
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
--- a/.gitea/workflows/lint-curl-status-capture.yml
+++ b/.gitea/workflows/lint-curl-status-capture.yml
@ -45,6 +45,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/lint-mask-pr-atomicity.yml
+++ b/.gitea/workflows/lint-mask-pr-atomicity.yml
@ -92,6 +92,7 @@ jobs:
    # PRs. Follow-up PR flips this to `false` once recent runs on main
    # are confirmed clean (eat-our-own-dogfood discipline mirrors
    # PR#673's same-shape comment). Tracking: internal#350.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - name: Check out PR head with full history (need base SHA blobs)
--- a/.gitea/workflows/lint-workflow-yaml.yml
+++ b/.gitea/workflows/lint-workflow-yaml.yml
@ -55,6 +55,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken shapes without blocking PRs.
    # Follow-up PR flips this off after the 4 existing-on-main rule-2
    # (workflow_run) violations are migrated to a supported trigger.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
--- a/.gitea/workflows/publish-canvas-image.yml
+++ b/.gitea/workflows/publish-canvas-image.yml
@ -62,6 +62,7 @@ jobs:
    # See issue #576 + infra-lead pulse ~00:30Z.
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - name: Checkout
--- a/.gitea/workflows/publish-runtime-autobump.yml
+++ b/.gitea/workflows/publish-runtime-autobump.yml
@ -55,6 +55,7 @@ jobs:
  # The actual bump work happens on the main/staging push after merge.
  pr-validate:
    runs-on: ubuntu-latest
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true  # do not block PR merge on operational failures
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/railway-pin-audit.yml
+++ b/.gitea/workflows/railway-pin-audit.yml
@ -51,6 +51,7 @@ jobs:
    name: Audit Railway env vars for drift-prone pins
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 10

--- a/.gitea/workflows/redeploy-tenants-on-main.yml
+++ b/.gitea/workflows/redeploy-tenants-on-main.yml
@ -86,6 +86,7 @@ jobs:
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 25
    steps:
--- a/.gitea/workflows/redeploy-tenants-on-staging.yml
+++ b/.gitea/workflows/redeploy-tenants-on-staging.yml
@ -76,6 +76,7 @@ jobs:
  redeploy:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 25
    steps:
--- a/.gitea/workflows/review-check-tests.yml
+++ b/.gitea/workflows/review-check-tests.yml
@ -53,6 +53,7 @@ jobs:
        # runners with internet access to package mirrors). Falls back to GitHub
        # binary download. GitHub releases may be blocked on some runner networks
        # (infra#241 follow-up).
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
        run: |
          if apt-get update -qq && apt-get install -y -qq jq; then
--- a/.gitea/workflows/runtime-pin-compat.yml
+++ b/.gitea/workflows/runtime-pin-compat.yml
@ -67,6 +67,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/runtime-prbuild-compat.yml
+++ b/.gitea/workflows/runtime-prbuild-compat.yml
@ -52,6 +52,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      wheel: ${{ steps.decide.outputs.wheel }}
@ -96,6 +97,7 @@ jobs:
    name: PR-built wheel + import smoke
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - name: No-op pass (paths filter excluded this commit)
--- a/.gitea/workflows/secret-pattern-drift.yml
+++ b/.gitea/workflows/secret-pattern-drift.yml
@ -57,6 +57,7 @@ jobs:
    name: Detect SECRET_PATTERNS drift
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 5
    steps:
--- a/.gitea/workflows/sop-tier-check.yml
+++ b/.gitea/workflows/sop-tier-check.yml
@ -65,6 +65,7 @@ jobs:
    runs-on: ubuntu-latest
    # BURN-IN: continue-on-error prevents AND-composition from blocking
    # PRs during the 7-day window. Remove after 2026-05-17 (internal#189).
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    permissions:
      contents: read
@ -89,6 +90,7 @@ jobs:
        # runners). The sop-tier-check script has its own fallback as a
        # third line of defense. continue-on-error: true ensures this step
        # failing does not block the job.
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
        run: |
          # apt-get is the primary method — Ubuntu package mirrors are reliably
@ -109,6 +111,7 @@ jobs:
        # continue-on-error: true at step level — job-level is ignored by Gitea
        # Actions (quirk #10, internal runbooks). Belt-and-suspenders with
        # SOP_FAIL_OPEN=1 + || true below.
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
--- a/.gitea/workflows/staging-verify.yml
+++ b/.gitea/workflows/staging-verify.yml
@ -85,6 +85,7 @@ jobs:
  staging-smoke:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      sha: ${{ steps.compute.outputs.sha }}
@ -205,6 +206,7 @@ jobs:
    if: ${{ needs.staging-smoke.result == 'success' && needs.staging-smoke.outputs.smoke_ran == 'true' }}
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    env:
      SHA: ${{ needs.staging-smoke.outputs.sha }}
--- a/.gitea/workflows/sweep-aws-secrets.yml
+++ b/.gitea/workflows/sweep-aws-secrets.yml
@ -29,15 +29,11 @@ name: Sweep stale AWS Secrets Manager secrets
 #     reconciler enumerator) is filed as a separate controlplane
 #     issue. This sweeper is the immediate cost-relief stopgap.
 #
-# AWS credentials: the confirmed Gitea secrets are AWS_ACCESS_KEY_ID /
-# AWS_SECRET_ACCESS_KEY (the molecule-cp IAM user). These are the same
-# credentials used by the rest of the platform. The dedicated
-# AWS_JANITOR_* naming (which the original GitHub workflow used) was
-# never populated in Gitea — the existing secrets are AWS_ACCESS_KEY_ID /
-# AWS_SECRET_ACCESS_KEY (per issue #425 §425 audit). These DO have
-# secretsmanager:ListSecrets (the production molecule-cp principal);
-# if ListSecrets is revoked in future, a dedicated janitor principal
-# would need to be created and the Gitea secret names updated here.
+# AWS credentials: use the dedicated Secrets Manager janitor principal.
+# Do not fall back to the molecule-cp application principal: it does
+# not need account-wide ListSecrets, and a 2026-05-12 CI failure showed
+# that relying on it here makes this scheduled janitor fail (go red)
+# precisely because that production credential is least-privilege.
 #
 # Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring
 # sweep-cf-orphans.yml — tenant secrets are durable by design, unlike
@ -65,6 +61,7 @@ jobs:
    name: Sweep AWS Secrets Manager
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    # 30 min cap, mirroring the other janitors. AWS DeleteSecret is
    # fast (~0.3s/call) so even a 100+ backlog drains in seconds
@ -73,8 +70,8 @@ jobs:
    timeout-minutes: 30
    env:
      AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }}
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_SECRETS_JANITOR_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRETS_JANITOR_SECRET_ACCESS_KEY }}
      CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
      CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
--- a/.gitea/workflows/sweep-cf-orphans.yml
+++ b/.gitea/workflows/sweep-cf-orphans.yml
@ -71,6 +71,7 @@ jobs:
    name: Sweep CF orphans
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    # 3 min surfaces hangs (CF API stall, AWS describe-instances stuck)
    # within one cron interval instead of burning a full tick. Realistic
--- a/.gitea/workflows/sweep-cf-tunnels.yml
+++ b/.gitea/workflows/sweep-cf-tunnels.yml
@ -55,6 +55,7 @@ jobs:
    name: Sweep CF tunnels
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    # 30 min cap. Was 5 min on the theory that the only thing that
    # could take >5min is a CF-API hang — but on 2026-05-02 a backlog
--- a/.gitea/workflows/test-ops-scripts.yml
+++ b/.gitea/workflows/test-ops-scripts.yml
@ -46,6 +46,7 @@ jobs:
    name: Ops scripts (unittest)
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/weekly-platform-go.yml
+++ b/.gitea/workflows/weekly-platform-go.yml
@ -31,6 +31,7 @@ jobs:
    name: Weekly Platform-Go Surface
    runs-on: ubuntu-latest
    # continue-on-error: surface only, never block
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    defaults:
      run:
--- a/scripts/ops/sweep-aws-secrets.sh
+++ b/scripts/ops/sweep-aws-secrets.sh
@ -239,9 +239,9 @@ for s in d.get("SecretList", []):

 # --- Summarize + safety gate ----------------------------------------------

-DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
+DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
 KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT))
-TENANT_SECRETS=$(echo "$DECISIONS" | python3 -c "
+TENANT_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-secret')
 print(n)
@ -256,7 +256,7 @@ log "  would keep:             $KEEP_COUNT"
 log ""

 # Per-reason breakdown of deletes + keep-categories worth seeing
-echo "$DECISIONS" | python3 -c "
+printf '%s' "$DECISIONS" | python3 -c "
 import json,sys,collections
 delete_c = collections.Counter()
 keep_c = collections.Counter()
@ -291,7 +291,7 @@ if [ "$DRY_RUN" = "1" ]; then
  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT secrets."
  log ""
  log "First 20 secrets that would be deleted:"
-  echo "$DECISIONS" | python3 -c "
+  printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 shown = 0
 for l in sys.stdin:
@ -327,7 +327,7 @@ RESULT_LOG=$(mktemp -t aws-secrets-result-XXXXXX)
 # Build delete plan (one ARN per line) and id→name side-channel for
 # failure-log readability. Use ARN rather than Name on the delete
 # call because Name is mutable; ARN is the stable identifier.
-echo "$DECISIONS" | python3 -c '
+printf '%s' "$DECISIONS" | python3 -c '
 import json, sys
 plan_path = sys.argv[1]
 map_path = sys.argv[2]
--- a/scripts/ops/sweep-cf-tunnels.sh
+++ b/scripts/ops/sweep-cf-tunnels.sh
@ -195,9 +195,9 @@ for t in d.get("result", []):

 # --- Summarize + safety gate ----------------------------------------------

-DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
+DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
 KEEP_COUNT=$((TOTAL_TUNNELS - DELETE_COUNT))
-TENANT_TUNNELS=$(echo "$DECISIONS" | python3 -c "
+TENANT_TUNNELS=$(printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-tunnel')
 print(n)
@ -212,7 +212,7 @@ log "  would keep:             $KEEP_COUNT"
 log ""

 # Per-reason breakdown of deletes
-echo "$DECISIONS" | python3 -c "
+printf '%s' "$DECISIONS" | python3 -c "
 import json,sys,collections
 c = collections.Counter()
 for l in sys.stdin:
@ -242,7 +242,7 @@ if [ "$DRY_RUN" = "1" ]; then
  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT tunnels."
  log ""
  log "First 20 tunnels that would be deleted:"
-  echo "$DECISIONS" | python3 -c "
+  printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 shown = 0
 for l in sys.stdin:
@ -283,7 +283,7 @@ RESULT_LOG=$(mktemp -t cf-tunnels-result-XXXXXX)

 # Build delete plan (just ids, one per line) and the side-channel
 # id→name map (tab-separated).
-echo "$DECISIONS" | python3 -c '
+printf '%s' "$DECISIONS" | python3 -c '
 import json, os, sys
 plan_path = sys.argv[1]
 map_path = sys.argv[2]