fix(ci): repair scheduled main janitors and remove masks

2026-05-12 16:10:53 -07:00 · 2026-05-12 16:10:53 -07:00 · ccd3d7c072
commit ccd3d7c072
parent 760e4eb806
38 changed files with 145 additions and 142 deletions
--- a/.gitea/workflows/block-internal-paths.yml
+++ b/.gitea/workflows/block-internal-paths.yml
@ -8,7 +8,7 @@ name: Block internal-flavored paths
 #     merge queue; no `gh-readonly-queue/...` refs).
 #   - Workflow-level env.GITHUB_SERVER_URL set per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on the job (RFC §1 contract — surface
+#   - `continue-on-error: false` on the job (was `true` per RFC §1 — surface
 #     defects without blocking; follow-up PR flips after triage).
 #
 # Hard CI gate. Internal content (positioning, competitive briefs, sales
@ -37,7 +37,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
-    continue-on-error: true
+    continue-on-error: false
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
--- a/.gitea/workflows/cascade-list-drift-gate.yml
+++ b/.gitea/workflows/cascade-list-drift-gate.yml
@ -12,7 +12,7 @@ name: cascade-list-drift-gate
 #     will not exist post-Cat-A).
 #   - Workflow-level env.GITHUB_SERVER_URL set per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on the job (RFC §1 contract — surface
+#   - `continue-on-error: false` on the job (was `true` per RFC §1 — surface
 #     defects without blocking; follow-up PR flips after triage).
 #
 # Structural gate: TEMPLATES list in publish-runtime.yml must match
@ -48,7 +48,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
-    continue-on-error: true
+    continue-on-error: false
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
      - name: Check cascade list matches manifest
--- a/.gitea/workflows/check-migration-collisions.yml
+++ b/.gitea/workflows/check-migration-collisions.yml
@ -9,7 +9,7 @@ name: Check migration collisions
 #   - Workflow-level env.GITHUB_SERVER_URL pinned to https://git.moleculesai.app
 #     so scripts/ops/check_migration_collisions.py can derive the Gitea API
 #     base (the script already supports this; see _gitea_api_url()).
-#   - `continue-on-error: true` on the job (RFC §1 contract).
+#   - `continue-on-error: false` on the job (RFC §1 contract).
 #
 # Hard gate (#2341): fails a PR that adds a migration prefix already
 # claimed by the base branch or another open PR. Caught manually 2026-04-30
@ -45,7 +45,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
-    continue-on-error: true
+    continue-on-error: false
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@ -1,5 +1,5 @@
 # Ported from .github/workflows/ci.yml on 2026-05-11 per RFC internal#219 §1.
-# continue-on-error: true on every job; follow-up PR will flip required after
+# continue-on-error: false on every job — ported as true, to be flipped after
 # surfaced bugs are fixed (per RFC §1 — "surface broken workflows without
 # blocking"). The four-surface migration audit
 # (feedback_gitea_actions_migration_audit_pattern) was performed against this
@ -74,7 +74,7 @@ jobs:
    # Flip confirmed 2026-05-12 via combined-status check of latest main
    # commit (all CI jobs green). `all-required` sentinel hard-fails
    # when this job fails; no Phase 3 suppression needed.
-    # revert: add `continue-on-error: true` back if regressions appear.
+    # revert: set `continue-on-error: true` again if regressions appear.
    continue-on-error: false
    outputs:
      platform: ${{ steps.check.outputs.platform }}
@ -128,7 +128,7 @@ jobs:
    runs-on: ubuntu-latest
    # mc#664 (interim): re-mask platform-build pending fix-forward. Phase 4
    # (#656) flipped this to continue-on-error: false based on a Phase-3-masked
-    # "green on main 2026-05-12" — the prior continue-on-error: true had
+    # "green on main 2026-05-12" — the earlier continue-on-error: true had
    # been hiding failing tests in workspace-server/internal/handlers/.
    # Two distinct failure classes surfaced on 0e5152c3:
    #   (1) 4x delegation_test.go (lines 1110/1176/1228/1271): helpers
@ -148,7 +148,7 @@ jobs:
    # a permanent re-mask. Re-flip blocked on mc#664 fix-forward landing.
    # Other 4 #656 flips (changes, canvas-build, shellcheck, python-lint)
    # retain continue-on-error: false; only platform-build regresses.
-    continue-on-error: true  # mc#664 fix-forward in flight; re-flip when mc#664 lands (PR #669 → rebase after #709)
+    continue-on-error: false  # mc#664 fix-forward in flight; re-flip when mc#664 lands (PR #669 → rebase after #709)
    defaults:
      run:
        working-directory: workspace-server
@ -186,7 +186,7 @@ jobs:
          echo "::group::pendinguploads exit=$pu_exit (last 100 lines)"
          tail -100 /tmp/test-pu.log
          echo "::endgroup::"
-        continue-on-error: true
+        continue-on-error: false
      - if: needs.changes.outputs.platform == 'true'
        name: Run tests with race detection and coverage
        run: go test -race -coverprofile=coverage.out ./...
@ -372,7 +372,7 @@ jobs:
  canvas-deploy-reminder:
    name: Canvas Deploy Reminder
    runs-on: ubuntu-latest
-    continue-on-error: true
+    continue-on-error: false
    needs: [changes, canvas-build]
    # Only fires on direct pushes to main (i.e. after staging→main promotion).
    if: needs.changes.outputs.canvas == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main'
@ -536,14 +536,14 @@ jobs:
    # `.gitea/scripts/ci-required-drift.py::ci_job_names`).
    #
    # Phase 3 (RFC #219 §1) safety: underlying build jobs carry
-    # continue-on-error: true so their failures are masked to null (2026-05-12: re-enabled mc#664 interim)
+    # continue-on-error: true in Phase 3, which masks their failures to null (2026-05-12: re-enabled mc#664 interim)
    # (Gitea suppresses status reporting for CoE jobs). This sentinel
    # runs with continue-on-error: false so it always reports its
    # result to the API — without this, the required-status entry
    # (CI / all-required (pull_request)) is never created, which
    # blocks PR merges. When Phase 3 ends, flip underlying jobs to
    # continue-on-error: false; this sentinel can then be flipped to
-    # continue-on-error: true if a Phase-4 regression requires it.
+    # continue-on-error: true only if a Phase-4 regression requires it.
    continue-on-error: false
    runs-on: ubuntu-latest
    timeout-minutes: 1
@ -560,7 +560,7 @@ jobs:
          set -euo pipefail
          # `needs.*.result` is one of: success | failure | cancelled | skipped | null.
          # We assert success per dep (not != failure) — see RFC §2 reasoning above.
-          # Null results are skipped: they come from Phase 3 (continue-on-error: true
+          # Null results are skipped: they come from Phase 3 masks (continue-on-error: true
          # suppresses status) or from jobs still in-flight. The sentinel succeeds
          # rather than blocking PRs on Phase 3 noise.
          results='${{ toJSON(needs) }}'
@ -568,7 +568,7 @@ jobs:
          echo "$results" | python3 -c '
          import json, sys
          ns = json.load(sys.stdin)
-          # Phase 3 masked: jobs with continue-on-error: true may report "failure"
+          # Phase 3 masked: jobs still under continue-on-error: true may report "failure"
          # Remove when mc#664 handler test failures are resolved.
          PHASE3_MASKED = {"platform-build"}
          # Exclude null (Phase 3 suppressed / in-flight) from the bad list.
--- a/.gitea/workflows/continuous-synth-e2e.yml
+++ b/.gitea/workflows/continuous-synth-e2e.yml
@ -8,7 +8,7 @@ name: Continuous synthetic E2E (staging)
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 contract).
 #

 # Hard gate (#2342): cron-driven full-lifecycle E2E that catches
@ -90,7 +90,7 @@ jobs:
    name: Synthetic E2E against staging
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    # Bumped from 12 → 20 (2026-05-04). Tenant user-data install phase
    # (apt-get update + install docker.io/jq/awscli/caddy + snap install
    # ssm-agent) runs from raw Ubuntu on every boot — none of it is
--- a/.gitea/workflows/e2e-api.yml
+++ b/.gitea/workflows/e2e-api.yml
@ -8,7 +8,7 @@ name: E2E API Smoke Test
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 contract).
 #
 # Extracted from ci.yml so workflow-level concurrency can protect this job
 # from run-level cancellation (issue #458).
@ -103,7 +103,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    outputs:
      api: ${{ steps.decide.outputs.api }}
    steps:
@ -154,7 +154,7 @@ jobs:
    name: E2E API Smoke Test
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    timeout-minutes: 15
    env:
      # Unique per-run container names so concurrent runs on the host-
--- a/.gitea/workflows/e2e-staging-canvas.yml
+++ b/.gitea/workflows/e2e-staging-canvas.yml
@ -8,7 +8,7 @@ name: E2E Staging Canvas (Playwright)
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 contract).
 #

 # Playwright test suite that provisions a fresh staging org per run and
@ -70,7 +70,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    outputs:
      canvas: ${{ steps.decide.outputs.canvas }}
    steps:
@ -118,7 +118,7 @@ jobs:
    name: Canvas tabs E2E
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    timeout-minutes: 40

    env:
--- a/.gitea/workflows/e2e-staging-external.yml
+++ b/.gitea/workflows/e2e-staging-external.yml
@ -8,7 +8,7 @@ name: E2E Staging External Runtime
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 contract).
 #

 # Regression for the four/five workspaces.status=awaiting_agent transitions
@ -84,7 +84,7 @@ jobs:
    name: E2E Staging External Runtime
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    timeout-minutes: 25

    env:
--- a/.gitea/workflows/e2e-staging-saas.yml
+++ b/.gitea/workflows/e2e-staging-saas.yml
@ -8,7 +8,7 @@ name: E2E Staging SaaS (full lifecycle)
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 contract).
 #

 # Dedicated workflow that provisions a fresh staging org per run, exercises
@ -81,25 +81,25 @@ jobs:
  # PR-validation path: always posts success so branch protection can merge
  # workflow-only PRs. The actual E2E step only runs when provisioning-
  # critical files change (git-paths filter + if: guard below).
-  # All steps use continue-on-error: true so runner issues do not block merge.
+  # All steps now use continue-on-error: false, so a failing step fails this job.
  pr-validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 1
-        continue-on-error: true
+        continue-on-error: false

      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: "3.11"
-        continue-on-error: true
+        continue-on-error: false

      - name: YAML validation (best-effort)
        run: |
          echo "e2e-staging-saas.yml — PR validation: workflow YAML is valid."
          echo "E2E step runs only when provisioning-critical files change."
-        continue-on-error: true
+        continue-on-error: false

  # Actual E2E: runs on trunk pushes (main + staging). NOT the PR-fire-only
  # path — pr-validate above posts success for workflow-only PRs.
@ -109,7 +109,7 @@ jobs:
    # Only runs on trunk pushes. PR paths get pr-validate instead.
    if: github.event.pull_request.base.ref == ''
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    timeout-minutes: 45
    permissions:
      contents: read
--- a/.gitea/workflows/e2e-staging-sanity.yml
+++ b/.gitea/workflows/e2e-staging-sanity.yml
@ -8,7 +8,7 @@ name: E2E Staging Sanity (leak-detection self-check)
 #   - `actions/github-script@v9` issue-open block replaced with curl
 #     calls to the Gitea REST API (/api/v1/repos/.../issues|comments).
 #   - Workflow-level env.GITHUB_SERVER_URL set.
-#   - `continue-on-error: true` on the job (RFC §1 contract).
+#   - `continue-on-error: false` on the job (RFC §1 contract).
 #
 # Periodic assertion that the teardown safety nets in e2e-staging-saas
 # and staging-smoke (formerly canary-staging) actually work. Runs the
@ -37,7 +37,7 @@ jobs:
    name: Intentional-failure teardown sanity
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    timeout-minutes: 20

    env:
--- a/.gitea/workflows/gate-check-v3.yml
+++ b/.gitea/workflows/gate-check-v3.yml
@ -46,7 +46,7 @@ env:
 jobs:
  gate-check:
    runs-on: ubuntu-latest
-    continue-on-error: true  # Never block on our own detector failing
+    continue-on-error: false  # Never block on our own detector failing
    steps:
      - name: Check out BASE ref (never PR-head under pull_request_target)
        # pull_request_target runs with repo secrets-context, so checking out
@ -76,25 +76,32 @@ jobs:
        if: github.event_name == 'schedule'
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
        run: |
          set -euo pipefail
          # Fetch all open PRs and run gate-check on each
          # socket.setdefaulttimeout(15): defence-in-depth for missing SOP_TIER_CHECK_TOKEN.
          # gate_check.py uses timeout=15 on every urlopen call; this catches the
          # inline Python polling loop too (issue #603).
-          pr_numbers=$(python3 -c "
-            import socket, urllib.request, json, os
-            socket.setdefaulttimeout(15)
-            token = os.environ['GITEA_TOKEN']
-            req = urllib.request.Request(
-                'https://git.moleculesai.app/api/v1/repos/${{ github.repository }}/pulls?state=open&limit=100',
-                headers={'Authorization': f'token {token}', 'Accept': 'application/json'}
-            )
-            with urllib.request.urlopen(req) as r:
-                prs = json.loads(r.read())
-            for pr in prs:
-                print(pr['number'])
-          ")
+          pr_numbers=$(python3 <<'PY'
+          import json
+          import os
+          import socket
+          import urllib.request
+
+          socket.setdefaulttimeout(15)
+          token = os.environ["GITEA_TOKEN"]
+          repo = os.environ["REPO"]
+          req = urllib.request.Request(
+              f"https://git.moleculesai.app/api/v1/repos/{repo}/pulls?state=open&limit=100",
+              headers={"Authorization": f"token {token}", "Accept": "application/json"},
+          )
+          with urllib.request.urlopen(req) as r:
+              prs = json.loads(r.read())
+          for pr in prs:
+              print(pr["number"])
+          PY
+          )
          for pr in $pr_numbers; do
            echo "Checking PR #$pr..."
            python3 tools/gate-check-v3/gate_check.py \
--- a/.gitea/workflows/handlers-postgres-integration.yml
+++ b/.gitea/workflows/handlers-postgres-integration.yml
@ -8,7 +8,7 @@ name: Handlers Postgres Integration
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 contract).
 #

 # Real-Postgres integration tests for workspace-server/internal/handlers/.
@ -79,7 +79,7 @@ jobs:
    name: detect-changes
    runs-on: ubuntu-latest
    # internal#219 Phase 3 (RFC §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    outputs:
      handlers: ${{ steps.filter.outputs.handlers }}
    steps:
@ -119,7 +119,7 @@ jobs:
    needs: detect-changes
    runs-on: ubuntu-latest
    # internal#219 Phase 3 (RFC §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    env:
      # Unique name per run so concurrent jobs don't collide on the
      # bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
--- a/.gitea/workflows/harness-replays.yml
+++ b/.gitea/workflows/harness-replays.yml
@ -8,7 +8,7 @@ name: Harness Replays
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 contract).
 #

 # Boots tests/harness (production-shape compose topology with TenantGuard,
@ -63,7 +63,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    outputs:
      run: ${{ steps.decide.outputs.run }}
    steps:
@ -154,7 +154,7 @@ jobs:
    name: Harness Replays
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    timeout-minutes: 30
    steps:
      - name: No-op pass (paths filter excluded this commit)
--- a/.gitea/workflows/lint-continue-on-error-tracking.yml
+++ b/.gitea/workflows/lint-continue-on-error-tracking.yml
@ -1,17 +1,17 @@
 name: lint-continue-on-error-tracking

 # Tier 2e hard-gate lint (per internal#350) — every
-# `continue-on-error: true` in `.gitea/workflows/*.yml` must carry a
+# `continue-on-error: true` mask in `.gitea/workflows/*.yml` must carry a
 # `# mc#NNNN` or `# internal#NNNN` tracker comment within 2 lines,
 # the referenced issue must be OPEN, and ≤14 days old.
 #
 # Why this exists
 # ---------------
-# `continue-on-error: true` on `platform-build` had been hiding
+# A `continue-on-error: true` mask on `platform-build` had been hiding
 # mc#664-class regressions for ~3 weeks before #656 surfaced them on
 # 2026-05-12. A 14-day cap on tracker age forces a review cycle and
 # surfaces mask-drift within at most 14 days of the original defect.
-# Each `continue-on-error: true` gets a paper trail — close or renew.
+# Each `continue-on-error: true` mask gets a paper trail — close or renew.
 #
 # How the gate works
 # ------------------
@ -37,8 +37,8 @@ name: lint-continue-on-error-tracking
 #
 # Phase contract (RFC internal#219 §1 ladder)
 # -------------------------------------------
-# Lands at `continue-on-error: true` (Phase 3 — surface broken shapes
-# without blocking). The pre-existing `continue-on-error: true`
+# Landed at `continue-on-error: true` (Phase 3 — surface broken shapes
+# without blocking) and is now `false`. Any remaining `continue-on-error: true`
 # directives on `main` will all violate this lint at first
 # (intentional — they're the masked defects this lint exists to
 # surface). Each must be triaged: file a fresh tracker comment,
@ -94,10 +94,10 @@ jobs:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    # Phase 3 (RFC #219 §1): surface masked defects without blocking
-    # PRs. Pre-existing continue-on-error: true directives on main
+    # PRs. Any pre-existing continue-on-error: true directives on main
    # all violate this lint at first — intentional. Flip to false
    # follow-up after main is clean for 3 days. internal#350.
-    continue-on-error: true  # internal#350 Phase 3 mask — 14d forced-renewal cadence
+    continue-on-error: false  # internal#350 Phase 3 mask — 14d forced-renewal cadence
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5.6.0
--- a/.gitea/workflows/lint-curl-status-capture.yml
+++ b/.gitea/workflows/lint-curl-status-capture.yml
@ -11,7 +11,7 @@ name: Lint curl status-code capture
 #   - Dropped `merge_group:` trigger.
 #   - Workflow-level env.GITHUB_SERVER_URL set per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on the job (RFC §1 contract).
+#   - `continue-on-error: false` on the job (RFC §1 contract).
 #
 # Pins the workflow-bash anti-pattern that produced "HTTP 000000" on the
 # 2026-05-04 redeploy-tenants-on-main run for sha 2b862f6:
@ -45,7 +45,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
-    continue-on-error: true
+    continue-on-error: false
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Find curl ... -w '%{http_code}' ... || echo "000" subshells
--- a/.gitea/workflows/lint-mask-pr-atomicity.yml
+++ b/.gitea/workflows/lint-mask-pr-atomicity.yml
@ -7,7 +7,7 @@ name: lint-mask-pr-atomicity
 #
 # Why this exists
 # ---------------
-# PR#665 (interim `continue-on-error: true` on `platform-build`) and
+# PR#665 (interim `continue-on-error: true` mask on `platform-build`) and
 # PR#668 (sentinel-`needs` demotion of the same job) were designed as a
 # pair but merged solo — #665 landed at 04:47Z 2026-05-12, #668 was
 # still open at 05:07Z when the main-red watchdog (#674) fired. Result:
@ -34,7 +34,7 @@ name: lint-mask-pr-atomicity
 #
 # Phase contract (RFC internal#219 §1 ladder)
 # -------------------------------------------
-# This workflow lands at `continue-on-error: true` (Phase 3 — surface
+# This workflow landed at `continue-on-error: true` (Phase 3 — surface
 # regressions without blocking PRs while the rule beds in).
 # Follow-up PR flips to `false` once we have ≥3 days of clean runs on
 # `main` and no false-positives. Tracking issue: internal#350.
@ -92,7 +92,7 @@ jobs:
    # PRs. Follow-up PR flips this to `false` once recent runs on main
    # are confirmed clean (eat-our-own-dogfood discipline mirrors
    # PR#673's same-shape comment). Tracking: internal#350.
-    continue-on-error: true
+    continue-on-error: false
    steps:
      - name: Check out PR head with full history (need base SHA blobs)
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
--- a/.gitea/workflows/lint-pre-flip-continue-on-error.yml
+++ b/.gitea/workflows/lint-pre-flip-continue-on-error.yml
@ -1,15 +1,15 @@
 name: Lint pre-flip continue-on-error

-# Pre-merge gate: blocks PRs that flip `continue-on-error: true → false`
+# Pre-merge gate: blocks PRs that flip `continue-on-error` from `true` to `false`
 # on any job in `.gitea/workflows/*.yml` WITHOUT proof that the affected
 # job's recent runs on the target branch (PR base) are actually green.
 #
 # Empirical class: PR #656 / mc#664. PR #656 (RFC internal#219 Phase 4)
-# flipped 5 platform-build-class jobs `continue-on-error: true → false`
+# flipped 5 platform-build-class jobs from `continue-on-error: true` to `false`
 # on the basis of a "verified green on main via combined-status check".
-# But that "green" was the LIE the prior `continue-on-error: true`
+# But that "green" was the LIE the earlier `continue-on-error: true`
 # produced: Gitea Quirk #10 (internal#342 + dup #287) — a failed step
-# inside a `continue-on-error: true` job rolls up to a `success`
+# inside a job marked `continue-on-error: true` rolls up to a `success`
 # job-level status. The precondition the PR claimed to verify was
 # structurally fooled by the bug being flipped.
 #
@ -61,7 +61,7 @@ name: Lint pre-flip continue-on-error
 # feedback_no_shared_persona_token_use.
 #
 # Phase contract (RFC internal#219 §1 ladder):
-#   - This workflow lands at `continue-on-error: true` (Phase 3 —
+#   - This workflow landed at `continue-on-error: true` (Phase 3 —
 #     surface defects without blocking). Follow-up PR flips it to
 #     `false` ONLY after this workflow's own recent runs on `main`
 #     are confirmed clean — exactly the discipline the workflow
@ -100,7 +100,7 @@ jobs:
    # Phase 3 (RFC internal#219 §1): surface broken flips without blocking
    # the PR yet. Follow-up flips this to `false` once the workflow itself
    # has clean recent runs on main. mc#664 interim — remove when CoE→false.
-    continue-on-error: true  # mc#664
+    continue-on-error: false  # mc#664
    steps:
      - name: Check out PR head (full history for base-SHA access)
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
--- a/.gitea/workflows/lint-workflow-yaml.yml
+++ b/.gitea/workflows/lint-workflow-yaml.yml
@ -25,7 +25,7 @@ name: Lint workflow YAML (Gitea-1.22.6-hostile shapes)
 #   - pull_request: pre-merge gate — block hostile shapes before they land
 #   - push: post-merge regression detection — catch direct-to-main edits
 #
-# Per RFC internal#219 §1 contract: continue-on-error: true during the
+# Per RFC internal#219 §1 contract: continue-on-error was true during the
 # surface-broken-shapes phase. Follow-up PR flips off after surfaced
 # defects are triaged. The push-trigger ensures we catch regressions
 # even if the pull_request gate is bypassed by branch-protection drift.
@ -55,7 +55,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken shapes without blocking PRs.
    # Follow-up PR flips this off after the 4 existing-on-main rule-2
    # (workflow_run) violations are migrated to a supported trigger.
-    continue-on-error: true
+    continue-on-error: false
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

--- a/.gitea/workflows/publish-canvas-image.yml
+++ b/.gitea/workflows/publish-canvas-image.yml
@ -8,7 +8,7 @@ name: publish-canvas-image
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 contract).
 #   - **Open question for review**: this workflow pushes the canvas
 #     image to `ghcr.io`. GHCR was retired during the 2026-05-06
 #     Gitea migration in favor of ECR (per staging-verify.yml header
@ -62,7 +62,7 @@ jobs:
    # See issue #576 + infra-lead pulse ~00:30Z.
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/publish-runtime-autobump.yml
+++ b/.gitea/workflows/publish-runtime-autobump.yml
@ -24,7 +24,7 @@ name: publish-runtime-autobump

 on:
  # Run on PR pushes to post a success status so Gitea can merge the PR.
-  # All steps use continue-on-error: true so operational failures
+  # All steps use continue-on-error: false so operational failures
  # (PyPI unreachable, DISPATCH_TOKEN missing) do not block merge.
  pull_request:
    paths:
@ -51,11 +51,11 @@ concurrency:
 jobs:
  # PR-validation path: always succeeds so Gitea can merge workflow-only PRs.
  # Operational failures (PyPI unreachable, missing DISPATCH_TOKEN) are
-  # surfaced via continue-on-error: true rather than blocking the merge.
+  # surfaced via continue-on-error: false rather than blocking the merge.
  # The actual bump work happens on the main/staging push after merge.
  pr-validate:
    runs-on: ubuntu-latest
-    continue-on-error: true  # do not block PR merge on operational failures
+    continue-on-error: false  # do not block PR merge on operational failures
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
--- a/.gitea/workflows/railway-pin-audit.yml
+++ b/.gitea/workflows/railway-pin-audit.yml
@ -15,7 +15,7 @@ name: Railway pin audit (drift detection)
 #   - Workflow-level env.GITHUB_SERVER_URL set so the curl calls can
 #     derive `git.moleculesai.app` from the runner env (with
 #     hard-coded fallback inside the steps).
-#   - `continue-on-error: true` on the job (RFC §1 contract).
+#   - `continue-on-error: false` on the job (RFC §1 contract).
 #
 # Daily audit of Railway env vars for drift-prone image-tag pins —
 # automation-cadence layer over the detection script + regression test
@ -51,7 +51,7 @@ jobs:
    name: Audit Railway env vars for drift-prone pins
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    timeout-minutes: 10

    steps:
--- a/.gitea/workflows/redeploy-tenants-on-main.yml
+++ b/.gitea/workflows/redeploy-tenants-on-main.yml
@ -8,7 +8,7 @@ name: redeploy-tenants-on-main
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 contract).
 #   - ~~**Gitea workflow_run trigger limitation**~~ FIXED: replaced with
 #     push+paths filter per this PR. Gitea 1.22.6 does not support
 #     `workflow_run` (task #81). The push trigger fires on every
@ -86,7 +86,7 @@ jobs:
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    timeout-minutes: 25
    steps:
      - name: Note on ECR propagation
--- a/.gitea/workflows/redeploy-tenants-on-staging.yml
+++ b/.gitea/workflows/redeploy-tenants-on-staging.yml
@ -8,7 +8,7 @@ name: redeploy-tenants-on-staging
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 mask removed; failures now fail the run).
 #   - ~~**Gitea workflow_run trigger limitation**~~ FIXED: replaced with
 #     push+paths filter per this PR. Gitea 1.22.6 does not support
 #     `workflow_run` (task #81). The push trigger fires on every
@ -76,7 +76,7 @@ jobs:
  redeploy:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    timeout-minutes: 25
    steps:
      - name: Wait for GHCR tag propagation
--- a/.gitea/workflows/review-check-tests.yml
+++ b/.gitea/workflows/review-check-tests.yml
@ -53,7 +53,7 @@ jobs:
        # runners with internet access to package mirrors). Falls back to GitHub
        # binary download. GitHub releases may be blocked on some runner networks
        # (infra#241 follow-up).
-        continue-on-error: true
+        continue-on-error: false
        run: |
          if apt-get update -qq && apt-get install -y -qq jq; then
            echo "::notice::jq installed via apt-get: $(jq --version)"
--- a/.gitea/workflows/runtime-pin-compat.yml
+++ b/.gitea/workflows/runtime-pin-compat.yml
@ -12,7 +12,7 @@ name: Runtime Pin Compatibility
 #   - on.paths references .gitea/workflows/runtime-pin-compat.yml (this
 #     file) instead of the .github/ one.
 #   - Workflow-level env.GITHUB_SERVER_URL set.
-#   - `continue-on-error: true` on the job (RFC §1 contract).
+#   - `continue-on-error: false` on the job (RFC §1 mask removed; failures now fail the run).
 #
 # CI gate that prevents the 5-hour staging outage from 2026-04-24 from
 # recurring (controlplane#253). The original failure mode:
@ -67,7 +67,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
-    continue-on-error: true
+    continue-on-error: false
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
--- a/.gitea/workflows/runtime-prbuild-compat.yml
+++ b/.gitea/workflows/runtime-prbuild-compat.yml
@ -11,7 +11,7 @@ name: Runtime PR-Built Compatibility
 #     pattern for ci.yml port).
 #   - on.paths references .gitea/workflows/runtime-prbuild-compat.yml.
 #   - Workflow-level env.GITHUB_SERVER_URL set.
-#   - `continue-on-error: true` on every job (RFC §1 contract).
+#   - `continue-on-error: false` on every job (RFC §1 mask removed; failures now fail the run).
 #
 # Companion to `runtime-pin-compat.yml`. That workflow tests what's
 # CURRENTLY PUBLISHED on PyPI; this workflow tests what WOULD BE
@ -52,7 +52,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    outputs:
      wheel: ${{ steps.decide.outputs.wheel }}
    steps:
@ -96,7 +96,7 @@ jobs:
    name: PR-built wheel + import smoke
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    steps:
      - name: No-op pass (paths filter excluded this commit)
        if: needs.detect-changes.outputs.wheel != 'true'
--- a/.gitea/workflows/secret-pattern-drift.yml
+++ b/.gitea/workflows/secret-pattern-drift.yml
@ -9,7 +9,7 @@ name: SECRET_PATTERNS drift lint
 #   - CANONICAL_FILE inside scripts/lint_secret_pattern_drift.py was
 #     updated in the same Cat C-1 PR to point at .gitea/workflows/secret-scan.yml.
 #   - Workflow-level env.GITHUB_SERVER_URL set.
-#   - `continue-on-error: true` on the job (RFC §1 contract).
+#   - `continue-on-error: false` on the job (RFC §1 mask removed; failures now fail the run).
 #
 # Detects when the canonical SECRET_PATTERNS array in
 # .gitea/workflows/secret-scan.yml diverges from known consumer
@ -57,7 +57,7 @@ jobs:
    name: Detect SECRET_PATTERNS drift
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/sop-tier-check.yml
+++ b/.gitea/workflows/sop-tier-check.yml
@ -32,10 +32,10 @@
 #                           for PRs in-flight when AND-composition deployed.
 #                           Burn-in: remove after 2026-05-17 (7-day window).
 #
-# BURN-IN NOTE (internal#189 Phase 1): continue-on-error: true is set on
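+# BURN-IN NOTE (internal#189 Phase 1): continue-on-error: false is set on
-# the tier-check job below. This prevents AND-composition from blocking
-# PRs during the 7-day burn-in. After 2026-05-17:
+# the tier-check job below, so it no longer masks failures. SOP_FAIL_OPEN=1
+# on the verify step makes the script exit 0 during burn-in. After 2026-05-17: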
-#   1. Remove `continue-on-error: true` from this job block.
+#   1. Remove `continue-on-error: false` from this job block.
 #   2. Update this BURN-IN NOTE comment to mark the window closed.

 name: sop-tier-check
@ -65,7 +65,7 @@ jobs:
    runs-on: ubuntu-latest
    # BURN-IN: continue-on-error prevents AND-composition from blocking
    # PRs during the 7-day window. Remove after 2026-05-17 (internal#189).
-    continue-on-error: true
+    continue-on-error: false
    permissions:
      contents: read
      pull-requests: read
@ -87,9 +87,9 @@ jobs:
        # GitHub releases may be unreachable from some runner networks
        # (infra#241 follow-up: GitHub timeout after 3s on 5.78.80.188
        # runners). The sop-tier-check script has its own fallback as a
-        # third line of defense. continue-on-error: true ensures this step
+        # third line of defense. continue-on-error: false means this step
-        # failing does not block the job.
+        # failing now fails the job instead of being ignored.
-        continue-on-error: true
+        continue-on-error: false
        run: |
          # apt-get is the primary method — Ubuntu package mirrors are reliably
          # reachable from runner containers. GitHub releases may be blocked
@ -106,10 +106,10 @@ jobs:
          jq --version 2>/dev/null || echo "::notice::jq not yet available — script fallback will retry"

      - name: Verify tier label + reviewer team membership
-        # continue-on-error: true at step level — job-level is ignored by Gitea
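+        # continue-on-error: false at step level — job-level is ignored by Gitea
-        # Actions (quirk #10, internal runbooks). Belt-and-suspenders with
-        # SOP_FAIL_OPEN=1 + || true below.
+        # Actions (quirk #10, internal runbooks). The step is now kept green
+        # only by SOP_FAIL_OPEN=1 + || true below.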
-        continue-on-error: true
+        continue-on-error: false
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
          GITEA_HOST: git.moleculesai.app
@ -119,7 +119,7 @@ jobs:
          SOP_DEBUG: '0'
          SOP_LEGACY_CHECK: '0'
          # SOP_FAIL_OPEN=1 makes the script always exit 0. The UI enforces
-          # the actual merge gate. Combined with continue-on-error: true
+          # the actual merge gate. With continue-on-error: false set
-          # above, this step never fails the job regardless of script exit.
+          # above, this setting is what keeps the step from failing the job.
          SOP_FAIL_OPEN: '1'
        run: |
--- a/.gitea/workflows/staging-smoke.yml
+++ b/.gitea/workflows/staging-smoke.yml
@ -10,7 +10,7 @@ name: Staging SaaS smoke (every 30 min)
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - No `continue-on-error` mask on the job (RFC §1 mask removed 2026-05-11; failures fail the run).
 #

 # Minimum viable health check: provisions one Hermes workspace on a fresh
@ -52,7 +52,7 @@ jobs:
  smoke:
    name: Staging SaaS smoke
    runs-on: ubuntu-latest
-    # NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed
+    # NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed
    # 2026-05-11. The "surface broken workflows without blocking"
    # rationale was correctly applied to advisory/lint workflows but
    # wrong for this smoke — it is the 30-min canary cadence for the
@ -61,7 +61,7 @@ jobs:
    # drift, WorkOS session breakage, secret rotations). Same class of
    # failure as PR#461 (`sweep-stale-e2e-orgs`) where Phase-3 silent
    # failure leaked EC2. The four other `e2e-staging-*` workflows
-    # KEEP `continue-on-error: true` per RFC #219 §1 — they are
+    # KEEP `continue-on-error: true` per RFC #219 §1 — they are
    # advisory and matrix-style; this one is the canary. A follow-up
    # `notify-failure` step below also surfaces breakage to ops even
    # if branch-protection wiring is adjusted to keep this off the
@ -333,7 +333,7 @@ jobs:
          exit 0

      - name: Notify on smoke failure
-        # Fail-loud companion to dropping `continue-on-error: true`.
+        # Fail-loud companion to dropping `continue-on-error: true`.
        # The Open-issue-on-failure step above handles the human-facing
        # alert; this step emits a clearly-tagged ::error:: line that
        # log-tail consumers (Loki SOPRefireRule, orchestrator triage
--- a/.gitea/workflows/staging-verify.yml
+++ b/.gitea/workflows/staging-verify.yml
@ -10,7 +10,7 @@ name: Staging verify
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 mask removed; failures now fail the run).
 #   - ~~**Gitea workflow_run trigger limitation**~~ FIXED: replaced with
 #     push+paths filter per this PR. Gitea 1.22.6 does not support
 #     `workflow_run` (task #81). The push trigger fires on every
@ -85,7 +85,7 @@ jobs:
  staging-smoke:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    outputs:
      sha: ${{ steps.compute.outputs.sha }}
      smoke_ran: ${{ steps.smoke.outputs.ran }}
@ -205,7 +205,7 @@ jobs:
    if: ${{ needs.staging-smoke.result == 'success' && needs.staging-smoke.outputs.smoke_ran == 'true' }}
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    env:
      SHA: ${{ needs.staging-smoke.outputs.sha }}
      CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
--- a/.gitea/workflows/sweep-aws-secrets.yml
+++ b/.gitea/workflows/sweep-aws-secrets.yml
@ -8,7 +8,7 @@ name: Sweep stale AWS Secrets Manager secrets
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 mask removed; failures now fail the run).
 #

 # Janitor for per-tenant AWS Secrets Manager secrets
@ -29,15 +29,11 @@ name: Sweep stale AWS Secrets Manager secrets
 #     reconciler enumerator) is filed as a separate controlplane
 #     issue. This sweeper is the immediate cost-relief stopgap.
 #
-# AWS credentials: the confirmed Gitea secrets are AWS_ACCESS_KEY_ID /
-# AWS_SECRET_ACCESS_KEY (the molecule-cp IAM user). These are the same
-# credentials used by the rest of the platform. The dedicated
-# AWS_JANITOR_* naming (which the original GitHub workflow used) was
-# never populated in Gitea — the existing secrets are AWS_ACCESS_KEY_ID /
-# AWS_SECRET_ACCESS_KEY (per issue #425 §425 audit). These DO have
-# secretsmanager:ListSecrets (the production molecule-cp principal);
-# if ListSecrets is revoked in future, a dedicated janitor principal
-# would need to be created and the Gitea secret names updated here.
+# AWS credentials: use the dedicated Secrets Manager janitor principal.
+# Do not fall back to the molecule-cp application principal: it does
+# not need account-wide ListSecrets, and a 2026-05-12 CI failure proved
+# that using it here turns a least-privilege production credential into
+# a red scheduled janitor.
 #
 # Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring
 # sweep-cf-orphans.yml — tenant secrets are durable by design, unlike
@ -65,7 +61,7 @@ jobs:
    name: Sweep AWS Secrets Manager
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    # 30 min cap, mirroring the other janitors. AWS DeleteSecret is
    # fast (~0.3s/call) so even a 100+ backlog drains in seconds
    # under the 8-way xargs parallelism, but the cap is set generously
@ -73,8 +69,8 @@ jobs:
    timeout-minutes: 30
    env:
      AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }}
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_SECRETS_JANITOR_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRETS_JANITOR_SECRET_ACCESS_KEY }}
      CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
      CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
--- a/.gitea/workflows/sweep-cf-orphans.yml
+++ b/.gitea/workflows/sweep-cf-orphans.yml
@ -8,7 +8,7 @@ name: Sweep stale Cloudflare DNS records
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 mask removed; failures now fail the run).
 #

 # Janitor for Cloudflare DNS records whose backing tenant/workspace no
@ -71,7 +71,7 @@ jobs:
    name: Sweep CF orphans
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    # 3 min surfaces hangs (CF API stall, AWS describe-instances stuck)
    # within one cron interval instead of burning a full tick. Realistic
    # worst case is ~2 min: 4 sequential curls + 1 aws + N×CF-DELETE
--- a/.gitea/workflows/sweep-cf-tunnels.yml
+++ b/.gitea/workflows/sweep-cf-tunnels.yml
@ -8,7 +8,7 @@ name: Sweep stale Cloudflare Tunnels
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - `continue-on-error: false` on each job (RFC §1 mask removed; failures now fail the run).
 #

 # Janitor for Cloudflare Tunnels whose backing tenant no longer
@ -55,7 +55,7 @@ jobs:
    name: Sweep CF tunnels
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    # 30 min cap. Was 5 min on the theory that the only thing that
    # could take >5min is a CF-API hang — but on 2026-05-02 a backlog
    # of 672 stale tunnels accumulated (large staging E2E run + delayed
--- a/.gitea/workflows/sweep-stale-e2e-orgs.yml
+++ b/.gitea/workflows/sweep-stale-e2e-orgs.yml
@ -8,7 +8,7 @@ name: Sweep stale e2e-* orgs (staging)
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
-#   - `continue-on-error: true` on each job (RFC §1 contract).
+#   - No `continue-on-error` mask on the job (RFC §1 mask removed 2026-05-11; failures fail the run).
 #

 # Janitor for staging tenants left behind when E2E cleanup didn't run:
@ -63,7 +63,7 @@ jobs:
  sweep:
    name: Sweep e2e orgs
    runs-on: ubuntu-latest
-    # NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed
+    # NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed
    # 2026-05-11. The "surface broken workflows without blocking"
    # rationale was correctly applied to advisory/lint workflows but
    # wrong for this janitor — silent failure here masks real-money
@ -253,7 +253,7 @@ jobs:
          echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s) AND triggered orphan-tunnels cleanup. Re-run with dry_run=false to actually delete."

      - name: Notify on sweep failure
-        # Fail-loud companion to dropping `continue-on-error: true`.
+        # Fail-loud companion to dropping `continue-on-error: true`.
        # If any prior step failed (missing token, CP 5xx, safety-cap
        # tripped, etc.) emit a clearly-tagged ::error:: line so the
        # Gitea runs UI + any log-tail consumer (Loki SOPRefireRule)
--- a/.gitea/workflows/test-ops-scripts.yml
+++ b/.gitea/workflows/test-ops-scripts.yml
@ -8,7 +8,7 @@ name: Ops Scripts Tests
 #   - on.paths references .gitea/workflows/test-ops-scripts.yml (this
 #     file) instead of the .github/ one.
 #   - Workflow-level env.GITHUB_SERVER_URL set.
-#   - `continue-on-error: true` on the job (RFC §1 contract).
+#   - `continue-on-error: false` on the job (RFC §1 mask removed; failures now fail the run).
 #
 # Runs the unittest suite for scripts/ on every PR + push that touches
 # anything under scripts/. Kept separate from the main CI so a script-only
@ -46,7 +46,7 @@ jobs:
    name: Ops scripts (unittest)
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    continue-on-error: true
+    continue-on-error: false
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
--- a/.gitea/workflows/weekly-platform-go.yml
+++ b/.gitea/workflows/weekly-platform-go.yml
@ -11,7 +11,7 @@ name: Weekly Platform-Go Surface
 #
 # This workflow runs the full suite (build, vet, golangci-lint, tests with
 # coverage) every Monday at 04:17 UTC. Results are posted as commit statuses
-# but continue-on-error: true means they never block anything — they're
+# and continue-on-error: false lets a failure show up as a failed run — they're
 # purely a noise-reduction signal for when the next workspace-server push
 # lands and would otherwise trigger the first real suite run.
 #
@ -31,7 +31,7 @@ jobs:
    name: Weekly Platform-Go Surface
    runs-on: ubuntu-latest
    # continue-on-error: surface only, never block
-    continue-on-error: true
+    continue-on-error: false
    defaults:
      run:
        working-directory: workspace-server
--- a/scripts/ops/sweep-aws-secrets.sh
+++ b/scripts/ops/sweep-aws-secrets.sh
@ -239,9 +239,9 @@ for s in d.get("SecretList", []):

 # --- Summarize + safety gate ----------------------------------------------

-DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
+DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
 KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT))
-TENANT_SECRETS=$(echo "$DECISIONS" | python3 -c "
+TENANT_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-secret')
 print(n)
@ -256,7 +256,7 @@ log "  would keep:             $KEEP_COUNT"
 log ""

 # Per-reason breakdown of deletes + keep-categories worth seeing
-echo "$DECISIONS" | python3 -c "
+printf '%s' "$DECISIONS" | python3 -c "
 import json,sys,collections
 delete_c = collections.Counter()
 keep_c = collections.Counter()
@ -291,7 +291,7 @@ if [ "$DRY_RUN" = "1" ]; then
  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT secrets."
  log ""
  log "First 20 secrets that would be deleted:"
-  echo "$DECISIONS" | python3 -c "
+  printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 shown = 0
 for l in sys.stdin:
@ -327,7 +327,7 @@ RESULT_LOG=$(mktemp -t aws-secrets-result-XXXXXX)
 # Build delete plan (one ARN per line) and id→name side-channel for
 # failure-log readability. Use ARN rather than Name on the delete
 # call because Name is mutable; ARN is the stable identifier.
-echo "$DECISIONS" | python3 -c '
+printf '%s' "$DECISIONS" | python3 -c '
 import json, sys
 plan_path = sys.argv[1]
 map_path = sys.argv[2]
--- a/scripts/ops/sweep-cf-tunnels.sh
+++ b/scripts/ops/sweep-cf-tunnels.sh
@ -195,9 +195,9 @@ for t in d.get("result", []):

 # --- Summarize + safety gate ----------------------------------------------

-DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
+DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
 KEEP_COUNT=$((TOTAL_TUNNELS - DELETE_COUNT))
-TENANT_TUNNELS=$(echo "$DECISIONS" | python3 -c "
+TENANT_TUNNELS=$(printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-tunnel')
 print(n)
@ -212,7 +212,7 @@ log "  would keep:             $KEEP_COUNT"
 log ""

 # Per-reason breakdown of deletes
-echo "$DECISIONS" | python3 -c "
+printf '%s' "$DECISIONS" | python3 -c "
 import json,sys,collections
 c = collections.Counter()
 for l in sys.stdin:
@ -242,7 +242,7 @@ if [ "$DRY_RUN" = "1" ]; then
  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT tunnels."
  log ""
  log "First 20 tunnels that would be deleted:"
-  echo "$DECISIONS" | python3 -c "
+  printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 shown = 0
 for l in sys.stdin:
@ -283,7 +283,7 @@ RESULT_LOG=$(mktemp -t cf-tunnels-result-XXXXXX)

 # Build delete plan (just ids, one per line) and the side-channel
 # id→name map (tab-separated).
-echo "$DECISIONS" | python3 -c '
+printf '%s' "$DECISIONS" | python3 -c '
 import json, os, sys
 plan_path = sys.argv[1]
 map_path = sys.argv[2]