fix(ci): repair scheduled main janitors and track masks

2026-05-12 16:10:53 -07:00 · 2026-05-12 16:10:53 -07:00 · 1aa0f43df8
commit 1aa0f43df8
parent 760e4eb806
37 changed files with 131 additions and 89 deletions
--- a/.gitea/workflows/block-internal-paths.yml
+++ b/.gitea/workflows/block-internal-paths.yml
@ -37,6 +37,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/cascade-list-drift-gate.yml
+++ b/.gitea/workflows/cascade-list-drift-gate.yml
@ -48,6 +48,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
--- a/.gitea/workflows/check-migration-collisions.yml
+++ b/.gitea/workflows/check-migration-collisions.yml
@ -45,6 +45,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 5
    steps:
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@ -148,6 +148,7 @@ jobs:
    # a permanent re-mask. Re-flip blocked on mc#664 fix-forward landing.
    # Other 4 #656 flips (changes, canvas-build, shellcheck, python-lint)
    # retain continue-on-error: false; only platform-build regresses.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true  # mc#664 fix-forward in flight; re-flip when mc#664 lands (PR #669 → rebase after #709)
    defaults:
      run:
@ -186,6 +187,7 @@ jobs:
          echo "::group::pendinguploads exit=$pu_exit (last 100 lines)"
          tail -100 /tmp/test-pu.log
          echo "::endgroup::"
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
      - if: needs.changes.outputs.platform == 'true'
        name: Run tests with race detection and coverage
@ -372,6 +374,7 @@ jobs:
  canvas-deploy-reminder:
    name: Canvas Deploy Reminder
    runs-on: ubuntu-latest
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    needs: [changes, canvas-build]
    # Only fires on direct pushes to main (i.e. after staging→main promotion).
--- a/.gitea/workflows/continuous-synth-e2e.yml
+++ b/.gitea/workflows/continuous-synth-e2e.yml
@ -90,6 +90,7 @@ jobs:
    name: Synthetic E2E against staging
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    # Bumped from 12 → 20 (2026-05-04). Tenant user-data install phase
    # (apt-get update + install docker.io/jq/awscli/caddy + snap install
--- a/.gitea/workflows/e2e-api.yml
+++ b/.gitea/workflows/e2e-api.yml
@ -103,6 +103,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      api: ${{ steps.decide.outputs.api }}
@ -154,6 +155,7 @@ jobs:
    name: E2E API Smoke Test
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 15
    env:
@ -164,7 +166,6 @@ jobs:
      # we let Docker assign an ephemeral host port.
      PG_CONTAINER: pg-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
      REDIS_CONTAINER: redis-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
-      PORT: "8080"
    steps:
      - name: No-op pass (paths filter excluded this commit)
        if: needs.detect-changes.outputs.api != 'true'
@ -268,6 +269,20 @@ jobs:
        if: needs.detect-changes.outputs.api == 'true'
        working-directory: workspace-server
        run: go build -o platform-server ./cmd/server
+      - name: Pick platform port  # OS-assigned free port instead of a fixed one
+        if: needs.detect-changes.outputs.api == 'true'  # same gate as the adjacent platform steps
+        run: |  # appends PORT and BASE to $GITHUB_ENV; the probe socket is closed first, so the port is not reserved
+          PLATFORM_PORT=$(python3 - <<'PY'
+          import socket
+
+          with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+              s.bind(("127.0.0.1", 0))
+              print(s.getsockname()[1])
+          PY
+          )
+          echo "PORT=${PLATFORM_PORT}" >> "$GITHUB_ENV"
+          echo "BASE=http://127.0.0.1:${PLATFORM_PORT}" >> "$GITHUB_ENV"
+          echo "Platform host port: ${PLATFORM_PORT}"
      - name: Start platform (background)
        if: needs.detect-changes.outputs.api == 'true'
        working-directory: workspace-server
@ -280,7 +295,7 @@ jobs:
        if: needs.detect-changes.outputs.api == 'true'
        run: |
          for i in $(seq 1 30); do
-            if curl -sf http://127.0.0.1:8080/health > /dev/null; then
+            if curl -sf "$BASE/health" > /dev/null; then
              echo "Platform up after ${i}s"
              exit 0
            fi
--- a/.gitea/workflows/e2e-staging-canvas.yml
+++ b/.gitea/workflows/e2e-staging-canvas.yml
@ -70,6 +70,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      canvas: ${{ steps.decide.outputs.canvas }}
@ -118,6 +119,7 @@ jobs:
    name: Canvas tabs E2E
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 40

--- a/.gitea/workflows/e2e-staging-external.yml
+++ b/.gitea/workflows/e2e-staging-external.yml
@ -84,6 +84,7 @@ jobs:
    name: E2E Staging External Runtime
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 25

--- a/.gitea/workflows/e2e-staging-saas.yml
+++ b/.gitea/workflows/e2e-staging-saas.yml
@ -88,17 +88,20 @@ jobs:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 1
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true

      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: "3.11"
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true

      - name: YAML validation (best-effort)
        run: |
          echo "e2e-staging-saas.yml — PR validation: workflow YAML is valid."
          echo "E2E step runs only when provisioning-critical files change."
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true

  # Actual E2E: runs on trunk pushes (main + staging). NOT the PR-fire-only
@ -109,6 +112,7 @@ jobs:
    # Only runs on trunk pushes. PR paths get pr-validate instead.
    if: github.event.pull_request.base.ref == ''
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 45
    permissions:
--- a/.gitea/workflows/e2e-staging-sanity.yml
+++ b/.gitea/workflows/e2e-staging-sanity.yml
@ -37,6 +37,7 @@ jobs:
    name: Intentional-failure teardown sanity
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 20

--- a/.gitea/workflows/gate-check-v3.yml
+++ b/.gitea/workflows/gate-check-v3.yml
@ -46,6 +46,7 @@ env:
 jobs:
  gate-check:
    runs-on: ubuntu-latest
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true  # Never block on our own detector failing
    steps:
      - name: Check out BASE ref (never PR-head under pull_request_target)
@ -76,25 +77,32 @@ jobs:
        if: github.event_name == 'schedule'
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
        run: |
          set -euo pipefail
          # Fetch all open PRs and run gate-check on each
          # socket.setdefaulttimeout(15): defence-in-depth for missing SOP_TIER_CHECK_TOKEN.
          # gate_check.py uses timeout=15 on every urlopen call; this catches the
          # inline Python polling loop too (issue #603).
-          pr_numbers=$(python3 -c "
-            import socket, urllib.request, json, os
-            socket.setdefaulttimeout(15)
-            token = os.environ['GITEA_TOKEN']
-            req = urllib.request.Request(
-                'https://git.moleculesai.app/api/v1/repos/${{ github.repository }}/pulls?state=open&limit=100',
-                headers={'Authorization': f'token {token}', 'Accept': 'application/json'}
-            )
-            with urllib.request.urlopen(req) as r:
-                prs = json.loads(r.read())
-            for pr in prs:
-                print(pr['number'])
-          ")
+          pr_numbers=$(python3 <<'PY'
+          import json
+          import os
+          import socket
+          import urllib.request
+
+          socket.setdefaulttimeout(15)
+          token = os.environ["GITEA_TOKEN"]
+          repo = os.environ["REPO"]
+          req = urllib.request.Request(
+              f"https://git.moleculesai.app/api/v1/repos/{repo}/pulls?state=open&limit=100",
+              headers={"Authorization": f"token {token}", "Accept": "application/json"},
+          )
+          with urllib.request.urlopen(req) as r:
+              prs = json.loads(r.read())
+          for pr in prs:
+              print(pr["number"])
+          PY
+          )
          for pr in $pr_numbers; do
            echo "Checking PR #$pr..."
            python3 tools/gate-check-v3/gate_check.py \
--- a/.gitea/workflows/handlers-postgres-integration.yml
+++ b/.gitea/workflows/handlers-postgres-integration.yml
@ -78,7 +78,8 @@ jobs:
  detect-changes:
    name: detect-changes
    runs-on: ubuntu-latest
-    # internal#219 Phase 3 (RFC §1): surface broken workflows without blocking.
+    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      handlers: ${{ steps.filter.outputs.handlers }}
@ -118,7 +119,8 @@ jobs:
    name: Handlers Postgres Integration
    needs: detect-changes
    runs-on: ubuntu-latest
-    # internal#219 Phase 3 (RFC §1): surface broken workflows without blocking.
+    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    env:
      # Unique name per run so concurrent jobs don't collide on the
--- a/.gitea/workflows/harness-replays.yml
+++ b/.gitea/workflows/harness-replays.yml
@ -63,6 +63,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      run: ${{ steps.decide.outputs.run }}
@ -154,6 +155,7 @@ jobs:
    name: Harness Replays
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 30
    steps:
--- a/.gitea/workflows/lint-continue-on-error-tracking.yml
+++ b/.gitea/workflows/lint-continue-on-error-tracking.yml
@ -1,6 +1,6 @@
 name: lint-continue-on-error-tracking

-# Tier 2e hard-gate lint (per internal#350) — every
+# Tier 2e hard-gate lint (per mc#664) — every
 # `continue-on-error: true` in `.gitea/workflows/*.yml` must carry a
 # `# mc#NNNN` or `# internal#NNNN` tracker comment within 2 lines,
 # the referenced issue must be OPEN, and ≤14 days old.
@ -45,11 +45,11 @@ name: lint-continue-on-error-tracking
 # close-and-flip, or document the deliberate keep-mask in a fresh
 # 14-day-renewable tracker. After main is clean for 3 days,
 # follow-up PR flips this workflow's continue-on-error to false.
-# Tracking: internal#350.
+# Tracking: mc#664.
 #
 # Cross-links
 # -----------
-# - internal#350 (the RFC that specs this lint)
+# - mc#664 (the tracking issue for this lint's rollout)
 # - mc#664 (the empirical masked-3-weeks case)
 # - feedback_chained_defects_in_never_tested_workflows
 # - feedback_behavior_based_ast_gates
@ -96,8 +96,9 @@ jobs:
    # Phase 3 (RFC #219 §1): surface masked defects without blocking
    # PRs. Pre-existing continue-on-error: true directives on main
    # all violate this lint at first — intentional. Flip to false
-    # follow-up after main is clean for 3 days. internal#350.
-    continue-on-error: true  # internal#350 Phase 3 mask — 14d forced-renewal cadence
+    # in a follow-up after main is clean for 3 days. Tracking: mc#664.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
+    continue-on-error: true  # mc#664 Phase 3 mask — 14d forced-renewal cadence
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5.6.0
--- a/.gitea/workflows/lint-curl-status-capture.yml
+++ b/.gitea/workflows/lint-curl-status-capture.yml
@ -45,6 +45,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/lint-mask-pr-atomicity.yml
+++ b/.gitea/workflows/lint-mask-pr-atomicity.yml
@ -1,6 +1,6 @@
 name: lint-mask-pr-atomicity

-# Tier 2d hard-gate lint (per internal#350) — blocks PRs that touch
+# Tier 2d hard-gate lint (per mc#664) — blocks PRs that touch
 # `.gitea/workflows/ci.yml` and modify ONLY ONE of {continue-on-error,
 # all-required.sentinel.needs} without a `Paired: #NNN` reference in
 # the PR body or in a commit message.
@ -37,11 +37,11 @@ name: lint-mask-pr-atomicity
 # This workflow lands at `continue-on-error: true` (Phase 3 — surface
 # regressions without blocking PRs while the rule beds in).
 # Follow-up PR flips to `false` once we have ≥3 days of clean runs on
-# `main` and no false-positives. Tracking issue: internal#350.
+# `main` and no false-positives. Tracking issue: mc#664.
 #
 # Cross-links
 # -----------
-# - internal#350 (the RFC that specs this lint)
+# - mc#664 (the tracking issue for this lint's rollout)
 # - PR#665 / PR#668 (the empirical split-pair)
 # - mc#664 (the main-red incident the split caused)
 # - feedback_strict_root_only_after_class_a
@ -91,7 +91,8 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken shapes without blocking
    # PRs. Follow-up PR flips this to `false` once recent runs on main
    # are confirmed clean (eat-our-own-dogfood discipline mirrors
-    # PR#673's same-shape comment). Tracking: internal#350.
+    # PR#673's same-shape comment). Tracking: mc#664.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - name: Check out PR head with full history (need base SHA blobs)
--- a/.gitea/workflows/lint-workflow-yaml.yml
+++ b/.gitea/workflows/lint-workflow-yaml.yml
@ -55,6 +55,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken shapes without blocking PRs.
    # Follow-up PR flips this off after the 4 existing-on-main rule-2
    # (workflow_run) violations are migrated to a supported trigger.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
--- a/.gitea/workflows/publish-canvas-image.yml
+++ b/.gitea/workflows/publish-canvas-image.yml
@ -62,6 +62,7 @@ jobs:
    # See issue #576 + infra-lead pulse ~00:30Z.
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - name: Checkout
--- a/.gitea/workflows/publish-runtime-autobump.yml
+++ b/.gitea/workflows/publish-runtime-autobump.yml
@ -55,6 +55,7 @@ jobs:
  # The actual bump work happens on the main/staging push after merge.
  pr-validate:
    runs-on: ubuntu-latest
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true  # do not block PR merge on operational failures
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/railway-pin-audit.yml
+++ b/.gitea/workflows/railway-pin-audit.yml
@ -51,6 +51,7 @@ jobs:
    name: Audit Railway env vars for drift-prone pins
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 10

--- a/.gitea/workflows/redeploy-tenants-on-main.yml
+++ b/.gitea/workflows/redeploy-tenants-on-main.yml
@ -86,6 +86,7 @@ jobs:
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 25
    steps:
--- a/.gitea/workflows/redeploy-tenants-on-staging.yml
+++ b/.gitea/workflows/redeploy-tenants-on-staging.yml
@ -76,6 +76,7 @@ jobs:
  redeploy:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 25
    steps:
--- a/.gitea/workflows/review-check-tests.yml
+++ b/.gitea/workflows/review-check-tests.yml
@ -53,6 +53,7 @@ jobs:
        # runners with internet access to package mirrors). Falls back to GitHub
        # binary download. GitHub releases may be blocked on some runner networks
        # (infra#241 follow-up).
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
        run: |
          if apt-get update -qq && apt-get install -y -qq jq; then
--- a/.gitea/workflows/runtime-pin-compat.yml
+++ b/.gitea/workflows/runtime-pin-compat.yml
@ -67,6 +67,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/runtime-prbuild-compat.yml
+++ b/.gitea/workflows/runtime-prbuild-compat.yml
@ -52,6 +52,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      wheel: ${{ steps.decide.outputs.wheel }}
@ -96,6 +97,7 @@ jobs:
    name: PR-built wheel + import smoke
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - name: No-op pass (paths filter excluded this commit)
--- a/.gitea/workflows/secret-pattern-drift.yml
+++ b/.gitea/workflows/secret-pattern-drift.yml
@ -57,6 +57,7 @@ jobs:
    name: Detect SECRET_PATTERNS drift
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 5
    steps:
--- a/.gitea/workflows/sop-tier-check.yml
+++ b/.gitea/workflows/sop-tier-check.yml
@ -64,7 +64,8 @@ jobs:
  tier-check:
    runs-on: ubuntu-latest
    # BURN-IN: continue-on-error prevents AND-composition from blocking
-    # PRs during the 7-day window. Remove after 2026-05-17 (internal#189).
+    # PRs during the 7-day window. Remove after 2026-05-17 (mc#664).
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    permissions:
      contents: read
@ -89,6 +90,7 @@ jobs:
        # runners). The sop-tier-check script has its own fallback as a
        # third line of defense. continue-on-error: true ensures this step
        # failing does not block the job.
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
        run: |
          # apt-get is the primary method — Ubuntu package mirrors are reliably
@ -109,6 +111,7 @@ jobs:
        # continue-on-error: true at step level — job-level is ignored by Gitea
        # Actions (quirk #10, internal runbooks). Belt-and-suspenders with
        # SOP_FAIL_OPEN=1 + || true below.
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
--- a/.gitea/workflows/staging-verify.yml
+++ b/.gitea/workflows/staging-verify.yml
@ -85,6 +85,7 @@ jobs:
  staging-smoke:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      sha: ${{ steps.compute.outputs.sha }}
@ -205,6 +206,7 @@ jobs:
    if: ${{ needs.staging-smoke.result == 'success' && needs.staging-smoke.outputs.smoke_ran == 'true' }}
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    env:
      SHA: ${{ needs.staging-smoke.outputs.sha }}
--- a/.gitea/workflows/sweep-aws-secrets.yml
+++ b/.gitea/workflows/sweep-aws-secrets.yml
@ -29,15 +29,11 @@ name: Sweep stale AWS Secrets Manager secrets
 #     reconciler enumerator) is filed as a separate controlplane
 #     issue. This sweeper is the immediate cost-relief stopgap.
 #
-# AWS credentials: the confirmed Gitea secrets are AWS_ACCESS_KEY_ID /
-# AWS_SECRET_ACCESS_KEY (the molecule-cp IAM user). These are the same
-# credentials used by the rest of the platform. The dedicated
-# AWS_JANITOR_* naming (which the original GitHub workflow used) was
-# never populated in Gitea — the existing secrets are AWS_ACCESS_KEY_ID /
-# AWS_SECRET_ACCESS_KEY (per issue #425 §425 audit). These DO have
-# secretsmanager:ListSecrets (the production molecule-cp principal);
-# if ListSecrets is revoked in future, a dedicated janitor principal
-# would need to be created and the Gitea secret names updated here.
+# AWS credentials: use the dedicated Secrets Manager janitor principal.
+# Do not fall back to the molecule-cp application principal: it does
+# not need account-wide ListSecrets, and a 2026-05-12 CI failure showed
+# that running this sweeper with that least-privilege production
+# credential only turns the scheduled janitor red.
 #
 # Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring
 # sweep-cf-orphans.yml — tenant secrets are durable by design, unlike
@ -65,6 +61,7 @@ jobs:
    name: Sweep AWS Secrets Manager
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    # 30 min cap, mirroring the other janitors. AWS DeleteSecret is
    # fast (~0.3s/call) so even a 100+ backlog drains in seconds
@ -73,8 +70,8 @@ jobs:
    timeout-minutes: 30
    env:
      AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }}
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_SECRETS_JANITOR_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRETS_JANITOR_SECRET_ACCESS_KEY }}
      CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
      CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
--- a/.gitea/workflows/sweep-cf-orphans.yml
+++ b/.gitea/workflows/sweep-cf-orphans.yml
@ -71,6 +71,7 @@ jobs:
    name: Sweep CF orphans
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    # 3 min surfaces hangs (CF API stall, AWS describe-instances stuck)
    # within one cron interval instead of burning a full tick. Realistic
--- a/.gitea/workflows/sweep-cf-tunnels.yml
+++ b/.gitea/workflows/sweep-cf-tunnels.yml
@ -55,6 +55,7 @@ jobs:
    name: Sweep CF tunnels
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    # 30 min cap. Was 5 min on the theory that the only thing that
    # could take >5min is a CF-API hang — but on 2026-05-02 a backlog
--- a/.gitea/workflows/test-ops-scripts.yml
+++ b/.gitea/workflows/test-ops-scripts.yml
@ -46,6 +46,7 @@ jobs:
    name: Ops scripts (unittest)
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
--- a/.gitea/workflows/weekly-platform-go.yml
+++ b/.gitea/workflows/weekly-platform-go.yml
@ -31,6 +31,7 @@ jobs:
    name: Weekly Platform-Go Surface
    runs-on: ubuntu-latest
    # continue-on-error: surface only, never block
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    defaults:
      run:
--- a/scripts/ops/sweep-aws-secrets.sh
+++ b/scripts/ops/sweep-aws-secrets.sh
@ -239,9 +239,9 @@ for s in d.get("SecretList", []):

 # --- Summarize + safety gate ----------------------------------------------

-DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
+DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
 KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT))
-TENANT_SECRETS=$(echo "$DECISIONS" | python3 -c "
+TENANT_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-secret')
 print(n)
@ -256,7 +256,7 @@ log "  would keep:             $KEEP_COUNT"
 log ""

 # Per-reason breakdown of deletes + keep-categories worth seeing
-echo "$DECISIONS" | python3 -c "
+printf '%s' "$DECISIONS" | python3 -c "
 import json,sys,collections
 delete_c = collections.Counter()
 keep_c = collections.Counter()
@ -291,7 +291,7 @@ if [ "$DRY_RUN" = "1" ]; then
  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT secrets."
  log ""
  log "First 20 secrets that would be deleted:"
-  echo "$DECISIONS" | python3 -c "
+  printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 shown = 0
 for l in sys.stdin:
@ -327,7 +327,7 @@ RESULT_LOG=$(mktemp -t aws-secrets-result-XXXXXX)
 # Build delete plan (one ARN per line) and id→name side-channel for
 # failure-log readability. Use ARN rather than Name on the delete
 # call because Name is mutable; ARN is the stable identifier.
-echo "$DECISIONS" | python3 -c '
+printf '%s' "$DECISIONS" | python3 -c '
 import json, sys
 plan_path = sys.argv[1]
 map_path = sys.argv[2]
--- a/scripts/ops/sweep-cf-tunnels.sh
+++ b/scripts/ops/sweep-cf-tunnels.sh
@ -195,9 +195,9 @@ for t in d.get("result", []):

 # --- Summarize + safety gate ----------------------------------------------

-DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
+DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
 KEEP_COUNT=$((TOTAL_TUNNELS - DELETE_COUNT))
-TENANT_TUNNELS=$(echo "$DECISIONS" | python3 -c "
+TENANT_TUNNELS=$(printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-tunnel')
 print(n)
@ -212,7 +212,7 @@ log "  would keep:             $KEEP_COUNT"
 log ""

 # Per-reason breakdown of deletes
-echo "$DECISIONS" | python3 -c "
+printf '%s' "$DECISIONS" | python3 -c "
 import json,sys,collections
 c = collections.Counter()
 for l in sys.stdin:
@ -242,7 +242,7 @@ if [ "$DRY_RUN" = "1" ]; then
  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT tunnels."
  log ""
  log "First 20 tunnels that would be deleted:"
-  echo "$DECISIONS" | python3 -c "
+  printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 shown = 0
 for l in sys.stdin:
@ -283,7 +283,7 @@ RESULT_LOG=$(mktemp -t cf-tunnels-result-XXXXXX)

 # Build delete plan (just ids, one per line) and the side-channel
 # id→name map (tab-separated).
-echo "$DECISIONS" | python3 -c '
+printf '%s' "$DECISIONS" | python3 -c '
 import json, os, sys
 plan_path = sys.argv[1]
 map_path = sys.argv[2]
--- a/workspace-server/internal/handlers/delegation_executor_integration_test.go
+++ b/workspace-server/internal/handlers/delegation_executor_integration_test.go
@ -42,19 +42,19 @@ import (
 	"net"
 	"net/http"
 	"runtime"
+	"strconv"
 	"testing"
 	"time"

 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
-	"github.com/alicebob/miniredis/v2"
 )

 // integrationDB is imported from delegation_ledger_integration_test.go.
 // Each test gets a fresh table state.

 const testDelegationID = "del-159-test-integration"
-const testSourceID    = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
-const testTargetID   = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"
+const testSourceID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
+const testTargetID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"

 // rawHTTPServer starts a TCP listener, serves one HTTP response, and closes.
 // It runs in a background goroutine so the test can proceed immediately after
@ -73,7 +73,7 @@ func rawHTTPServer(t *testing.T, statusCode int, body string) (serverURL string,
 		t.Fatalf("rawHTTPServer listen: %v", err)
 	}
 	port := ln.Addr().(*net.TCPAddr).Port
-	serverURL = "http://127.0.0.1:" + itoa(port) + "/"
+	serverURL = "http://127.0.0.1:" + strconv.Itoa(port) + "/"

 	connCh := make(chan net.Conn, 1)
 	go func() {
@ -125,31 +125,15 @@ func rawHTTPServer(t *testing.T, statusCode int, body string) (serverURL string,
 	return serverURL, closeFn
 }

-// itoa is an inline integer-to-string helper (avoids importing strconv in tests).
-func itoa(n int) string {
-	if n == 0 {
-		return "0"
-	}
-	if n < 0 {
-		return "-" + itoa(-n)
-	}
-	digits := []byte{}
-	for n > 0 {
-		digits = append([]byte{byte('0' + n%10)}, digits...)
-		n /= 10
-	}
-	return string(digits)
-}
-
 // buildHTTPResponse constructs a minimal HTTP/1.1 response.
 func buildHTTPResponse(statusCode int, body string) []byte {
 	statusText := http.StatusText(statusCode)
 	if statusText == "" {
 		statusText = "Unknown"
 	}
-	header := "HTTP/1.1 " + itoa(statusCode) + " " + statusText + "\r\n" +
+	header := "HTTP/1.1 " + strconv.Itoa(statusCode) + " " + statusText + "\r\n" +
 		"Content-Type: application/json\r\n" +
-		"Content-Length: " + itoa(len(body)) + "\r\n" +
+		"Content-Length: " + strconv.Itoa(len(body)) + "\r\n" +
 		"Connection: close\r\n" +
 		"\r\n"
 	return []byte(header + body)
@ -183,7 +167,7 @@ func setupIntegrationFixtures(t *testing.T, conn *sql.DB) func() {

 	reqBody, _ := json.Marshal(map[string]any{
 		"delegation_id": testDelegationID,
-		"task":         "do work",
+		"task":          "do work",
 	})
 	if _, err := conn.ExecContext(ctx, `
 		INSERT INTO activity_logs
@ -245,14 +229,13 @@ func stack() string {
 }

 // runWithTimeout calls fn in a goroutine and fails t if it doesn't return within
-// timeout. cancel is passed to fn so it can propagate cancellation to
+// timeout. The timeout-bound ctx is passed to fn so it can propagate cancellation to
 // executeDelegation's DB and network operations — without this, the goroutine
 // leaks indefinitely when the test times out (context.Background() never cancels).
-// When the timeout fires, cancel() propagates through all blocking ops and the
-// goroutine exits cleanly via runtime.Goexit().
-func runWithTimeout(t *testing.T, timeout time.Duration, fn func(cancel func())) {
+func runWithTimeout(t *testing.T, timeout time.Duration, fn func(context.Context)) {
+	t.Helper()
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	defer cancel() // no-op if ctx expires naturally
+	defer cancel()

 	done := make(chan struct{})
 	var panicErr interface{}
@ -263,7 +246,7 @@ func runWithTimeout(t *testing.T, timeout time.Duration, fn func(cancel func()))
 			}
 			close(done)
 		}()
-		fn(cancel)
+		fn(ctx)
 	}()

 	select {
@ -272,11 +255,8 @@ func runWithTimeout(t *testing.T, timeout time.Duration, fn func(cancel func()))
 			t.Fatalf("executeDelegation panicked: %v\n%s", panicErr, stack())
 		}
 	case <-ctx.Done():
-		// Timeout: cancel the context so executeDelegation's blocking calls
-		// (DB ops, network) unblock. Then exit this goroutine so the
-		// channel closes and the select in the main goroutine can detect
-		// the panic from t.Fatalf and terminate cleanly.
-		runtime.Goexit()
+		cancel()
+		t.Fatalf("executeDelegation timed out after %s\n%s", timeout, stack())
 	}
 }

@ -322,7 +302,7 @@ func TestIntegration_ExecuteDelegation_DeliveryConfirmedProxyError_TreatsAsSucce
 	})

 	start := time.Now()
-	runWithTimeout(t, 30*time.Second, func(cancel func()) {
+	runWithTimeout(t, 30*time.Second, func(ctx context.Context) {
 		dh.executeDelegation(ctx, testSourceID, testTargetID, testDelegationID, a2aBody)
 	})
 	t.Logf("executeDelegation took %v", time.Since(start))
@ -374,7 +354,7 @@ func TestIntegration_ExecuteDelegation_ProxyErrorNon2xx_RemainsFailed(t *testing
 		},
 	})
 	start := time.Now()
-	runWithTimeout(t, 30*time.Second, func(cancel func()) {
+	runWithTimeout(t, 30*time.Second, func(ctx context.Context) {
 		dh.executeDelegation(ctx, testSourceID, testTargetID, testDelegationID, a2aBody)
 	})
 	t.Logf("executeDelegation took %v", time.Since(start))
@ -423,7 +403,7 @@ func TestIntegration_ExecuteDelegation_ProxyErrorEmptyBody_RemainsFailed(t *test
 		},
 	})
 	start := time.Now()
-	runWithTimeout(t, 30*time.Second, func(cancel func()) {
+	runWithTimeout(t, 30*time.Second, func(ctx context.Context) {
 		dh.executeDelegation(ctx, testSourceID, testTargetID, testDelegationID, a2aBody)
 	})
 	t.Logf("executeDelegation took %v", time.Since(start))
@ -471,7 +451,7 @@ func TestIntegration_ExecuteDelegation_CleanProxyResponse_Unchanged(t *testing.T
 		},
 	})
 	start := time.Now()
-	runWithTimeout(t, 30*time.Second, func(cancel func()) {
+	runWithTimeout(t, 30*time.Second, func(ctx context.Context) {
 		dh.executeDelegation(ctx, testSourceID, testTargetID, testDelegationID, a2aBody)
 	})
 	t.Logf("executeDelegation took %v", time.Since(start))
@ -516,7 +496,7 @@ func TestIntegration_ExecuteDelegation_RedisDown_FallsBackToDB(t *testing.T) {
 		},
 	})
 	start := time.Now()
-	runWithTimeout(t, 30*time.Second, func(cancel func()) {
+	runWithTimeout(t, 30*time.Second, func(ctx context.Context) {
 		dh.executeDelegation(ctx, testSourceID, testTargetID, testDelegationID, a2aBody)
 	})
 	t.Logf("executeDelegation took %v", time.Since(start))
--- a/workspace-server/internal/handlers/mcp_test.go
+++ b/workspace-server/internal/handlers/mcp_test.go
@ -441,8 +441,8 @@ func TestMCPHandler_CommitMemory_GlobalScope_Blocked(t *testing.T) {
 	if resp.Error == nil {
 		t.Error("expected JSON-RPC error for GLOBAL scope, got nil")
 	}
-	if resp.Error != nil && !bytes.Contains([]byte(resp.Error.Message), []byte("GLOBAL")) {
-		t.Errorf("error message should mention GLOBAL, got: %s", resp.Error.Message)
+	if resp.Error != nil && resp.Error.Message != "tool call failed" {
+		t.Errorf("client error should use the OFFSEC constant message, got: %s", resp.Error.Message)
 	}
 	if err := mock.ExpectationsWereMet(); err != nil {
 		t.Errorf("unexpected DB calls on GLOBAL scope block: %v", err)