merge: pull origin/main (PR#772 landed; resolve mcp_test.go conflict preserving OFFSEC-001 assertions)
Some checks failed
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 14s
CI / Detect changes (pull_request) Successful in 34s
E2E API Smoke Test / detect-changes (pull_request) Successful in 36s
E2E Staging SaaS (full lifecycle) / E2E Staging SaaS (pull_request) Has been skipped
E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 36s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 22s
Harness Replays / detect-changes (pull_request) Successful in 14s
E2E Staging SaaS (full lifecycle) / pr-validate (pull_request) Successful in 47s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 11s
Runtime PR-Built Compatibility / detect-changes (pull_request) Successful in 25s
qa-review / approved (pull_request) Failing after 10s
gate-check-v3 / gate-check (pull_request) Successful in 18s
security-review / approved (pull_request) Failing after 10s
sop-checklist / all-items-acked (pull_request) acked: 0/7 — missing: comprehensive-testing, local-postgres-e2e, staging-smoke, +4 — body-unfilled: 7
sop-checklist-gate / gate (pull_request) Successful in 10s
CI / Canvas (Next.js) (pull_request) Successful in 4s
sop-tier-check / tier-check (pull_request) Successful in 10s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 2s
CI / Python Lint & Test (pull_request) Successful in 5s
E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 5s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 1m11s
Harness Replays / Harness Replays (pull_request) Successful in 4s
Runtime PR-Built Compatibility / PR-built wheel + import smoke (pull_request) Successful in 6s
CI / Canvas Deploy Reminder (pull_request) Has been skipped
E2E API Smoke Test / E2E API Smoke Test (pull_request) Successful in 3m51s
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Successful in 4m6s
CI / Platform (Go) (pull_request) Successful in 6m33s
CI / all-required (pull_request) Successful in 1s
audit-force-merge / audit (pull_request) Successful in 3s

This commit is contained in:
core-devops 2026-05-13 00:18:16 +00:00
commit 566bafe42c
40 changed files with 201 additions and 108 deletions

View File

@ -37,6 +37,7 @@ jobs:
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
# the PR. Follow-up PR flips this off after surfaced defects are
# triaged.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

View File

@ -48,6 +48,7 @@ jobs:
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
# the PR. Follow-up PR flips this off after surfaced defects are
# triaged.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
steps:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4

View File

@ -45,6 +45,7 @@ jobs:
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
# the PR. Follow-up PR flips this off after surfaced defects are
# triaged.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
timeout-minutes: 5
steps:

View File

@ -148,6 +148,7 @@ jobs:
# a permanent re-mask. Re-flip blocked on mc#664 fix-forward landing.
# Other 4 #656 flips (changes, canvas-build, shellcheck, python-lint)
# retain continue-on-error: false; only platform-build regresses.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true # mc#664 fix-forward in flight; re-flip when mc#664 lands (PR #669 → rebase after #709)
defaults:
run:
@ -186,6 +187,7 @@ jobs:
echo "::group::pendinguploads exit=$pu_exit (last 100 lines)"
tail -100 /tmp/test-pu.log
echo "::endgroup::"
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
- if: needs.changes.outputs.platform == 'true'
name: Run tests with race detection and coverage
@ -372,6 +374,7 @@ jobs:
canvas-deploy-reminder:
name: Canvas Deploy Reminder
runs-on: ubuntu-latest
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
needs: [changes, canvas-build]
# Only fires on direct pushes to main (i.e. after staging→main promotion).

View File

@ -90,6 +90,7 @@ jobs:
name: Synthetic E2E against staging
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
# Bumped from 12 → 20 (2026-05-04). Tenant user-data install phase
# (apt-get update + install docker.io/jq/awscli/caddy + snap install

View File

@ -103,6 +103,7 @@ jobs:
detect-changes:
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
outputs:
api: ${{ steps.decide.outputs.api }}
@ -154,6 +155,7 @@ jobs:
name: E2E API Smoke Test
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
timeout-minutes: 15
env:
@ -164,7 +166,6 @@ jobs:
# we let Docker assign an ephemeral host port.
PG_CONTAINER: pg-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
REDIS_CONTAINER: redis-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
PORT: "8080"
steps:
- name: No-op pass (paths filter excluded this commit)
if: needs.detect-changes.outputs.api != 'true'
@ -268,6 +269,20 @@ jobs:
if: needs.detect-changes.outputs.api == 'true'
working-directory: workspace-server
run: go build -o platform-server ./cmd/server
- name: Pick platform port
if: needs.detect-changes.outputs.api == 'true'
run: |
PLATFORM_PORT=$(python3 - <<'PY'
import socket
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("127.0.0.1", 0))
print(s.getsockname()[1])
PY
)
echo "PORT=${PLATFORM_PORT}" >> "$GITHUB_ENV"
echo "BASE=http://127.0.0.1:${PLATFORM_PORT}" >> "$GITHUB_ENV"
echo "Platform host port: ${PLATFORM_PORT}"
- name: Start platform (background)
if: needs.detect-changes.outputs.api == 'true'
working-directory: workspace-server
@ -280,7 +295,7 @@ jobs:
if: needs.detect-changes.outputs.api == 'true'
run: |
for i in $(seq 1 30); do
if curl -sf http://127.0.0.1:8080/health > /dev/null; then
if curl -sf "$BASE/health" > /dev/null; then
echo "Platform up after ${i}s"
exit 0
fi

View File

@ -70,6 +70,7 @@ jobs:
detect-changes:
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
outputs:
canvas: ${{ steps.decide.outputs.canvas }}
@ -118,6 +119,7 @@ jobs:
name: Canvas tabs E2E
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
timeout-minutes: 40

View File

@ -84,6 +84,7 @@ jobs:
name: E2E Staging External Runtime
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
timeout-minutes: 25

View File

@ -88,17 +88,20 @@ jobs:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 1
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: "3.11"
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
- name: YAML validation (best-effort)
run: |
echo "e2e-staging-saas.yml — PR validation: workflow YAML is valid."
echo "E2E step runs only when provisioning-critical files change."
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
# Actual E2E: runs on trunk pushes (main + staging). NOT the PR-fire-only
@ -109,6 +112,7 @@ jobs:
# Only runs on trunk pushes. PR paths get pr-validate instead.
if: github.event.pull_request.base.ref == ''
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
timeout-minutes: 45
permissions:

View File

@ -37,6 +37,7 @@ jobs:
name: Intentional-failure teardown sanity
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
timeout-minutes: 20

View File

@ -46,6 +46,7 @@ env:
jobs:
gate-check:
runs-on: ubuntu-latest
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true # Never block on our own detector failing
steps:
- name: Check out BASE ref (never PR-head under pull_request_target)
@ -76,25 +77,32 @@ jobs:
if: github.event_name == 'schedule'
env:
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
run: |
set -euo pipefail
# Fetch all open PRs and run gate-check on each
# socket.setdefaulttimeout(15): defence-in-depth for missing SOP_TIER_CHECK_TOKEN.
# gate_check.py uses timeout=15 on every urlopen call; this catches the
# inline Python polling loop too (issue #603).
pr_numbers=$(python3 -c "
import socket, urllib.request, json, os
socket.setdefaulttimeout(15)
token = os.environ['GITEA_TOKEN']
req = urllib.request.Request(
'https://git.moleculesai.app/api/v1/repos/${{ github.repository }}/pulls?state=open&limit=100',
headers={'Authorization': f'token {token}', 'Accept': 'application/json'}
)
with urllib.request.urlopen(req) as r:
prs = json.loads(r.read())
for pr in prs:
print(pr['number'])
")
pr_numbers=$(python3 <<'PY'
import json
import os
import socket
import urllib.request
socket.setdefaulttimeout(15)
token = os.environ["GITEA_TOKEN"]
repo = os.environ["REPO"]
req = urllib.request.Request(
f"https://git.moleculesai.app/api/v1/repos/{repo}/pulls?state=open&limit=100",
headers={"Authorization": f"token {token}", "Accept": "application/json"},
)
with urllib.request.urlopen(req) as r:
prs = json.loads(r.read())
for pr in prs:
print(pr["number"])
PY
)
for pr in $pr_numbers; do
echo "Checking PR #$pr..."
python3 tools/gate-check-v3/gate_check.py \

View File

@ -78,7 +78,8 @@ jobs:
detect-changes:
name: detect-changes
runs-on: ubuntu-latest
# internal#219 Phase 3 (RFC §1): surface broken workflows without blocking.
# mc#664 Phase 3 (RFC §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
outputs:
handlers: ${{ steps.filter.outputs.handlers }}
@ -118,7 +119,8 @@ jobs:
name: Handlers Postgres Integration
needs: detect-changes
runs-on: ubuntu-latest
# internal#219 Phase 3 (RFC §1): surface broken workflows without blocking.
# mc#664 Phase 3 (RFC §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
env:
# Unique name per run so concurrent jobs don't collide on the

View File

@ -63,6 +63,7 @@ jobs:
detect-changes:
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
outputs:
run: ${{ steps.decide.outputs.run }}
@ -154,6 +155,7 @@ jobs:
name: Harness Replays
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
timeout-minutes: 30
steps:

View File

@ -1,6 +1,6 @@
name: lint-continue-on-error-tracking
# Tier 2e hard-gate lint (per internal#350) — every
# Tier 2e hard-gate lint (per mc#664) — every
# `continue-on-error: true` in `.gitea/workflows/*.yml` must carry a
# `# mc#NNNN` or `# internal#NNNN` tracker comment within 2 lines,
# the referenced issue must be OPEN, and ≤14 days old.
@ -45,11 +45,11 @@ name: lint-continue-on-error-tracking
# close-and-flip, or document the deliberate keep-mask in a fresh
# 14-day-renewable tracker. After main is clean for 3 days,
# follow-up PR flips this workflow's continue-on-error to false.
# Tracking: internal#350.
# Tracking: mc#664.
#
# Cross-links
# -----------
# - internal#350 (the RFC that specs this lint)
# - mc#664 (the RFC that specs this lint)
# - mc#664 (the empirical masked-3-weeks case)
# - feedback_chained_defects_in_never_tested_workflows
# - feedback_behavior_based_ast_gates
@ -96,8 +96,9 @@ jobs:
# Phase 3 (RFC #219 §1): surface masked defects without blocking
# PRs. Pre-existing continue-on-error: true directives on main
# all violate this lint at first — intentional. Flip to false
# follow-up after main is clean for 3 days. internal#350.
continue-on-error: true # internal#350 Phase 3 mask — 14d forced-renewal cadence
# follow-up after main is clean for 3 days. mc#664.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true # mc#664 Phase 3 mask — 14d forced-renewal cadence
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0

View File

@ -45,6 +45,7 @@ jobs:
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
# the PR. Follow-up PR flips this off after surfaced defects are
# triaged.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

View File

@ -1,6 +1,6 @@
name: lint-mask-pr-atomicity
# Tier 2d hard-gate lint (per internal#350) — blocks PRs that touch
# Tier 2d hard-gate lint (per mc#664) — blocks PRs that touch
# `.gitea/workflows/ci.yml` and modify ONLY ONE of {continue-on-error,
# all-required.sentinel.needs} without a `Paired: #NNN` reference in
# the PR body or in a commit message.
@ -37,11 +37,11 @@ name: lint-mask-pr-atomicity
# This workflow lands at `continue-on-error: true` (Phase 3 — surface
# regressions without blocking PRs while the rule beds in).
# Follow-up PR flips to `false` once we have ≥3 days of clean runs on
# `main` and no false-positives. Tracking issue: internal#350.
# `main` and no false-positives. Tracking issue: mc#664.
#
# Cross-links
# -----------
# - internal#350 (the RFC that specs this lint)
# - mc#664 (the RFC that specs this lint)
# - PR#665 / PR#668 (the empirical split-pair)
# - mc#664 (the main-red incident the split caused)
# - feedback_strict_root_only_after_class_a
@ -91,7 +91,8 @@ jobs:
# Phase 3 (RFC #219 §1): surface broken shapes without blocking
# PRs. Follow-up PR flips this to `false` once recent runs on main
# are confirmed clean (eat-our-own-dogfood discipline mirrors
# PR#673's same-shape comment). Tracking: internal#350.
# PR#673's same-shape comment). Tracking: mc#664.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
steps:
- name: Check out PR head with full history (need base SHA blobs)

View File

@ -55,6 +55,7 @@ jobs:
# Phase 3 (RFC #219 §1): surface broken shapes without blocking PRs.
# Follow-up PR flips this off after the 4 existing-on-main rule-2
# (workflow_run) violations are migrated to a supported trigger.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

View File

@ -62,6 +62,7 @@ jobs:
# See issue #576 + infra-lead pulse ~00:30Z.
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
steps:
- name: Checkout

View File

@ -55,6 +55,7 @@ jobs:
# The actual bump work happens on the main/staging push after merge.
pr-validate:
runs-on: ubuntu-latest
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true # do not block PR merge on operational failures
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

View File

@ -51,6 +51,7 @@ jobs:
name: Audit Railway env vars for drift-prone pins
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
timeout-minutes: 10

View File

@ -86,6 +86,7 @@ jobs:
if: ${{ github.event.workflow_run.conclusion == 'success' }}
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
timeout-minutes: 25
steps:

View File

@ -76,6 +76,7 @@ jobs:
redeploy:
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
timeout-minutes: 25
steps:

View File

@ -53,6 +53,7 @@ jobs:
# runners with internet access to package mirrors). Falls back to GitHub
# binary download. GitHub releases may be blocked on some runner networks
# (infra#241 follow-up).
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
run: |
if apt-get update -qq && apt-get install -y -qq jq; then

View File

@ -67,6 +67,7 @@ jobs:
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
# the PR. Follow-up PR flips this off after surfaced defects are
# triaged.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

View File

@ -52,6 +52,7 @@ jobs:
detect-changes:
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
outputs:
wheel: ${{ steps.decide.outputs.wheel }}
@ -96,6 +97,7 @@ jobs:
name: PR-built wheel + import smoke
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
steps:
- name: No-op pass (paths filter excluded this commit)

View File

@ -57,6 +57,7 @@ jobs:
name: Detect SECRET_PATTERNS drift
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
timeout-minutes: 5
steps:

View File

@ -64,7 +64,8 @@ jobs:
tier-check:
runs-on: ubuntu-latest
# BURN-IN: continue-on-error prevents AND-composition from blocking
# PRs during the 7-day window. Remove after 2026-05-17 (internal#189).
# PRs during the 7-day window. Remove after 2026-05-17 (mc#664).
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
permissions:
contents: read
@ -89,6 +90,7 @@ jobs:
# runners). The sop-tier-check script has its own fallback as a
# third line of defense. continue-on-error: true ensures this step
# failing does not block the job.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
run: |
# apt-get is the primary method — Ubuntu package mirrors are reliably
@ -109,6 +111,7 @@ jobs:
# continue-on-error: true at step level — job-level is ignored by Gitea
# Actions (quirk #10, internal runbooks). Belt-and-suspenders with
# SOP_FAIL_OPEN=1 + || true below.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
env:
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}

View File

@ -85,6 +85,7 @@ jobs:
staging-smoke:
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
outputs:
sha: ${{ steps.compute.outputs.sha }}
@ -205,6 +206,7 @@ jobs:
if: ${{ needs.staging-smoke.result == 'success' && needs.staging-smoke.outputs.smoke_ran == 'true' }}
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
env:
SHA: ${{ needs.staging-smoke.outputs.sha }}

View File

@ -29,15 +29,11 @@ name: Sweep stale AWS Secrets Manager secrets
# reconciler enumerator) is filed as a separate controlplane
# issue. This sweeper is the immediate cost-relief stopgap.
#
# AWS credentials: the confirmed Gitea secrets are AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY (the molecule-cp IAM user). These are the same
# credentials used by the rest of the platform. The dedicated
# AWS_JANITOR_* naming (which the original GitHub workflow used) was
# never populated in Gitea — the existing secrets are AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY (per issue #425 §425 audit). These DO have
# secretsmanager:ListSecrets (the production molecule-cp principal);
# if ListSecrets is revoked in future, a dedicated janitor principal
# would need to be created and the Gitea secret names updated here.
# AWS credentials: use the dedicated Secrets Manager janitor principal.
# Do not fall back to the molecule-cp application principal: it does
# not need account-wide ListSecrets, and a 2026-05-12 CI failure proved
# that using it here turns a least-privilege production credential into
# a red scheduled janitor.
#
# Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring
# sweep-cf-orphans.yml — tenant secrets are durable by design, unlike
@ -65,6 +61,7 @@ jobs:
name: Sweep AWS Secrets Manager
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
# 30 min cap, mirroring the other janitors. AWS DeleteSecret is
# fast (~0.3s/call) so even a 100+ backlog drains in seconds
@ -73,8 +70,8 @@ jobs:
timeout-minutes: 30
env:
AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_SECRETS_JANITOR_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRETS_JANITOR_SECRET_ACCESS_KEY }}
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}

View File

@ -71,6 +71,7 @@ jobs:
name: Sweep CF orphans
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
# 3 min surfaces hangs (CF API stall, AWS describe-instances stuck)
# within one cron interval instead of burning a full tick. Realistic

View File

@ -55,6 +55,7 @@ jobs:
name: Sweep CF tunnels
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
# 30 min cap. Was 5 min on the theory that the only thing that
# could take >5min is a CF-API hang — but on 2026-05-02 a backlog

View File

@ -46,6 +46,7 @@ jobs:
name: Ops scripts (unittest)
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

View File

@ -31,6 +31,7 @@ jobs:
name: Weekly Platform-Go Surface
runs-on: ubuntu-latest
# continue-on-error: surface only, never block
# mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
defaults:
run:

View File

@ -239,9 +239,9 @@ for s in d.get("SecretList", []):
# --- Summarize + safety gate ----------------------------------------------
DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT))
TENANT_SECRETS=$(echo "$DECISIONS" | python3 -c "
TENANT_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c "
import json, sys
n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-secret')
print(n)
@ -256,7 +256,7 @@ log " would keep: $KEEP_COUNT"
log ""
# Per-reason breakdown of deletes + keep-categories worth seeing
echo "$DECISIONS" | python3 -c "
printf '%s' "$DECISIONS" | python3 -c "
import json,sys,collections
delete_c = collections.Counter()
keep_c = collections.Counter()
@ -291,7 +291,7 @@ if [ "$DRY_RUN" = "1" ]; then
log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT secrets."
log ""
log "First 20 secrets that would be deleted:"
echo "$DECISIONS" | python3 -c "
printf '%s' "$DECISIONS" | python3 -c "
import json, sys
shown = 0
for l in sys.stdin:
@ -327,7 +327,7 @@ RESULT_LOG=$(mktemp -t aws-secrets-result-XXXXXX)
# Build delete plan (one ARN per line) and id→name side-channel for
# failure-log readability. Use ARN rather than Name on the delete
# call because Name is mutable; ARN is the stable identifier.
echo "$DECISIONS" | python3 -c '
printf '%s' "$DECISIONS" | python3 -c '
import json, sys
plan_path = sys.argv[1]
map_path = sys.argv[2]

View File

@ -195,9 +195,9 @@ for t in d.get("result", []):
# --- Summarize + safety gate ----------------------------------------------
DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
KEEP_COUNT=$((TOTAL_TUNNELS - DELETE_COUNT))
TENANT_TUNNELS=$(echo "$DECISIONS" | python3 -c "
TENANT_TUNNELS=$(printf '%s' "$DECISIONS" | python3 -c "
import json, sys
n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-tunnel')
print(n)
@ -212,7 +212,7 @@ log " would keep: $KEEP_COUNT"
log ""
# Per-reason breakdown of deletes
echo "$DECISIONS" | python3 -c "
printf '%s' "$DECISIONS" | python3 -c "
import json,sys,collections
c = collections.Counter()
for l in sys.stdin:
@ -242,7 +242,7 @@ if [ "$DRY_RUN" = "1" ]; then
log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT tunnels."
log ""
log "First 20 tunnels that would be deleted:"
echo "$DECISIONS" | python3 -c "
printf '%s' "$DECISIONS" | python3 -c "
import json, sys
shown = 0
for l in sys.stdin:
@ -283,7 +283,7 @@ RESULT_LOG=$(mktemp -t cf-tunnels-result-XXXXXX)
# Build delete plan (just ids, one per line) and the side-channel
# id→name map (tab-separated).
echo "$DECISIONS" | python3 -c '
printf '%s' "$DECISIONS" | python3 -c '
import json, os, sys
plan_path = sys.argv[1]
map_path = sys.argv[2]

View File

@ -392,6 +392,25 @@ func (h *DelegationHandler) executeDelegation(ctx context.Context, sourceID, tar
return
}
if status >= 200 && status < 300 && len(respBody) == 0 {
errMsg := "workspace agent returned empty response"
log.Printf("Delegation %s: step=handling_failure err=%s", delegationID, errMsg)
h.updateDelegationStatus(ctx, sourceID, delegationID, "failed", errMsg)
if _, err := db.DB.ExecContext(ctx, `
INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, target_id, summary, status, error_detail)
VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4, 'failed', $5)
`, sourceID, sourceID, targetID, "Delegation failed", errMsg); err != nil {
log.Printf("Delegation %s: failed to insert empty-response error log: %v", delegationID, err)
}
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationFailed), sourceID, map[string]interface{}{
"delegation_id": delegationID, "target_id": targetID, "error": errMsg,
})
pushDelegationResultToInbox(ctx, sourceID, delegationID, "failed", "", errMsg)
return
}
handleSuccess:
log.Printf("Delegation %s: step=handle_success status=%d", delegationID, status)
@ -797,4 +816,3 @@ func extractResponseText(body []byte) string {
}
return string(body)
}

View File

@ -42,19 +42,19 @@ import (
"net"
"net/http"
"runtime"
"strconv"
"testing"
"time"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/alicebob/miniredis/v2"
)
// integrationDB is imported from delegation_ledger_integration_test.go.
// Each test gets a fresh table state.
const testDelegationID = "del-159-test-integration"
const testSourceID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
const testTargetID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"
const testSourceID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
const testTargetID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"
// rawHTTPServer starts a TCP listener, serves one HTTP response, and closes.
// It runs in a background goroutine so the test can proceed immediately after
@ -73,7 +73,7 @@ func rawHTTPServer(t *testing.T, statusCode int, body string) (serverURL string,
t.Fatalf("rawHTTPServer listen: %v", err)
}
port := ln.Addr().(*net.TCPAddr).Port
serverURL = "http://127.0.0.1:" + itoa(port) + "/"
serverURL = "http://127.0.0.1:" + strconv.Itoa(port) + "/"
connCh := make(chan net.Conn, 1)
go func() {
@ -125,31 +125,15 @@ func rawHTTPServer(t *testing.T, statusCode int, body string) (serverURL string,
return serverURL, closeFn
}
// itoa is an inline integer-to-string helper (avoids importing strconv in tests).
func itoa(n int) string {
if n == 0 {
return "0"
}
if n < 0 {
return "-" + itoa(-n)
}
digits := []byte{}
for n > 0 {
digits = append([]byte{byte('0' + n%10)}, digits...)
n /= 10
}
return string(digits)
}
// buildHTTPResponse constructs a minimal HTTP/1.1 response.
func buildHTTPResponse(statusCode int, body string) []byte {
statusText := http.StatusText(statusCode)
if statusText == "" {
statusText = "Unknown"
}
header := "HTTP/1.1 " + itoa(statusCode) + " " + statusText + "\r\n" +
header := "HTTP/1.1 " + strconv.Itoa(statusCode) + " " + statusText + "\r\n" +
"Content-Type: application/json\r\n" +
"Content-Length: " + itoa(len(body)) + "\r\n" +
"Content-Length: " + strconv.Itoa(len(body)) + "\r\n" +
"Connection: close\r\n" +
"\r\n"
return []byte(header + body)
@ -183,7 +167,7 @@ func setupIntegrationFixtures(t *testing.T, conn *sql.DB) func() {
reqBody, _ := json.Marshal(map[string]any{
"delegation_id": testDelegationID,
"task": "do work",
"task": "do work",
})
if _, err := conn.ExecContext(ctx, `
INSERT INTO activity_logs
@ -245,14 +229,13 @@ func stack() string {
}
// runWithTimeout calls fn in a goroutine and fails t if it doesn't return within
// timeout. cancel is passed to fn so it can propagate cancellation to
// timeout. ctx is passed to fn so it can propagate cancellation to
// executeDelegation's DB and network operations — without this, the goroutine
// leaks indefinitely when the test times out (context.Background() never cancels).
// When the timeout fires, cancel() propagates through all blocking ops and the
// goroutine exits cleanly via runtime.Goexit().
func runWithTimeout(t *testing.T, timeout time.Duration, fn func(cancel func())) {
func runWithTimeout(t *testing.T, timeout time.Duration, fn func(context.Context)) {
t.Helper()
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel() // no-op if ctx expires naturally
defer cancel()
done := make(chan struct{})
var panicErr interface{}
@ -263,7 +246,7 @@ func runWithTimeout(t *testing.T, timeout time.Duration, fn func(cancel func()))
}
close(done)
}()
fn(cancel)
fn(ctx)
}()
select {
@ -272,11 +255,8 @@ func runWithTimeout(t *testing.T, timeout time.Duration, fn func(cancel func()))
t.Fatalf("executeDelegation panicked: %v\n%s", panicErr, stack())
}
case <-ctx.Done():
// Timeout: cancel the context so executeDelegation's blocking calls
// (DB ops, network) unblock. Then exit this goroutine so the
// channel closes and the select in the main goroutine can detect
// the panic from t.Fatalf and terminate cleanly.
runtime.Goexit()
cancel()
t.Fatalf("executeDelegation timed out after %s\n%s", timeout, stack())
}
}
@ -322,7 +302,7 @@ func TestIntegration_ExecuteDelegation_DeliveryConfirmedProxyError_TreatsAsSucce
})
start := time.Now()
runWithTimeout(t, 30*time.Second, func(cancel func()) {
runWithTimeout(t, 30*time.Second, func(ctx context.Context) {
dh.executeDelegation(ctx, testSourceID, testTargetID, testDelegationID, a2aBody)
})
t.Logf("executeDelegation took %v", time.Since(start))
@ -374,7 +354,7 @@ func TestIntegration_ExecuteDelegation_ProxyErrorNon2xx_RemainsFailed(t *testing
},
})
start := time.Now()
runWithTimeout(t, 30*time.Second, func(cancel func()) {
runWithTimeout(t, 30*time.Second, func(ctx context.Context) {
dh.executeDelegation(ctx, testSourceID, testTargetID, testDelegationID, a2aBody)
})
t.Logf("executeDelegation took %v", time.Since(start))
@ -423,7 +403,7 @@ func TestIntegration_ExecuteDelegation_ProxyErrorEmptyBody_RemainsFailed(t *test
},
})
start := time.Now()
runWithTimeout(t, 30*time.Second, func(cancel func()) {
runWithTimeout(t, 30*time.Second, func(ctx context.Context) {
dh.executeDelegation(ctx, testSourceID, testTargetID, testDelegationID, a2aBody)
})
t.Logf("executeDelegation took %v", time.Since(start))
@ -471,7 +451,7 @@ func TestIntegration_ExecuteDelegation_CleanProxyResponse_Unchanged(t *testing.T
},
})
start := time.Now()
runWithTimeout(t, 30*time.Second, func(cancel func()) {
runWithTimeout(t, 30*time.Second, func(ctx context.Context) {
dh.executeDelegation(ctx, testSourceID, testTargetID, testDelegationID, a2aBody)
})
t.Logf("executeDelegation took %v", time.Since(start))
@ -516,7 +496,7 @@ func TestIntegration_ExecuteDelegation_RedisDown_FallsBackToDB(t *testing.T) {
},
})
start := time.Now()
runWithTimeout(t, 30*time.Second, func(cancel func()) {
runWithTimeout(t, 30*time.Second, func(ctx context.Context) {
dh.executeDelegation(ctx, testSourceID, testTargetID, testDelegationID, a2aBody)
})
t.Logf("executeDelegation took %v", time.Since(start))

View File

@ -154,10 +154,28 @@ func (l *DelegationLedger) SetStatus(ctx context.Context,
return err
}
// Same-status replay (e.g. duplicate completion notification): no-op,
// don't bump updated_at, no error.
// Same-status replay (e.g. duplicate completion notification): usually a
// no-op. If the replay carries terminal detail that the first write lacked,
// fill the missing nullable column once. This keeps duplicate notifications
// idempotent while preserving the first observed result/error when a legacy
// path wrote the terminal status before it had the detail payload.
if current == status {
return nil
if errorDetail == "" && resultPreview == "" {
return nil
}
_, err = l.db.ExecContext(ctx, `
UPDATE delegations
SET error_detail = COALESCE(error_detail, NULLIF($2, '')),
result_preview = COALESCE(result_preview, NULLIF($3, '')),
updated_at = CASE
WHEN (error_detail IS NULL AND NULLIF($2, '') IS NOT NULL)
OR (result_preview IS NULL AND NULLIF($3, '') IS NOT NULL)
THEN now()
ELSE updated_at
END
WHERE delegation_id = $1
`, delegationID, errorDetail, textutil.TruncateBytesNoMarker(resultPreview, previewCap))
return err
}
// Forward-only on terminal states.

View File

@ -150,16 +150,11 @@ func TestIntegration_ResultPreviewPreservedThroughCompletion(t *testing.T) {
}
}
// TestIntegration_ResultPreviewBuggyOrderIsLost — DIAGNOSTIC test that
// confirms the ORIGINAL buggy order does lose the preview. Useful when
// auditing similar wiring elsewhere.
//
// This is documented behavior: it asserts the same-status replay no-op
// works as designed in DelegationLedger.SetStatus. The fix in
// delegation.go is to AVOID this order, not to change SetStatus's
// same-status semantics (which the operator dashboard relies on for
// idempotent completion notifications).
func TestIntegration_ResultPreviewBuggyOrderIsLost(t *testing.T) {
// Same-status terminal replays remain idempotent, but if the first terminal
// write lacked result_preview, a later same-status replay carrying the preview
// should fill that missing field once. This protects legacy call ordering and
// mirrors the failure-path error_detail repair.
func TestIntegration_ResultPreviewSameStatusReplayFillsMissingPreview(t *testing.T) {
conn := integrationDB(t)
t.Setenv("DELEGATION_LEDGER_WRITE", "1")
@ -167,16 +162,17 @@ func TestIntegration_ResultPreviewBuggyOrderIsLost(t *testing.T) {
caller := "11111111-1111-1111-1111-111111111111"
callee := "22222222-2222-2222-2222-222222222222"
// BUGGY sequence in production-shape order: queued → dispatched →
// completed (no preview) → completed (preview ignored as same-status).
// Legacy sequence: queued → dispatched → completed (no preview) →
// completed (preview). The second completed replay should repair the
// missing preview without changing status.
recordLedgerInsert(context.Background(), caller, callee, id, "the question", "")
recordLedgerStatus(context.Background(), id, "dispatched", "", "") // pre-completion stage
recordLedgerStatus(context.Background(), id, "completed", "", "") // inner first
recordLedgerStatus(context.Background(), id, "completed", "", "the answer") // outer same-status no-op
recordLedgerStatus(context.Background(), id, "dispatched", "", "")
recordLedgerStatus(context.Background(), id, "completed", "", "")
recordLedgerStatus(context.Background(), id, "completed", "", "the answer")
_, preview, _ := readLedgerRow(t, conn, id)
if preview != "" {
t.Errorf("buggy-order preview was unexpectedly non-empty: %q (SetStatus same-status no-op contract may have changed)", preview)
if preview != "the answer" {
t.Errorf("same-status replay should fill missing preview; got %q", preview)
}
}

View File

@ -226,6 +226,25 @@ func TestLedgerSetStatus_SameStatusReplay_NoUpdate(t *testing.T) {
}
}
func TestLedgerSetStatus_SameStatusReplay_FillsMissingDetail(t *testing.T) {
mock := setupTestDB(t)
l := NewDelegationLedger(nil)
mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`).
WithArgs("d-1").
WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("failed"))
mock.ExpectExec(`UPDATE delegations\s+SET error_detail = COALESCE\(error_detail, NULLIF\(\$2, ''\)\),\s+result_preview = COALESCE\(result_preview, NULLIF\(\$3, ''\)\),\s+updated_at = CASE`).
WithArgs("d-1", "agent returned empty response", "").
WillReturnResult(sqlmock.NewResult(0, 1))
if err := l.SetStatus(context.Background(), "d-1", "failed", "agent returned empty response", ""); err != nil {
t.Errorf("same-status detail fill should succeed, got err: %v", err)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet: %v", err)
}
}
func TestLedgerSetStatus_MissingRowIsNoOp(t *testing.T) {
// A SetStatus call that arrives before Insert (lost INSERT, race, etc.)
// must NOT error — it's a transient inconsistency the next agent retry