Compare commits
No commits in common. "main" and "fix/codeql-continue-on-error-156" have entirely different histories.
main
...
fix/codeql
@ -1,118 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# audit-force-merge — detect a §SOP-6 force-merge after PR close, emit
# `incident.force_merge` to stdout as structured JSON.
#
# Vector's docker_logs source picks up runner stdout; the JSON gets
# shipped to Loki on molecule-canonical-obs, indexable by event_type.
# Query example:
#
# {host="operator"} |= "event_type" |= "incident.force_merge" | json
#
# A force-merge is detected when a PR closed-with-merged=true had at
# least one of the repo's required-status-check contexts in a state
# other than "success" at the merge commit's SHA. That's exactly what
# the Gitea force_merge:true API call lets through, so it's a faithful
# detector of the override path.
#
# Triggers on `pull_request_target: closed` (loaded from base branch
# per §SOP-6 security model). No-op when merged=false.
#
# Required env (set by the workflow):
# GITEA_TOKEN, GITEA_HOST, REPO, PR_NUMBER, REQUIRED_CHECKS
#
# REQUIRED_CHECKS is a newline-separated list of status-check context
# names that branch protection requires. Declared in the workflow YAML
# rather than fetched from /branch_protections (which needs admin
# scope — sop-tier-bot has read-only). Trade dynamism for simplicity:
# when the required-check set changes, update both branch protection
# AND this env. Keeping them in sync is less complexity than granting
# the audit bot admin perms on every repo.

set -euo pipefail

: "${GITEA_TOKEN:?required}"
: "${GITEA_HOST:?required}"
: "${REPO:?required}"
: "${PR_NUMBER:?required}"
: "${REQUIRED_CHECKS:?required (newline-separated context names)}"

OWNER="${REPO%%/*}"
NAME="${REPO##*/}"
API="https://${GITEA_HOST}/api/v1"
AUTH="Authorization: token ${GITEA_TOKEN}"

# 1. Fetch the PR. If not merged, no-op.
PR=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}")
MERGED=$(echo "$PR" | jq -r '.merged // false')
if [ "$MERGED" != "true" ]; then
  echo "::notice::PR #${PR_NUMBER} closed without merge — no audit emission."
  exit 0
fi

MERGE_SHA=$(echo "$PR" | jq -r '.merge_commit_sha // empty') || true
MERGED_BY=$(echo "$PR" | jq -r '.merged_by.login // "unknown"') || true
TITLE=$(echo "$PR" | jq -r '.title // ""') || true
BASE_BRANCH=$(echo "$PR" | jq -r '.base.ref // "main"') || true
HEAD_SHA=$(echo "$PR" | jq -r '.head.sha // empty') || true

if [ -z "$MERGE_SHA" ]; then
  echo "::warning::PR #${PR_NUMBER} merged=true but no merge_commit_sha — cannot evaluate force-merge."
  exit 0
fi

# Guard HEAD_SHA too: it is the SHA actually queried below, and an empty
# value would produce a malformed `/commits//status` URL (the original
# only guarded MERGE_SHA, which is used solely for reporting).
if [ -z "$HEAD_SHA" ]; then
  echo "::warning::PR #${PR_NUMBER} merged=true but no head.sha — cannot evaluate force-merge."
  exit 0
fi

# 2. Required status checks declared in the workflow env.
REQUIRED="$REQUIRED_CHECKS"
if [ -z "${REQUIRED//[[:space:]]/}" ]; then
  echo "::notice::REQUIRED_CHECKS empty — force-merge not applicable."
  exit 0
fi

# 3. Status-check state at the PR HEAD (where checks ran). The merge
#    commit doesn't get its own checks; we evaluate the PR's last
#    commit, which is what branch protection compared against.
STATUS=$(curl -sS -H "$AUTH" \
  "${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/status")
declare -A CHECK_STATE
while IFS=$'\t' read -r ctx state; do
  # Quote the subscript: context names contain spaces/parens,
  # e.g. "ci / lint (pull_request)".
  [ -n "$ctx" ] && CHECK_STATE["$ctx"]="$state"
done < <(echo "$STATUS" | jq -r '.statuses // [] | .[] | "\(.context)\t\(.status)"') || true

# 4. For each required check, was it green at merge? YAML block scalars
#    (`|`) leave a trailing newline; skip blank/whitespace-only lines.
FAILED_CHECKS=()
while IFS= read -r req; do
  trimmed="${req#"${req%%[![:space:]]*}"}" # ltrim
  trimmed="${trimmed%"${trimmed##*[![:space:]]}"}" # rtrim
  [ -z "$trimmed" ] && continue
  # "missing" covers contexts that never reported at all (Gitea treats
  # absent-as-pending; we treat any non-success as a force-merge signal).
  state="${CHECK_STATE[$trimmed]:-missing}"
  if [ "$state" != "success" ]; then
    FAILED_CHECKS+=("${trimmed}=${state}")
  fi
done <<< "$REQUIRED"

if [ "${#FAILED_CHECKS[@]}" -eq 0 ]; then
  echo "::notice::PR #${PR_NUMBER} merged with all required checks green — not a force-merge."
  exit 0
fi

# 5. Emit structured audit event.
NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
FAILED_JSON=$(printf '%s\n' "${FAILED_CHECKS[@]}" | jq -R . | jq -s .) || true

# Print as a single-line JSON so Vector's parse_json transform can pick
# it up cleanly from docker_logs.
jq -nc \
  --arg event_type "incident.force_merge" \
  --arg ts "$NOW" \
  --arg repo "$REPO" \
  --argjson pr "$PR_NUMBER" \
  --arg title "$TITLE" \
  --arg base "$BASE_BRANCH" \
  --arg merged_by "$MERGED_BY" \
  --arg merge_sha "$MERGE_SHA" \
  --argjson failed_checks "$FAILED_JSON" \
  '{event_type: $event_type, ts: $ts, repo: $repo, pr: $pr, title: $title,
    base_branch: $base, merged_by: $merged_by, merge_sha: $merge_sha,
    failed_checks: $failed_checks}'

echo "::warning::FORCE-MERGE detected on PR #${PR_NUMBER} by ${MERGED_BY}: ${#FAILED_CHECKS[@]} required check(s) not green at merge time."
|
|
||||||
@ -1,644 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""ci-required-drift — RFC internal#219 §4 + §6.
|
|
||||||
|
|
||||||
Detects drift between three sources of "what counts as a required check"
|
|
||||||
for this repo, files (or updates) a `[ci-drift]` Gitea issue when any
|
|
||||||
pair diverges.
|
|
||||||
|
|
||||||
Sources:
|
|
||||||
A. `.gitea/workflows/ci.yml` jobs (CI source — the actual job set)
|
|
||||||
B. `status_check_contexts` in branch_protections (the merge gate)
|
|
||||||
C. `REQUIRED_CHECKS` env in audit-force-merge.yml (the audit env)
|
|
||||||
|
|
||||||
Three failure classes:
|
|
||||||
F1 Job in (A) is not under the sentinel's `needs:` — sentinel
|
|
||||||
doesn't gate it, so a red job on that name can sneak through.
|
|
||||||
Ignores jobs whose `if:` references `github.event_name` (those
|
|
||||||
run only on specific events and may be `skipped` legitimately).
|
|
||||||
F2 Context in (B) corresponds to no emitter — i.e. there's no job
|
|
||||||
in ci.yml whose runtime status-name maps to that context.
|
|
||||||
A stale required-check name is silent: protection demands a
|
|
||||||
green it never receives, but Gitea treats absent-as-pending,
|
|
||||||
not absent-as-red. The gate degrades to advisory.
|
|
||||||
F3 (B) and (C) are not set-equal. Audit env wider than protection
|
|
||||||
→ audit flags non-force-merges as force; narrower → real
|
|
||||||
force-merges are missed.
|
|
||||||
|
|
||||||
Idempotency:
|
|
||||||
Searches OPEN issues by exact title prefix
|
|
||||||
`[ci-drift] {repo}/{branch}: ` and either edits the existing one
|
|
||||||
(if any) or POSTs a new one. Never spawns duplicates.
|
|
||||||
|
|
||||||
Behavior-based AST gate per `feedback_behavior_based_ast_gates`:
|
|
||||||
- Job set comes from PyYAML parse of jobs:* keys
|
|
||||||
- Sentinel needs from PyYAML parse of jobs[sentinel].needs (a list)
|
|
||||||
- Audit env from PyYAML parse, NOT grep — so reformatting the YAML
|
|
||||||
(block-scalar `|` vs flow-style list) does not break the gate
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import urllib.error
|
|
||||||
import urllib.parse
|
|
||||||
import urllib.request
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import yaml # PyYAML 6.0.2 — installed by the workflow before this runs.
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Environment
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def env(key: str, *, required: bool = True, default: str | None = None) -> str:
|
|
||||||
val = os.environ.get(key, default)
|
|
||||||
if required and not val:
|
|
||||||
sys.stderr.write(f"::error::missing required env var: {key}\n")
|
|
||||||
sys.exit(2)
|
|
||||||
return val or ""
|
|
||||||
|
|
||||||
|
|
||||||
# Runtime configuration, read at import time. required=False everywhere so
# that tests can import this module (and individual functions) without the
# full env contract; main() enforces the contract via _require_runtime_env().
GITEA_TOKEN = env("GITEA_TOKEN", required=False)
GITEA_HOST = env("GITEA_HOST", required=False)
REPO = env("REPO", required=False)
BRANCHES = env("BRANCHES", required=False).split()  # whitespace-separated branch names
SENTINEL_JOB = env("SENTINEL_JOB", required=False)
AUDIT_WORKFLOW_PATH = env("AUDIT_WORKFLOW_PATH", required=False)
CI_WORKFLOW_PATH = env("CI_WORKFLOW_PATH", required=False)
DRIFT_LABEL = env("DRIFT_LABEL", required=False)

# "owner/name" split; padding with [""] keeps the unpack safe if REPO ever
# lacks a "/" (then NAME is "").
OWNER, NAME = (REPO.split("/", 1) + [""])[:2] if REPO else ("", "")
API = f"https://{GITEA_HOST}/api/v1" if GITEA_HOST else ""
|
|
||||||
|
|
||||||
|
|
||||||
def _require_runtime_env() -> None:
|
|
||||||
"""Enforce env contract — called from `main()` only. Tests import
|
|
||||||
individual functions without setting the full env contract."""
|
|
||||||
for key in (
|
|
||||||
"GITEA_TOKEN",
|
|
||||||
"GITEA_HOST",
|
|
||||||
"REPO",
|
|
||||||
"BRANCHES",
|
|
||||||
"SENTINEL_JOB",
|
|
||||||
"AUDIT_WORKFLOW_PATH",
|
|
||||||
"CI_WORKFLOW_PATH",
|
|
||||||
"DRIFT_LABEL",
|
|
||||||
):
|
|
||||||
if not os.environ.get(key):
|
|
||||||
sys.stderr.write(f"::error::missing required env var: {key}\n")
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Tiny HTTP helper (no requests dependency)
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
class ApiError(RuntimeError):
    """A Gitea API call cannot be trusted to have succeeded.

    Raised for non-2xx HTTP responses, and for 2xx responses whose body
    is not parseable JSON on endpoints documented to return JSON
    (search/read shapes). Swallowing this and carrying on is dangerous:
    a transient 500 that hides an existing `[ci-drift]` issue would lead
    the caller to create a duplicate. Failing one hourly cron cycle
    loudly is acceptable; silent duplicate creation is not (per
    Five-Axis review on PR #112).
    """
|
|
||||||
|
|
||||||
|
|
||||||
def api(
    method: str,
    path: str,
    *,
    body: dict | None = None,
    query: dict[str, str] | None = None,
    expect_json: bool = True,
) -> tuple[int, Any]:
    """Minimal urllib-based Gitea API call; returns (status, parsed_body).

    Any non-2xx response raises ApiError — best-effort callers (e.g.
    label-apply) must opt in to soft failure with an explicit
    `try/except ApiError`. Making failure-soft opt-in (not the default)
    closes the duplicate-issue regression class.

    A 2xx response whose body fails to parse as JSON also raises
    ApiError when `expect_json=True` (the default, suited to read-shaped
    endpoints). Endpoints with known non-JSON success echoes (see
    `feedback_gitea_create_api_unparseable_response`) may pass
    `expect_json=False` and receive `{"_raw": <decoded body>}` — but
    such callers MUST confirm success with a follow-up GET rather than
    trusting that body.
    """
    target = f"{API}{path}"
    if query:
        target = f"{target}?{urllib.parse.urlencode(query)}"

    headers = {
        "Authorization": f"token {GITEA_TOKEN}",
        "Accept": "application/json",
    }
    payload = None
    if body is not None:
        payload = json.dumps(body).encode("utf-8")
        headers["Content-Type"] = "application/json"

    request = urllib.request.Request(target, method=method, data=payload, headers=headers)
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            http_status = response.status
            content = response.read()
    except urllib.error.HTTPError as exc:
        # HTTPError still carries a readable body; keep it for diagnostics.
        http_status = exc.code
        content = exc.read()

    if http_status < 200 or http_status >= 300:
        snippet = content[:500].decode("utf-8", errors="replace") if content else ""
        raise ApiError(
            f"{method} {path} → HTTP {http_status}: {snippet}"
        )

    if not content:
        return http_status, None
    try:
        return http_status, json.loads(content)
    except json.JSONDecodeError as exc:
        if expect_json:
            raise ApiError(
                f"{method} {path} → HTTP {http_status} but body is not JSON: {exc}"
            ) from exc
        # Opt-in raw fallthrough for endpoints with known echo-quirks.
        return http_status, {"_raw": content.decode("utf-8", errors="replace")}
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# YAML loaders — STRICT (reject GitHub-Actions-only syntax)
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def load_yaml(path: str) -> dict:
    """Read `path` and parse it as a YAML mapping; exit(3) on any failure.

    Drift detection cannot make decisions without knowing the actual job
    set, so a missing file, a parse error, or a non-mapping top-level
    document are all hard failures.
    """
    if not os.path.exists(path):
        sys.stderr.write(f"::error::file not found: {path}\n")
        sys.exit(3)
    with open(path, encoding="utf-8") as handle:
        try:
            parsed = yaml.safe_load(handle)
        except yaml.YAMLError as exc:
            sys.stderr.write(f"::error::YAML parse error in {path}: {exc}\n")
            sys.exit(3)
    if isinstance(parsed, dict):
        return parsed
    sys.stderr.write(f"::error::{path} is not a YAML mapping\n")
    sys.exit(3)
|
|
||||||
|
|
||||||
|
|
||||||
def ci_jobs_all(ci_doc: dict) -> set[str]:
    """Every job key in ci.yml except the sentinel itself.

    Feeds F1b (the sentinel.needs typo check): a `needs:` entry naming a
    non-existent job is a typo regardless of event-gating, so event-gated
    jobs are deliberately NOT excluded here (contrast ci_job_names).
    """
    job_map = ci_doc.get("jobs")
    if not isinstance(job_map, dict):
        sys.stderr.write("::error::ci.yml has no jobs: mapping\n")
        sys.exit(3)
    return set(job_map) - {SENTINEL_JOB}
|
|
||||||
|
|
||||||
|
|
||||||
def ci_job_names(ci_doc: dict) -> set[str]:
    """Job keys the sentinel is expected to gate (input to F1).

    Excludes the sentinel itself and any job whose `if:` expression
    mentions `github.event_name`: event-scoped jobs may legitimately be
    `skipped` for a given trigger, and placing them under the sentinel
    `needs:` would make every PR-only job read as `skipped != success`
    (i.e. failure) on push. RFC §4 spec.

    NOT used for F1b (typos in needs) — see `ci_jobs_all` for that.
    """
    job_map = ci_doc.get("jobs")
    if not isinstance(job_map, dict):
        sys.stderr.write("::error::ci.yml has no jobs: mapping\n")
        sys.exit(3)

    def event_gated(spec: object) -> bool:
        # A job is event-gated when its `if:` string references github.event_name.
        if not isinstance(spec, dict):
            return False
        condition = spec.get("if")
        return isinstance(condition, str) and "github.event_name" in condition

    return {
        key
        for key, spec in job_map.items()
        if key != SENTINEL_JOB and not event_gated(spec)
    }
|
|
||||||
|
|
||||||
|
|
||||||
def sentinel_needs(ci_doc: dict) -> set[str]:
    """Return the sentinel job's `needs:` entries as a set.

    Accepts both the YAML shorthand (a bare string) and the usual list
    form. A missing sentinel job, or a `needs:` of any other shape, is a
    malformed workflow and exits 3.
    """
    sentinel_spec = ci_doc.get("jobs", {}).get(SENTINEL_JOB)
    if not isinstance(sentinel_spec, dict):
        sys.stderr.write(
            f"::error::sentinel job '{SENTINEL_JOB}' not found in {CI_WORKFLOW_PATH}\n"
        )
        sys.exit(3)
    declared = sentinel_spec.get("needs", [])
    if isinstance(declared, str):
        return {declared}
    if isinstance(declared, list):
        return set(declared)
    sys.stderr.write("::error::sentinel `needs:` is neither list nor string\n")
    sys.exit(3)
|
|
||||||
|
|
||||||
|
|
||||||
def required_checks_env(audit_doc: dict) -> set[str]:
    """Extract the REQUIRED_CHECKS env value from audit-force-merge.yml.

    Walks the parsed YAML AST (per `feedback_behavior_based_ast_gates`)
    rather than grepping the text, so block-scalar vs flow-style
    reformatting, multi-job workflows, or moving the env to a different
    step cannot break the gate. Exactly one step may declare
    REQUIRED_CHECKS: zero means a broken audit workflow, two is
    ambiguous — both exit 3.
    """
    jobs = audit_doc.get("jobs", {})
    if not isinstance(jobs, dict):
        sys.stderr.write(f"::warning::{AUDIT_WORKFLOW_PATH} has no jobs: mapping\n")
        return set()

    candidates: list[str] = []
    for job_spec in jobs.values():
        if not isinstance(job_spec, dict):
            continue
        for step in job_spec.get("steps", []) or []:
            if not isinstance(step, dict):
                continue
            step_env = step.get("env") or {}
            if not isinstance(step_env, dict):
                continue
            value = step_env.get("REQUIRED_CHECKS")
            if isinstance(value, str):
                candidates.append(value)

    if not candidates:
        sys.stderr.write(
            f"::error::REQUIRED_CHECKS env not found in any step of {AUDIT_WORKFLOW_PATH}\n"
        )
        sys.exit(3)
    if len(candidates) > 1:
        # Defensive: refuse to guess which one is canonical.
        sys.stderr.write(
            f"::error::REQUIRED_CHECKS env present in {len(candidates)} steps; ambiguous\n"
        )
        sys.exit(3)

    # Block scalars (`|`) leave a trailing newline + blank lines; trim the
    # same way audit-force-merge.sh's parser does so both sides produce
    # identical sets.
    return {entry.strip() for entry in candidates[0].splitlines() if entry.strip()}
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Mapping: ci.yml job-key → protection context name
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def expected_context(job_key: str, workflow_name: str = "ci") -> str:
    """Map a ci.yml job key to the status-check context Gitea Actions emits.

    Gitea reports contexts as "{workflow.name} / {job.name or job.key}
    ({event})". For ci.yml, `status_check_contexts` records the
    `pull_request` event, and this repo's ci.yml sets no per-job `name:`,
    so the job key doubles as the human-readable name.
    """
    return "{} / {} (pull_request)".format(workflow_name, job_key)
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Drift detection
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def detect_drift(branch: str) -> tuple[list[str], dict]:
    """Returns (findings, debug). Empty findings == no drift.

    Raises:
        ApiError: propagated from the protection fetch only when the
            failure is likely a transient Gitea outage (5xx).
            403/404 from the protection endpoint is treated as
            "cannot determine drift for this branch" — a token-
            scope issue (missing repo-admin on DRIFT_BOT_TOKEN) or
            a repo with no protection set should not turn the
            hourly cron red. The workflow continues to the next
            branch; no [ci-drift] issue is filed for a branch
            whose protection cannot be read.
    """
    findings: list[str] = []

    # Sources (A) and (C): parse both workflow YAMLs up front.
    # load_yaml exits(3) itself on missing/unparseable files.
    ci_doc = load_yaml(CI_WORKFLOW_PATH)
    audit_doc = load_yaml(AUDIT_WORKFLOW_PATH)

    jobs = ci_job_names(ci_doc)        # sentinel-gateable jobs (F1)
    jobs_all = ci_jobs_all(ci_doc)     # all jobs incl. event-gated (F1b)
    needs = sentinel_needs(ci_doc)
    env_set = required_checks_env(audit_doc)

    # Protection
    # api() raises ApiError on non-2xx. Transient 5xx should fail loud.
    # 403/404 means the token lacks repo-admin scope (Gitea 1.22.6's
    # branch_protections endpoint requires it — see DRIFT_BOT_TOKEN
    # provisioning trail in ci-required-drift.yml). Treat as
    # "cannot determine drift for this branch" — skip without turning
    # the workflow red. Surface a clear diagnostic so the operator
    # knows what to fix.
    contexts: set[str] = set()
    protection_path = f"/repos/{OWNER}/{NAME}/branch_protections/{branch}"
    try:
        _, protection = api("GET", protection_path)
    except ApiError as e:
        # Isolate the HTTP status from the error message.
        # ApiError message format: "{method} {path} → HTTP {status}: {body}"
        # NOTE(review): this regex couples to api()'s exact wording; if that
        # message changes, 403/404 handling silently degrades to `raise` —
        # confirm this coupling is pinned by a test.
        http_status: int | None = None
        msg = str(e)
        import re as _re

        m = _re.search(r"HTTP (\d{3})", msg)
        if m:
            http_status = int(m.group(1))
        if http_status in (403, 404):
            # Token lacks scope OR branch has no protection. Cannot
            # determine drift — skip this branch. Do NOT exit non-zero;
            # the issue IS the alarm, not a red workflow.
            sys.stderr.write(
                f"::error::GET {protection_path} returned HTTP {http_status} — "
                f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 "
                f"requires it for this endpoint) OR branch has no protection "
                f"configured. Cannot determine drift for {branch}; "
                f"skipping. Fix: grant repo-admin to mc-drift-bot or "
                f"configure protection on {branch}.\n"
            )
            # Debug payload mirrors the success-path shape, minus the
            # unreadable protection contexts (flagged via *_skipped).
            debug = {
                "branch": branch,
                "ci_jobs": sorted(jobs),
                "sentinel_needs": sorted(needs),
                "protection_contexts_skipped": True,
                "protection_http_status": http_status,
                "audit_env_checks": sorted(env_set),
            }
            return [], debug
        # 5xx — propagate (transient outage, fail loud per design).
        raise
    if not isinstance(protection, dict):
        sys.stderr.write(
            f"::error::protection response for {branch} not a JSON object\n"
        )
        sys.exit(4)
    contexts = set(protection.get("status_check_contexts") or [])

    # ----- F1: job exists in CI but not under sentinel.needs -----
    missing_from_needs = sorted(jobs - needs)
    if missing_from_needs:
        findings.append(
            "F1 — jobs in ci.yml NOT under sentinel `needs:` (sentinel doesn't gate them):\n"
            + "\n".join(f"  - {n}" for n in missing_from_needs)
        )

    # ----- F1b: needs lists a job that doesn't exist (typo) -----
    # Compare against jobs_all (incl. event-gated jobs); a typo is a
    # typo regardless of `if:` gating.
    stale_needs = sorted(needs - jobs_all)
    if stale_needs:
        findings.append(
            "F1b — sentinel `needs:` lists jobs NOT present in ci.yml (typo or removed job):\n"
            + "\n".join(f"  - {n}" for n in stale_needs)
        )

    # ----- F2: protection context has no emitting job -----
    # Compute the contexts the CI YAML actually produces. The sentinel
    # is in (B) intentionally (`ci / all-required (pull_request)`); we
    # whitelist it explicitly.
    emitted_contexts = {expected_context(j) for j in jobs} | {expected_context(SENTINEL_JOB)}
    # Contexts NOT produced by ci.yml may still come from other
    # workflows in the repo (Secret scan etc). We can't enumerate
    # every workflow's emissions cheaply; instead, flag only contexts
    # whose prefix is `ci / ` (this workflow's emissions) and which
    # don't appear in `emitted_contexts`. This narrows F2 to the
    # failure class the RFC actually targets without producing noise
    # from cross-workflow emitters.
    stale_protection = sorted(
        c for c in contexts if c.startswith("ci / ") and c not in emitted_contexts
    )
    if stale_protection:
        findings.append(
            "F2 — protection `status_check_contexts` entries with `ci / ` prefix that NO "
            "job in ci.yml emits (stale name → silent advisory gate):\n"
            + "\n".join(f"  - {c}" for c in stale_protection)
        )

    # ----- F3: audit env vs protection contexts (set-equal) -----
    only_in_env = sorted(env_set - contexts)
    only_in_protection = sorted(contexts - env_set)
    if only_in_env:
        findings.append(
            "F3a — audit-force-merge.yml `REQUIRED_CHECKS` env has contexts NOT in "
            f"branch_protections/{branch}.status_check_contexts (audit would flag "
            "non-force-merges as force):\n"
            + "\n".join(f"  - {c}" for c in only_in_env)
        )
    if only_in_protection:
        findings.append(
            "F3b — branch_protections/{br}.status_check_contexts has contexts NOT in "
            "audit-force-merge.yml `REQUIRED_CHECKS` env (real force-merges would be "
            "missed):\n".format(br=branch)
            + "\n".join(f"  - {c}" for c in only_in_protection)
        )

    debug = {
        "branch": branch,
        "ci_jobs": sorted(jobs),
        "sentinel_needs": sorted(needs),
        "protection_contexts": sorted(contexts),
        "audit_env_checks": sorted(env_set),
        "expected_contexts": sorted(emitted_contexts),
    }
    return findings, debug
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Issue file/update
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def title_for(branch: str) -> str:
    """Stable issue title used as the idempotency key for one branch.

    Deliberately contains no timestamp or SHA — each hourly run must find
    and edit the same open issue rather than filing a new one.
    """
    return "[ci-drift] {}/{}: required-checks divergence detected".format(REPO, branch)
|
|
||||||
|
|
||||||
|
|
||||||
def find_open_issue(title: str) -> dict | None:
    """Locate the open `[ci-drift]` issue whose title matches `title` exactly.

    None means "search succeeded, no match" — NOT "search failed".
    Transient API failures surface as ApiError from api() and are
    deliberately not swallowed: per the Five-Axis review on PR #112,
    returning None on error once caused the caller to POST a duplicate
    issue. The cron retries hourly, so one loud failed cycle is strictly
    better than silent duplication.

    Pagination: a single page (limit 50) is assumed sufficient while
    `[ci-drift]` issues remain a tiny minority of open issues. (See the
    follow-up issue for Link-header pagination.)
    """
    _, results = api(
        "GET",
        f"/repos/{OWNER}/{NAME}/issues",
        query={"state": "open", "type": "issues", "limit": "50"},
    )
    if not isinstance(results, list):
        raise ApiError(
            f"issue search returned non-list body (got {type(results).__name__})"
        )
    return next((item for item in results if item.get("title") == title), None)
|
|
||||||
|
|
||||||
|
|
||||||
def render_body(branch: str, findings: list[str], debug: dict) -> str:
    """Render the full markdown body for the `[ci-drift]` issue.

    Layout: header, the findings verbatim, a resolution playbook keyed by
    failure class, then the machine-readable debug dump fenced as JSON.
    """
    header = [
        f"# Drift detected on `{REPO}/{branch}`",
        "",
        "Auto-filed by `.gitea/workflows/ci-required-drift.yml` "
        "(RFC [internal#219](https://git.moleculesai.app/molecule-ai/internal/issues/219) §4 + §6).",
        "",
        "## Findings",
        "",
    ]
    footer = [
        "",
        "## Resolution",
        "",
        "- **F1 / F1b**: add the missing job to `all-required.needs:` "
        "in `.gitea/workflows/ci.yml`, or remove the stale entry.",
        "- **F2**: rename the protection context to match an emitter, "
        "or remove it from `status_check_contexts` "
        "(PATCH `/api/v1/repos/{owner}/{repo}/branch_protections/{branch}`).",
        "- **F3a / F3b**: bring `REQUIRED_CHECKS` env in "
        "`.gitea/workflows/audit-force-merge.yml` into set-equality with "
        "`status_check_contexts` (single PR, both files).",
        "",
        "## Debug",
        "",
        "```json",
        json.dumps(debug, indent=2, sort_keys=True),
        "```",
        "",
        "_This issue is idempotent: drift-detect runs hourly at `:17` "
        "and edits this body in place. Close the issue once the drift "
        "is fixed; the next hourly run will reopen if drift returns._",
    ]
    return "\n".join(header + list(findings) + footer)
|
|
||||||
|
|
||||||
|
|
||||||
def file_or_update(
    branch: str,
    findings: list[str],
    debug: dict,
    *,
    dry_run: bool = False,
) -> None:
    """File a new `[ci-drift]` issue, or PATCH the existing one in place.

    `dry_run=True` skips every side-effecting Gitea call (issue
    search, POST, PATCH, label apply) and prints the would-be issue
    title + body to stdout. Useful for local testing and for
    debugging drift output without polluting the issue tracker.
    """
    title = title_for(branch)
    body = render_body(branch, findings, debug)

    if dry_run:
        # Fix: the group/endgroup lines below were f-strings with no
        # placeholders — plain literals now (behavior unchanged).
        print(f"::notice::[dry-run] would file/update drift issue for {branch}")
        print("::group::[dry-run] title")
        print(title)
        print("::endgroup::")
        print("::group::[dry-run] body")
        print(body)
        print("::endgroup::")
        return

    # Idempotency: edit the existing open issue (if any) instead of
    # creating a duplicate. find_open_issue lets ApiError propagate on
    # search failure — never guess "no match" from an error.
    existing = find_open_issue(title)
    if existing:
        num = existing["number"]
        api(
            "PATCH",
            f"/repos/{OWNER}/{NAME}/issues/{num}",
            body={"body": body},
        )
        print(f"::notice::Updated existing drift issue #{num} for {branch}")
        return

    _, created = api(
        "POST",
        f"/repos/{OWNER}/{NAME}/issues",
        body={"title": title, "body": body, "labels": []},
    )
    if not isinstance(created, dict):
        sys.stderr.write("::error::POST issue response not a JSON object\n")
        sys.exit(5)
    new_num = created.get("number")
    print(f"::warning::Filed new drift issue #{new_num} for {branch}")

    # Apply label by name (Gitea's add-labels endpoint accepts label IDs;
    # look up id by name once). Best-effort: failure to label is logged
    # but does not fail the audit run — the issue itself IS the alarm.
    try:
        _, labels = api("GET", f"/repos/{OWNER}/{NAME}/labels")
    except ApiError as e:
        sys.stderr.write(f"::warning::could not list labels: {e}\n")
        return
    label_id = None
    if isinstance(labels, list):
        for lbl in labels:
            if lbl.get("name") == DRIFT_LABEL:
                label_id = lbl.get("id")
                break
    if label_id is not None and new_num:
        try:
            api(
                "POST",
                f"/repos/{OWNER}/{NAME}/issues/{new_num}/labels",
                body={"labels": [label_id]},
            )
        except ApiError as e:
            # Soft failure is explicit and logged, per the api() contract.
            sys.stderr.write(
                f"::warning::could not apply label '{DRIFT_LABEL}' to #{new_num}: {e}\n"
            )
    else:
        sys.stderr.write(f"::warning::label '{DRIFT_LABEL}' not found on repo\n")
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Main
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
|
||||||
p = argparse.ArgumentParser(
|
|
||||||
prog="ci-required-drift",
|
|
||||||
description="Detect drift between ci.yml, branch_protections, "
|
|
||||||
"and audit-force-merge.yml REQUIRED_CHECKS env.",
|
|
||||||
)
|
|
||||||
p.add_argument(
|
|
||||||
"--dry-run",
|
|
||||||
action="store_true",
|
|
||||||
help="Detect + print findings to stdout; do NOT file or PATCH "
|
|
||||||
"the `[ci-drift]` issue. Useful for local testing and for "
|
|
||||||
"previewing output before turning the workflow loose.",
|
|
||||||
)
|
|
||||||
return p.parse_args(argv)
|
|
||||||
|
|
||||||
|
|
||||||
def main(argv: list[str] | None = None) -> int:
    """Audit every monitored branch and file/refresh drift issues.

    Always returns 0, even on drift: the filed issue is the actionable
    alarm surface, not a red workflow. `api()` raising ApiError is the
    only non-zero exit path, by design — a transient Gitea outage
    should fail loudly.
    """
    options = _parse_args(argv)
    _require_runtime_env()

    for branch in BRANCHES:
        findings, debug = detect_drift(branch)
        if not findings:
            print(f"::notice::No drift on {branch}.")
        else:
            print(f"::warning::Drift detected on {branch}:")
            for finding in findings:
                print(finding)
            file_or_update(branch, findings, debug, dry_run=options.dry_run)
        # Always dump the per-branch debug snapshot for the Loki trail.
        print(json.dumps(debug, indent=2, sort_keys=True))

    # Exit 0 even on drift — the issue IS the alarm. A red workflow here
    # would page on a CI rename until the issue is opened, doubling the
    # noise.
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
|
||||||
@ -1,40 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""Extract changed-file list from Gitea Compare API JSON response.
|
|
||||||
|
|
||||||
Gitea Compare API returns changed files nested inside commits, not at the
|
|
||||||
top level:
|
|
||||||
{"commits": [{"files": [{"filename": "path/to/file"}]}]}
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
compare-api-diff-files.py < API_RESPONSE.json
|
|
||||||
|
|
||||||
Exits 0 with filenames on stdout, one per line.
|
|
||||||
Exits 1 on malformed input (caller should handle as "no files").
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
    """Print changed-file names nested in a Gitea Compare API response.

    Reads the JSON body from stdin and writes one filename per line to
    stdout (nothing when there are no files). Exits 1 on malformed
    input so the caller can treat the result as "no files".
    """
    try:
        data = json.load(sys.stdin)
    except Exception:
        sys.exit(1)
    # Bug fix: the docstring promises a clean exit-1 on malformed input,
    # but a top-level JSON array/scalar previously escaped as an
    # AttributeError traceback from `.get`. Normalize to the contract.
    if not isinstance(data, dict):
        sys.exit(1)

    filenames: list[str] = []
    for commit in data.get("commits", []) or []:
        if not isinstance(commit, dict):
            continue
        for entry in commit.get("files", []) or []:
            if not isinstance(entry, dict):
                continue
            name = entry.get("filename", "")
            if name:
                filenames.append(name)

    if filenames:
        sys.stdout.write("\n".join(filenames))
        sys.stdout.write("\n")
    # else: empty stdout = no files, caller treats as empty list


if __name__ == "__main__":
    main()
|
|
||||||
@ -1,404 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""lint-required-no-paths — structural enforcement of
|
|
||||||
`feedback_path_filtered_workflow_cant_be_required`.
|
|
||||||
|
|
||||||
For every workflow whose status-check context appears in
|
|
||||||
`branch_protections/<branch>.status_check_contexts`, assert that the
|
|
||||||
workflow's `on:` block has NO `paths:` and NO `paths-ignore:` filter.
|
|
||||||
|
|
||||||
A required-check workflow with a paths filter silently degrades the
|
|
||||||
merge gate:
|
|
||||||
|
|
||||||
- If the PR's diff doesn't match the `paths:` glob, the workflow
|
|
||||||
never fires.
|
|
||||||
- Gitea (1.22.6) reports the required context as `pending` (never as
|
|
||||||
`skipped == success`), so the PR cannot merge.
|
|
||||||
- For a docs-only PR against `paths: ['**.go']`, the PR is
|
|
||||||
blocked forever — no human action can produce a green.
|
|
||||||
|
|
||||||
The class was previously prevented only by reviewer vigilance + the
|
|
||||||
saved memory `feedback_path_filtered_workflow_cant_be_required`. This
|
|
||||||
script makes it a hard CI gate so a future PR adding `paths:` to a
|
|
||||||
required workflow fails fast at PR time, not after merge when the next
|
|
||||||
docs PR wedges main.
|
|
||||||
|
|
||||||
The lint runs as `.gitea/workflows/lint-required-no-paths.yml` on every
|
|
||||||
PR. The lint workflow ITSELF must not have a paths-filter (otherwise it
|
|
||||||
could be circumvented by a paths-non-matching PR) — that's enforced by
|
|
||||||
self-reference and by the workflow's own `on:` block deliberately
|
|
||||||
omitting filters.
|
|
||||||
|
|
||||||
Sources of truth:
|
|
||||||
- `branch_protections/<branch>` `status_check_contexts` (the merge gate)
|
|
||||||
- `.gitea/workflows/*.yml` `name:` + `on:` (the workflow set)
|
|
||||||
|
|
||||||
Context-format note (Gitea 1.22.6):
|
|
||||||
Status-check contexts are formatted `{workflow_name} / {job_name_or_key} ({event})`.
|
|
||||||
We parse the workflow_name prefix and walk `.gitea/workflows/*.yml` for
|
|
||||||
a file whose `name:` attr matches. (The filename is NOT the source of
|
|
||||||
truth; `name:` is, because Gitea formats the context from `name:`.)
|
|
||||||
|
|
||||||
Exit codes:
|
|
||||||
0 — no required workflow has a paths/paths-ignore filter (clean) OR
|
|
||||||
branch_protections endpoint returned 403/404 (token-scope issue;
|
|
||||||
surfaced via ::error:: but non-fatal so a missing scope doesn't
|
|
||||||
red-X every PR — fix the token, not the lint).
|
|
||||||
1 — at least one required workflow has a paths/paths-ignore filter
|
|
||||||
(the gate-degrading defect class).
|
|
||||||
2 — env contract violation (missing GITEA_TOKEN/HOST/REPO/BRANCH).
|
|
||||||
3 — workflows directory missing or workflow YAML unparseable.
|
|
||||||
4 — protection response shape unexpected (non-dict body on 2xx).
|
|
||||||
|
|
||||||
Auth note: `GET /repos/.../branch_protections/{branch}` requires
|
|
||||||
repo-admin role in Gitea 1.22.6. The workflow-default `GITHUB_TOKEN`
|
|
||||||
is non-admin; we re-use `DRIFT_BOT_TOKEN` (same persona that powers
|
|
||||||
ci-required-drift.yml). If `DRIFT_BOT_TOKEN` is unavailable in a future
|
|
||||||
context, the script falls through gracefully (exit 0 + ::error::).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import urllib.error
|
|
||||||
import urllib.parse
|
|
||||||
import urllib.request
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import yaml # PyYAML 6.0.2 — installed by the workflow before this runs.
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Environment
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def _env(key: str, *, required: bool = True, default: str | None = None) -> str:
|
|
||||||
val = os.environ.get(key, default)
|
|
||||||
if required and not val:
|
|
||||||
sys.stderr.write(f"::error::missing required env var: {key}\n")
|
|
||||||
sys.exit(2)
|
|
||||||
return val or ""
|
|
||||||
|
|
||||||
|
|
||||||
# Runtime configuration. Everything is optional at import time so tests
# can import individual functions; `_require_runtime_env()` enforces the
# full contract before any network call is made.
GITEA_TOKEN = _env("GITEA_TOKEN", required=False)
GITEA_HOST = _env("GITEA_HOST", required=False)
REPO = _env("REPO", required=False)
BRANCH = _env("BRANCH", required=False, default="main")
WORKFLOWS_DIR = _env("WORKFLOWS_DIR", required=False, default=".gitea/workflows")

# "owner/name" → ("owner", "name"); both empty when REPO is unset.
OWNER, NAME = (REPO.split("/", 1) + [""])[:2] if REPO else ("", "")
# API base; empty string when GITEA_HOST is unset (never dereferenced
# before _require_runtime_env()).
API = f"https://{GITEA_HOST}/api/v1" if GITEA_HOST else ""
|
|
||||||
|
|
||||||
|
|
||||||
def _require_runtime_env() -> None:
|
|
||||||
"""Enforce env contract — called from `run()` only. Tests import
|
|
||||||
individual functions without setting the full env contract."""
|
|
||||||
for key in ("GITEA_TOKEN", "GITEA_HOST", "REPO", "BRANCH"):
|
|
||||||
if not os.environ.get(key):
|
|
||||||
sys.stderr.write(f"::error::missing required env var: {key}\n")
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Tiny HTTP helper (mirrors ci-required-drift.py contract:
|
|
||||||
# raise on non-2xx and on JSON-decode-fail when JSON expected, per
|
|
||||||
# `feedback_api_helper_must_raise_not_return_dict`).
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
class ApiError(RuntimeError):
    """Signals a Gitea API response that cannot be trusted: a non-2xx
    status, or a body that fails to decode as JSON when JSON was
    expected."""
|
|
||||||
|
|
||||||
|
|
||||||
def api(
    method: str,
    path: str,
    *,
    body: dict | None = None,
    query: dict[str, str] | None = None,
    expect_json: bool = True,
) -> tuple[int, Any]:
    """Call the Gitea API and return ``(status, decoded_body)``.

    Raises ApiError on any non-2xx status and on a JSON-decode failure
    when ``expect_json`` is True (per
    `feedback_api_helper_must_raise_not_return_dict`). An empty 2xx body
    yields ``(status, None)``; a non-JSON 2xx body with
    ``expect_json=False`` yields ``(status, {"_raw": <text>})``.
    """
    url = f"{API}{path}"
    if query:
        url = f"{url}?{urllib.parse.urlencode(query)}"

    headers = {
        "Authorization": f"token {GITEA_TOKEN}",
        "Accept": "application/json",
    }
    payload = None
    if body is not None:
        payload = json.dumps(body).encode("utf-8")
        headers["Content-Type"] = "application/json"

    request = urllib.request.Request(
        url, method=method, data=payload, headers=headers
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            raw = response.read()
            status = response.status
    except urllib.error.HTTPError as err:
        # HTTPError still carries a readable body; keep it for the
        # snippet in the ApiError message below.
        raw = err.read()
        status = err.code

    if status < 200 or status >= 300:
        snippet = raw[:500].decode("utf-8", errors="replace") if raw else ""
        raise ApiError(f"{method} {path} → HTTP {status}: {snippet}")

    if not raw:
        return status, None
    try:
        return status, json.loads(raw)
    except json.JSONDecodeError as err:
        if expect_json:
            raise ApiError(
                f"{method} {path} → HTTP {status} but body is not JSON: {err}"
            ) from err
        return status, {"_raw": raw.decode("utf-8", errors="replace")}
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Status-check context parser
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Format: "<workflow_name> / <job_name_or_key> (<event>)"
|
|
||||||
# Examples observed on molecule-core/main:
|
|
||||||
# "Secret scan / Scan diff for credential-shaped strings (pull_request)"
|
|
||||||
# "sop-tier-check / tier-check (pull_request)"
|
|
||||||
#
|
|
||||||
# Split strategy: peel off the trailing ` (<event>)` first, then split
|
|
||||||
# the leading `<workflow> / <rest>` on the FIRST ` / ` (workflow names
|
|
||||||
# come from `name:` attrs which conventionally don't embed ' / '; job
|
|
||||||
# names CAN, so we keep the rest of the slash-divided text as the job
|
|
||||||
# name). This matches Gitea's `name: ` semantics.
|
|
||||||
_CONTEXT_RE = re.compile(r"^(?P<workflow>.+?) / (?P<job>.+) \((?P<event>[^)]+)\)$")
|
|
||||||
|
|
||||||
|
|
||||||
def parse_context(ctx: str) -> tuple[str, str, str] | None:
|
|
||||||
"""Parse `<workflow> / <job> (<event>)` → (workflow, job, event) or None."""
|
|
||||||
if not ctx:
|
|
||||||
return None
|
|
||||||
m = _CONTEXT_RE.match(ctx)
|
|
||||||
if not m:
|
|
||||||
return None
|
|
||||||
return m.group("workflow"), m.group("job"), m.group("event")
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# workflow-name → file resolution
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def _iter_workflow_files() -> list[Path]:
    """Sorted list of every workflow YAML under WORKFLOWS_DIR.

    Exits 3 when the directory is absent. Collects both `.yml` and
    `.yaml` — Gitea accepts either suffix, and a future port that uses
    the rarer `.yaml` must not be silently missed.
    """
    workflows = Path(WORKFLOWS_DIR)
    if not workflows.is_dir():
        sys.stderr.write(f"::error::workflows directory not found: {workflows}\n")
        sys.exit(3)
    candidates = [
        candidate
        for pattern in ("*.yml", "*.yaml")
        for candidate in workflows.glob(pattern)
    ]
    return sorted(candidates)
|
|
||||||
|
|
||||||
|
|
||||||
def resolve_workflow_file(workflow_name: str) -> Path | None:
    """Find the YAML file whose `name:` attribute equals `workflow_name`.

    Returns None when nothing matches. The filename is deliberately NOT
    used as a fallback: Gitea derives the status-check context from
    `name:`, so a `name:`-less workflow would not appear in the
    protection list in the first place. Exits 3 on unparseable YAML.
    """
    for candidate in _iter_workflow_files():
        try:
            parsed = yaml.safe_load(candidate.read_text(encoding="utf-8"))
        except yaml.YAMLError as exc:
            sys.stderr.write(f"::error::YAML parse error in {candidate}: {exc}\n")
            sys.exit(3)
        if isinstance(parsed, dict) and parsed.get("name") == workflow_name:
            return candidate
    return None
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# paths-filter detection
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Triggers that accept `paths:` / `paths-ignore:` (per GitHub Actions /
|
|
||||||
# Gitea Actions docs): pull_request, pull_request_target, push.
|
|
||||||
# We don't enumerate — any sub-key named `paths` or `paths-ignore`
|
|
||||||
# inside an event mapping is flagged.
|
|
||||||
# Filter keys that degrade a required check (accepted by pull_request,
# pull_request_target and push triggers). Events are NOT enumerated —
# ANY sub-key with one of these names inside an event mapping is flagged.
_PATHS_KEYS = ("paths", "paths-ignore")


def detect_paths_filters(workflow_path: Path) -> list[str]:
    """Return one human-readable finding per `paths`/`paths-ignore`
    filter in the workflow's `on:` block.

    Returns:
        Empty list when the workflow carries no filter anywhere in its
        `on:` block; otherwise strings naming the event, the filter key
        and the filter contents. Exits 3 on unparseable YAML.
    """
    try:
        parsed = yaml.safe_load(workflow_path.read_text(encoding="utf-8"))
    except yaml.YAMLError as exc:
        sys.stderr.write(f"::error::YAML parse error in {workflow_path}: {exc}\n")
        sys.exit(3)
    if not isinstance(parsed, dict):
        return []

    # PyYAML quirk: YAML 1.1 treats `on` as a boolean alias, so the key
    # sometimes surfaces as the bool `True` instead of "on". Tolerate both.
    on_block = parsed.get("on") or parsed.get(True)
    if on_block is None:
        return []
    # String (`on: pull_request`) and list (`on: [push]`) shorthands
    # cannot carry filters; any other non-mapping shape has nowhere for
    # a filter to live either.
    if not isinstance(on_block, dict):
        return []

    findings: list[str] = []

    # Malformed-but-present workflow-level filter (`on: {paths: [...]}`):
    # invalid syntax, but Gitea may silently ignore the unknown key and
    # leave a false sense of "filter exists" — surface it anyway.
    for key in _PATHS_KEYS:
        if key in on_block:
            v = on_block[key]
            findings.append(
                f"top-level `on.{key}` filter (malformed but present): {v!r}"
            )

    for event, event_body in on_block.items():
        if event in _PATHS_KEYS:
            continue  # already reported above
        if not isinstance(event_body, dict):
            # `pull_request: null` / `pull_request: [opened]` shapes —
            # no place for a paths filter to live; skip.
            continue
        for key in _PATHS_KEYS:
            if key in event_body:
                v = event_body[key]
                findings.append(f"`on.{event}.{key}` filter present: {v!r}")
    return findings
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Driver
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def run() -> int:
    """Main lint entrypoint. Returns the process exit code.

    0 — clean (no offending paths-filter on any required workflow), OR
        protection unreadable (403/404) — surfaced as ::error:: but
        treated as non-fatal so token-scope issues don't red-X every PR.
    1 — at least one required workflow carries a paths/paths-ignore
        filter — the regression class this lint exists to prevent.
    4 — protection response shape unexpected.
    Other ApiError statuses propagate: a transient Gitea outage should
    fail loudly.
    """
    _require_runtime_env()

    protection_path = f"/repos/{OWNER}/{NAME}/branch_protections/{BRANCH}"
    try:
        _, protection = api("GET", protection_path)
    except ApiError as exc:
        status_match = re.search(r"HTTP (\d{3})", str(exc))
        http_status = int(status_match.group(1)) if status_match else None
        if http_status not in (403, 404):
            raise
        sys.stderr.write(
            f"::error::GET {protection_path} returned HTTP {http_status} — "
            f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 "
            f"requires it for this endpoint) OR branch '{BRANCH}' has "
            f"no protection configured. Cannot enumerate required "
            f"checks; skipping lint with exit 0 to avoid red-X on "
            f"every PR. Fix: grant repo-admin to mc-drift-bot.\n"
        )
        return 0

    if not isinstance(protection, dict):
        sys.stderr.write(
            f"::error::protection response for {BRANCH} not a JSON object\n"
        )
        return 4

    contexts: list[str] = list(protection.get("status_check_contexts") or [])
    if not contexts:
        print(
            f"::notice::branch_protections/{BRANCH} has 0 required "
            f"status_check_contexts; nothing to lint. (no required contexts)"
        )
        return 0

    print(f"::notice::Linting {len(contexts)} required context(s) for paths-filter regressions:")
    for ctx in contexts:
        print(f"  - {ctx}")

    offenders: list[tuple[str, Path, list[str]]] = []
    unresolved: list[str] = []

    for ctx in contexts:
        parsed = parse_context(ctx)
        if parsed is None:
            print(
                f"::warning::could not parse context '{ctx}' "
                f"(expected `<workflow> / <job> (<event>)`); skipping"
            )
            unresolved.append(ctx)
            continue
        workflow_name, _job, _event = parsed
        wf_path = resolve_workflow_file(workflow_name)
        if wf_path is None:
            print(
                f"::warning::no workflow file in {WORKFLOWS_DIR} has "
                f"`name: {workflow_name}` (required context '{ctx}'); "
                f"skipping paths-filter check. "
                f"(orphaned-context detection is ci-required-drift's job.)"
            )
            unresolved.append(ctx)
            continue
        findings = detect_paths_filters(wf_path)
        if not findings:
            print(f"::notice::OK {wf_path.name} ({workflow_name}) — no paths filter")
        else:
            offenders.append((workflow_name, wf_path, findings))

    if offenders:
        print("")
        print(f"::error::Found {len(offenders)} required workflow(s) with paths/paths-ignore filters:")
        for workflow_name, wf_path, findings in offenders:
            for finding in findings:
                # ::error file=... lets Gitea Actions surface a per-file
                # annotation in the PR UI (when annotations are wired).
                print(
                    f"::error file={wf_path}::Required workflow "
                    f"'{workflow_name}' ({wf_path.name}) has a paths "
                    f"filter that would degrade the merge gate to a "
                    f"silent indefinite pending: {finding}. "
                    f"See feedback_path_filtered_workflow_cant_be_required. "
                    f"Fix: remove the filter and instead gate per-step "
                    f"inside the job with `if: contains(steps.changed.outputs.files, ...)` "
                    f"or refactor to a single-job-with-per-step-if shape."
                )
        return 1

    print("")
    print(
        f"::notice::OK — all {len(contexts) - len(unresolved)} resolvable "
        f"required workflow(s) clean (no paths/paths-ignore filters)."
    )
    if unresolved:
        print(
            f"::notice::{len(unresolved)} required context(s) were not "
            f"resolved to a workflow file (warn-not-fail); see warnings above."
        )
    return 0


if __name__ == "__main__":
    sys.exit(run())
|
|
||||||
@ -1,369 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""lint-workflow-yaml — catch Gitea-1.22.6-hostile workflow YAML shapes.
|
|
||||||
|
|
||||||
This script enforces six structural rules that have historically caused
|
|
||||||
silent CI failures on Gitea Actions (1.22.6) — workflows that the server's
|
|
||||||
YAML parser rejects with `[W] ignore invalid workflow ...` and registers
|
|
||||||
for zero events, or shape conventions that produce ambiguous status
|
|
||||||
contexts. Each rule maps to a documented incident in saved memory.
|
|
||||||
|
|
||||||
Rules (4 fatal + 1 fatal cross-file + 1 heuristic-warn):
|
|
||||||
1. `workflow_dispatch.inputs:` block — Gitea 1.22.6 mis-parses the
|
|
||||||
`inputs` keys as sibling event types and rejects the whole file.
|
|
||||||
Memory: feedback_gitea_workflow_dispatch_inputs_unsupported.
|
|
||||||
Origin: 2026-05-11 PyPI freeze (publish-runtime).
|
|
||||||
2. `on: workflow_run:` event — not enumerated in Gitea 1.22.6's
|
|
||||||
supported event list (verified via modules/actions/workflows.go
|
|
||||||
enumeration; task #81). Workflow registers, fires for 0 events.
|
|
||||||
3. `name:` containing `/` — breaks the
|
|
||||||
`<workflow> / <job> (<event>)` commit-status context convention;
|
|
||||||
downstream parsers (sop-tier-check, status-reaper) tokenize on `/`.
|
|
||||||
4. `name:` collision across files — Gitea routes commit-status updates
|
|
||||||
by `name` and behavior on collision is undefined (status-reaper
|
|
||||||
rev1 fail-loud).
|
|
||||||
5. Cross-repo `uses: org/repo/path@ref` — blocked while
|
|
||||||
`[actions].DEFAULT_ACTIONS_URL=github` is the server default;
|
|
||||||
resolves to github.com/<org-suspended>/... and 404s.
|
|
||||||
Memory: feedback_gitea_cross_repo_uses_blocked. Cross-link: task #109.
|
|
||||||
6. (HEURISTIC, warn-not-fail) Steps reference `https://api.github.com`
|
|
||||||
or `https://github.com/.../releases/download` without a
|
|
||||||
workflow-level `env.GITHUB_SERVER_URL` set to the Gitea instance.
|
|
||||||
Memory: feedback_act_runner_github_server_url.
|
|
||||||
|
|
||||||
Per `feedback_smoke_test_vendor_truth_not_shape_match`: fixtures used to
|
|
||||||
validate this lint must mirror real Gitea 1.22.6 YAML semantics, not
|
|
||||||
Python yaml-parser quirks. The test suite at tests/test_lint_workflow_yaml.py
|
|
||||||
includes a vendor-truth fixture (the exact publish-runtime regression).
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
python3 .gitea/scripts/lint-workflow-yaml.py
|
|
||||||
Lint every `*.yml` in `.gitea/workflows/`.
|
|
||||||
|
|
||||||
python3 .gitea/scripts/lint-workflow-yaml.py --workflow-dir <path>
|
|
||||||
Lint a custom directory (used by tests/test_lint_workflow_yaml.py).
|
|
||||||
|
|
||||||
Exit codes:
|
|
||||||
0 — clean OR only heuristic-warnings emitted.
|
|
||||||
1 — at least one fatal rule (1-5) violated.
|
|
||||||
2 — YAML parse error or argv usage error.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import collections
|
|
||||||
import glob
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, Iterable
|
|
||||||
|
|
||||||
try:
|
|
||||||
import yaml
|
|
||||||
except ImportError:
|
|
||||||
print("::error::PyYAML is required. Install with: pip install PyYAML", file=sys.stderr)
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
|
|
||||||
# YAML quirk: bare `on:` at the top level parses to the Python `True`
|
|
||||||
# (because `on` is a YAML 1.1 boolean alias). Handle both keys.
|
|
||||||
def _get_on(d: dict) -> Any:
|
|
||||||
if not isinstance(d, dict):
|
|
||||||
return None
|
|
||||||
if "on" in d:
|
|
||||||
return d["on"]
|
|
||||||
if True in d:
|
|
||||||
return d[True]
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Rule 1 — workflow_dispatch.inputs block (Gitea 1.22.6 parser rejects)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def check_workflow_dispatch_inputs(filename: str, doc: Any) -> list[str]:
    """Rule 1 (FATAL): flag an `on.workflow_dispatch.inputs:` block.

    Gitea 1.22.6 mis-parses the `inputs` keys as sibling event types and
    silently rejects the entire workflow file.

    Args:
        filename: workflow file path, reported in the annotation.
        doc: parsed workflow YAML document (any shape tolerated).
    Returns:
        One error line per violation; empty list when clean.
    """
    errors: list[str] = []
    on = _get_on(doc)
    if not isinstance(on, dict):
        return errors
    wd = on.get("workflow_dispatch")
    if isinstance(wd, dict) and wd.get("inputs"):
        # Bug fix: the annotation previously hardcoded `file=(unknown)`,
        # leaving the `filename` parameter unused and the PR-UI
        # annotation pointing at no file. Point it at the real file.
        errors.append(
            f"::error file={filename}::Rule 1 (FATAL): "
            f"`on.workflow_dispatch.inputs:` block detected. Gitea 1.22.6 "
            f"silently rejects the entire workflow with `[W] ignore invalid "
            f"workflow: unknown on type: map[...]`. Drop the `inputs:` block "
            f"and derive parameters from tag name / env / external query. "
            f"Memory: feedback_gitea_workflow_dispatch_inputs_unsupported."
        )
    return errors
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Rule 2 — on: workflow_run (not supported on Gitea 1.22.6)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def check_workflow_run_event(filename: str, doc: Any) -> list[str]:
    """Rule 2 (FATAL): flag any use of the `workflow_run` trigger.

    Gitea 1.22.6 does not support `workflow_run` (verified via
    modules/actions/workflows.go enumeration; task #81): the workflow
    registers but fires for zero events. Handles both the mapping shape
    (`on: {workflow_run: ...}`) and the list shape (`on: [workflow_run]`).

    Bug fix: annotations previously hardcoded `file=(unknown)` and never
    used the `filename` parameter; they now point at the real file.
    """
    errors: list[str] = []
    on = _get_on(doc)
    if isinstance(on, dict) and "workflow_run" in on:
        errors.append(
            f"::error file={filename}::Rule 2 (FATAL): `on: workflow_run:` "
            f"event used. Gitea 1.22.6 does NOT support `workflow_run` "
            f"(verified via modules/actions/workflows.go enumeration; "
            f"task #81). Workflow will fire for zero events. Use a "
            f"`schedule:` cron OR a `push:` trigger with `paths:` filter "
            f"on the upstream workflow file as the cross-workflow gate."
        )
    elif isinstance(on, list) and "workflow_run" in on:
        errors.append(
            f"::error file={filename}::Rule 2 (FATAL): `on: workflow_run` "
            f"in event list. Not supported on Gitea 1.22.6 — task #81."
        )
    return errors
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Rule 3 — name: contains "/" (breaks status-context tokenization)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def check_name_with_slash(filename: str, doc: Any) -> list[str]:
    """Rule 3 (FATAL): flag a workflow `name:` containing `/`.

    The commit-status context convention is
    `<workflow> / <job> (<event>)`; a `/` inside the workflow name makes
    downstream parsers (sop-tier-check, status-reaper) tokenize
    ambiguously.

    Bug fix: the annotation previously hardcoded `file=(unknown)` and
    never used the `filename` parameter; it now points at the real file.
    """
    errors: list[str] = []
    if not isinstance(doc, dict):
        return errors
    name = doc.get("name")
    if isinstance(name, str) and "/" in name:
        errors.append(
            f"::error file={filename}::Rule 3 (FATAL): workflow `name: "
            f"{name!r}` contains `/`. The commit-status context convention "
            f"is `<workflow> / <job> (<event>)`; embedding `/` in the "
            f"workflow name makes downstream parsers (sop-tier-check, "
            f"status-reaper) tokenize ambiguously. Rename to use `-` or "
            f"` ` instead."
        )
    return errors
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Rule 4 — cross-file name collision
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def check_name_collision_across_files(
    docs_by_file: dict[str, Any],
) -> list[str]:
    """Rule 4 (FATAL): one error line per workflow `name:` shared by
    two or more files.

    Gitea routes commit-status updates by `name`, so a collision yields
    undefined behavior. Non-dict documents and missing/empty names are
    ignored; collisions are reported in sorted name order.
    """
    errors: list[str] = []
    by_name: dict[str, list[str]] = {}
    for filename, doc in docs_by_file.items():
        if not isinstance(doc, dict):
            continue
        name = doc.get("name")
        if isinstance(name, str) and name:
            by_name.setdefault(name, []).append(filename)
    for name, files in sorted(by_name.items()):
        if len(files) < 2:
            continue
        errors.append(
            f"::error::Rule 4 (FATAL): workflow `name: {name!r}` collision "
            f"across {len(files)} files: {files}. Gitea routes "
            f"commit-status updates by `name`; collision yields "
            f"undefined behavior. Give each workflow a unique `name:`."
        )
    return errors
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Rule 5 — cross-repo `uses: org/repo/path@ref`
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# `uses: <foo>@<ref>` — match the value form Gitea/act actually parse.
|
|
||||||
# We need to distinguish:
|
|
||||||
# - `actions/checkout@<sha>` OK (bare org/repo@ref, no subpath)
|
|
||||||
# - `./.gitea/actions/foo` OK (local path)
|
|
||||||
# - `docker://image:tag` OK (docker-image form)
|
|
||||||
# - `molecule-ai/molecule-ci/.gitea/actions/audit-force-merge@main` BAD
|
|
||||||
USES_CROSS_REPO_RE = re.compile(
|
|
||||||
r"""^
|
|
||||||
(?P<owner>[A-Za-z0-9_.\-]+)
|
|
||||||
/
|
|
||||||
(?P<repo>[A-Za-z0-9_.\-]+)
|
|
||||||
/ # mandatory subpath separator => cross-repo composite/reusable
|
|
||||||
(?P<path>[^@\s]+)
|
|
||||||
@
|
|
||||||
(?P<ref>\S+)
|
|
||||||
$""",
|
|
||||||
re.VERBOSE,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _iter_uses(doc: Any) -> Iterable[str]:
|
|
||||||
"""Yield every `uses:` string from job steps in a workflow document."""
|
|
||||||
if not isinstance(doc, dict):
|
|
||||||
return
|
|
||||||
jobs = doc.get("jobs")
|
|
||||||
if not isinstance(jobs, dict):
|
|
||||||
return
|
|
||||||
for job in jobs.values():
|
|
||||||
if not isinstance(job, dict):
|
|
||||||
continue
|
|
||||||
# reusable workflow: `uses:` at the job level
|
|
||||||
if isinstance(job.get("uses"), str):
|
|
||||||
yield job["uses"]
|
|
||||||
steps = job.get("steps")
|
|
||||||
if not isinstance(steps, list):
|
|
||||||
continue
|
|
||||||
for step in steps:
|
|
||||||
if isinstance(step, dict) and isinstance(step.get("uses"), str):
|
|
||||||
yield step["uses"]
|
|
||||||
|
|
||||||
|
|
||||||
def check_cross_repo_uses(filename: str, doc: Any) -> list[str]:
    """Return per-violation error lines for cross-repo `uses:` references.

    Args:
        filename: workflow path, embedded in the `::error file=...` annotation.
        doc: parsed workflow YAML document (any type; non-dicts yield no uses).

    Returns:
        One formatted `::error` line per violating `uses:` value.
    """
    errors: list[str] = []
    for uses in _iter_uses(doc):
        # Skip docker:// and local ./ (and ../) forms — those never resolve
        # through DEFAULT_ACTIONS_URL, so Rule 5 does not apply.
        if uses.startswith(("docker://", "./", "../")):
            continue
        m = USES_CROSS_REPO_RE.match(uses.strip())
        if m:
            errors.append(
                # BUG FIX: was a literal `file=(unknown)`, leaving the
                # `filename` parameter unused — the annotation could not be
                # attached to the offending file. Use file={filename}, matching
                # the `file={rel}` convention of the other checks.
                f"::error file={filename}::Rule 5 (FATAL): cross-repo "
                f"`uses: {uses}` detected. Gitea 1.22.6 with "
                f"`[actions].DEFAULT_ACTIONS_URL=github` resolves this to "
                f"github.com/{m.group('owner')}/{m.group('repo')} which "
                f"404s (org suspended 2026-05-06). Inline the shared bash "
                f"into `.gitea/scripts/` until task #109 (actions mirror) "
                f"ships. Memory: feedback_gitea_cross_repo_uses_blocked."
            )
    return errors
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Rule 6 — heuristic: github.com/api refs without workflow-level
|
|
||||||
# GITHUB_SERVER_URL (WARN-not-FAIL per halt-condition 3)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# Match `https://api.github.com/...` (API call) — that's the actionable
|
|
||||||
# pattern. We intentionally do NOT match `https://github.com/.../releases/
|
|
||||||
# download/...` (jq-release pin) nor `https://github.com/${{ github.repository
|
|
||||||
# }}` (OCI label) because those are documented benign references on current
|
|
||||||
# main and would 100% false-positive (3 hits, per Phase 1 audit).
|
|
||||||
# Rule 6 trigger: a literal GitHub *API* reference (either hostname form).
# Plain github.com URLs (release downloads, OCI labels) are deliberately
# not matched — see the false-positive rationale in the comment above.
GITHUB_API_REF_RE = re.compile(
    r"https://api\.github\.com\b|https://github\.com/api/",
    re.IGNORECASE,
)
|
|
||||||
|
|
||||||
|
|
||||||
def _has_workflow_level_server_url(doc: Any) -> bool:
|
|
||||||
if not isinstance(doc, dict):
|
|
||||||
return False
|
|
||||||
env = doc.get("env")
|
|
||||||
if isinstance(env, dict) and "GITHUB_SERVER_URL" in env:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def check_github_server_url_missing(filename: str, doc: Any, raw: str) -> list[str]:
    """Return warn-lines (NOT errors) if api.github.com is referenced without
    workflow-level GITHUB_SERVER_URL. Heuristic — false-positives possible.

    Args:
        filename: workflow path, embedded in the `::warning file=...` line.
        doc: parsed workflow YAML (checked for a workflow-level env override).
        raw: raw file text (the regex scan runs on text, not the AST, so
            references inside `run:` scripts are caught too).
    """
    warns: list[str] = []
    if not GITHUB_API_REF_RE.search(raw):
        return warns
    if _has_workflow_level_server_url(doc):
        return warns
    warns.append(
        # BUG FIX: was a literal `file=(unknown)`, leaving the `filename`
        # parameter unused — the warning could not be attached to the file.
        f"::warning file={filename}::Rule 6 (WARN, heuristic): file "
        f"references `https://api.github.com` without a workflow-level "
        f"`env.GITHUB_SERVER_URL: https://git.moleculesai.app`. The "
        f"act_runner default for `${{{{ github.server_url }}}}` is "
        f"github.com, which can break actions that auth-condition on "
        f"server_url (e.g. actions/setup-go). If this curl is "
        f"intentionally hitting GitHub (e.g. public release pin), ignore. "
        f"Memory: feedback_act_runner_github_server_url."
    )
    return warns
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Driver
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def main(argv: list[str] | None = None) -> int:
    """Lint every workflow file under --workflow-dir; return an exit code.

    Returns 0 when the directory is missing/empty or no fatal rule fired
    (Rule-6 warnings alone never fail the run); 1 when at least one
    fatal violation was printed.
    """
    p = argparse.ArgumentParser(
        description="Lint Gitea Actions workflow YAML for 1.22.6-hostile shapes."
    )
    p.add_argument(
        "--workflow-dir",
        default=".gitea/workflows",
        help="Directory of workflow *.yml files (default: .gitea/workflows).",
    )
    args = p.parse_args(argv)

    wf_dir = Path(args.workflow_dir)
    if not wf_dir.exists():
        # Empty / missing dir = nothing to lint, not a failure.
        print(f"::notice::No workflow directory at {wf_dir}; skipping.")
        return 0

    # Both extensions are valid; sort for deterministic output ordering.
    yml_paths = sorted(
        glob.glob(str(wf_dir / "*.yml")) + glob.glob(str(wf_dir / "*.yaml"))
    )
    if not yml_paths:
        print(f"::notice::No workflow files in {wf_dir}; nothing to lint.")
        return 0

    fatal_errors: list[str] = []
    warnings: list[str] = []
    # rel-path -> parsed doc; fed to the cross-file collision check below.
    docs_by_file: dict[str, Any] = {}

    for path in yml_paths:
        rel = os.path.relpath(path)
        try:
            raw = Path(path).read_text()
            doc = yaml.safe_load(raw)
        except yaml.YAMLError as e:
            # Unparseable YAML is itself fatal — every AST rule needs the doc.
            fatal_errors.append(
                f"::error file={rel}::YAML parse error: {e}. Cannot lint "
                f"a file the parser rejects."
            )
            continue
        docs_by_file[rel] = doc

        # Per-file checks
        fatal_errors.extend(check_workflow_dispatch_inputs(rel, doc))
        fatal_errors.extend(check_workflow_run_event(rel, doc))
        fatal_errors.extend(check_name_with_slash(rel, doc))
        fatal_errors.extend(check_cross_repo_uses(rel, doc))
        warnings.extend(check_github_server_url_missing(rel, doc, raw))

    # Cross-file checks
    fatal_errors.extend(check_name_collision_across_files(docs_by_file))

    # Emit warnings first (non-blocking)
    for w in warnings:
        print(w)

    if not fatal_errors:
        n = len(yml_paths)
        print(
            f"::notice::lint-workflow-yaml: {n} workflow file(s) checked, "
            f"no fatal Gitea-1.22.6-hostile shapes. "
            f"({len(warnings)} heuristic warning(s) emitted.)"
        )
        return 0

    # Emit fatal errors
    print(
        f"::error::lint-workflow-yaml: {len(fatal_errors)} fatal violation(s) "
        f"across {len(yml_paths)} workflow file(s). See rule documentation "
        f"in .gitea/scripts/lint-workflow-yaml.py docstring."
    )
    for e in fatal_errors:
        print(e)
    return 1
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Propagate the lint's exit status (0 clean / 1 violations) to the runner.
    sys.exit(main())
|
|
||||||
@ -1,436 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""lint_continue_on_error_tracking — Tier 2e per internal#350.
|
|
||||||
|
|
||||||
Rule
|
|
||||||
----
|
|
||||||
Every `continue-on-error: true` directive in `.gitea/workflows/*.yml`
|
|
||||||
must be accompanied by a tracker reference comment within 2 lines
|
|
||||||
(above OR below the directive's line). The reference is one of:
|
|
||||||
|
|
||||||
* `# mc#NNNN` — molecule-core issue
|
|
||||||
* `# internal#NNNN` — molecule-ai/internal issue
|
|
||||||
|
|
||||||
The referenced issue must satisfy ALL of:
|
|
||||||
|
|
||||||
1. Exists (HTTP 200 on `/repos/{owner}/{name}/issues/{num}`)
|
|
||||||
2. `state == "open"`
|
|
||||||
3. `created_at` is ≤ MAX_AGE_DAYS days ago (default 14)
|
|
||||||
|
|
||||||
A passing reference establishes an audit trail and a forced renewal
|
|
||||||
cadence — after 14 days the issue must either be CLOSED (the masked
|
|
||||||
defect was fixed) or the comment must point at a NEW tracker
|
|
||||||
(deliberate decision to keep masking, requires a paper-trail).
|
|
||||||
|
|
||||||
The class this prevents
|
|
||||||
-----------------------
|
|
||||||
Phase-3-masked failures. `continue-on-error: true` on `platform-build`
|
|
||||||
had been hiding mc#664-class regressions for ~3 weeks before #656
|
|
||||||
surfaced them on 2026-05-12. A 14-day cap forces a tracker review
|
|
||||||
cycle and surfaces mask-drift within at most 14 days of the original
|
|
||||||
defect.
|
|
||||||
|
|
||||||
Behaviour-based gate
|
|
||||||
--------------------
|
|
||||||
We parse via PyYAML AST (per `feedback_behavior_based_ast_gates`) to
|
|
||||||
detect `continue-on-error: <truthy>` at job-key level, then map each
|
|
||||||
location back to its source line via PyYAML's line-tracking loader.
|
|
||||||
Comments are scanned from the raw text within a 2-line window of
|
|
||||||
that source line. Reformatting (block-scalar vs flow-style) does not
|
|
||||||
break the rule because the source-line anchor is the directive's
|
|
||||||
own line.
|
|
||||||
|
|
||||||
Exit codes
|
|
||||||
----------
|
|
||||||
0 — every `continue-on-error: true` has a passing tracker, OR
|
|
||||||
the issue-API endpoint returned 403/404 (token-scope; graceful
|
|
||||||
degrade per Tier 2a contract — surface via ::error:: on stderr
|
|
||||||
but don't red-X every PR over auth).
|
|
||||||
1 — at least one violation (missing/closed/too-old/non-existent
|
|
||||||
tracker).
|
|
||||||
2 — env contract violation, YAML parse error, or workflows-dir
|
|
||||||
missing.
|
|
||||||
|
|
||||||
Env
|
|
||||||
---
|
|
||||||
GITEA_TOKEN — read scope on the configured repos.
|
|
||||||
Auto-injected `GITHUB_TOKEN` works for same-repo
|
|
||||||
issue reads; for `internal#NNN` we need a token
|
|
||||||
with `molecule-ai/internal` read scope. Use
|
|
||||||
DRIFT_BOT_TOKEN (same persona as other Tier 2
|
|
||||||
lints).
|
|
||||||
GITEA_HOST — e.g. git.moleculesai.app
|
|
||||||
REPO — `owner/name` for `mc#NNNN` lookups
|
|
||||||
INTERNAL_REPO — `owner/name` for `internal#NNNN` lookups
|
|
||||||
(defaults to derived `molecule-ai/internal`)
|
|
||||||
WORKFLOWS_DIR — defaults to `.gitea/workflows`
|
|
||||||
MAX_AGE_DAYS — defaults to 14
|
|
||||||
|
|
||||||
Memory cross-links
|
|
||||||
------------------
|
|
||||||
- internal#350 (the RFC that specs this lint)
|
|
||||||
- mc#664 (the masked-3-weeks empirical case)
|
|
||||||
- feedback_chained_defects_in_never_tested_workflows
|
|
||||||
- feedback_behavior_based_ast_gates
|
|
||||||
- feedback_strict_root_only_after_class_a
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import urllib.error
|
|
||||||
import urllib.parse
|
|
||||||
import urllib.request
|
|
||||||
from datetime import datetime, timedelta, timezone
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
# Fail fast with exit code 2 (the module's env/contract-violation code)
# when PyYAML is absent; the ::error:: line surfaces in the Actions log.
try:
    import yaml
except ImportError:
    sys.stderr.write(
        "::error::PyYAML is required. Install with: pip install PyYAML\n"
    )
    sys.exit(2)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Tracker comment regex.
|
|
||||||
# Matches: `# mc#1234`, `# internal#42`, `# mc#1234 - description`
|
|
||||||
# Does NOT match: `# mc1234` (missing inner #), `mc#1234` (no leading
|
|
||||||
# `#` comment marker), `# MC#1234` (case-sensitive — `mc` and `internal`
|
|
||||||
# are conventional lower-case repo slugs).
|
|
||||||
# Compiled tracker matcher (see accepted/rejected examples above). The
# trailing \b keeps the digit run from matching when immediately followed
# by a word character (e.g. `# mc#12ab` does not match).
TRACKER_RE = re.compile(
    r"#\s*(?P<slug>mc|internal)#(?P<num>\d+)\b"
)
|
|
||||||
|
|
||||||
# Truthy continue-on-error values we treat as "true". PyYAML decodes
|
|
||||||
# `continue-on-error: true` to Python `True`. `continue-on-error: "true"`
|
|
||||||
# decodes to the string "true" — Gitea's evaluator coerces strings,
|
|
||||||
# so we treat string-`"true"` (case-insensitive) as truthy too.
|
|
||||||
def _is_truthy_coe(v: Any) -> bool:
|
|
||||||
if v is True:
|
|
||||||
return True
|
|
||||||
if isinstance(v, str) and v.strip().lower() == "true":
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Env contract
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
def _env(key: str, default: str | None = None) -> str:
|
|
||||||
v = os.environ.get(key, default)
|
|
||||||
return v if v is not None else ""
|
|
||||||
|
|
||||||
|
|
||||||
def _require_env(key: str) -> str:
|
|
||||||
v = os.environ.get(key)
|
|
||||||
if not v:
|
|
||||||
sys.stderr.write(f"::error::missing required env var: {key}\n")
|
|
||||||
sys.exit(2)
|
|
||||||
return v
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# PyYAML line-tracking loader. yaml.SafeLoader nodes carry
|
|
||||||
# `start_mark.line` (0-based); using construct_mapping with `deep=True`
|
|
||||||
# preserves that on every node. We need the line of each
|
|
||||||
# `continue-on-error` key so we can scan the source for comments
|
|
||||||
# near it.
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
class _LineLoader(yaml.SafeLoader):
    """SafeLoader whose mappings carry a `__lines__: {key: 1-based line}`
    annotation (written by `_construct_mapping`, registered on this class).

    NOTE(review): the original docstring said `__line__`; the constructor
    actually writes `__lines__`.
    """
|
|
||||||
|
|
||||||
|
|
||||||
def _construct_mapping(loader: yaml.SafeLoader, node: yaml.MappingNode) -> dict:
    """Construct a mapping and record each key's 1-based source line under
    a synthetic `__lines__` entry.

    Consumers (`find_coe_truthies`) read `__lines__` to anchor the raw-text
    comment scan to the `continue-on-error` key's own line.
    """
    mapping = loader.construct_mapping(node, deep=True)
    # Annotate per-key source lines so we can locate `continue-on-error`.
    lines: dict[str, int] = {}
    for k_node, _v_node in node.value:
        try:
            key = loader.construct_object(k_node, deep=True)
        except Exception:
            # Unconstructable key — skip its line annotation rather than fail.
            continue
        if isinstance(key, (str, int, bool)):
            lines[str(key)] = k_node.start_mark.line + 1  # 1-based
    if isinstance(mapping, dict):
        mapping["__lines__"] = lines
    return mapping
|
|
||||||
|
|
||||||
|
|
||||||
# Register the line-annotating constructor for the default (plain dict)
# mapping tag on _LineLoader.
_LineLoader.add_constructor(
    yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, _construct_mapping
)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Issue lookup
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
def fetch_issue(slug_kind: str, num: int) -> tuple[str, dict | None]:
    """Return `(status, payload_or_none)`.

    status ∈ {"ok", "not_found", "forbidden", "error"}.

    Args:
        slug_kind: "mc" (lookup against REPO) or anything else
            (lookup against INTERNAL_REPO).
        num: issue number.

    An unset repo env var is reported as "forbidden" so the caller's
    token-scope graceful-degrade path handles it.
    """
    repo = (
        _env("REPO") if slug_kind == "mc" else _env("INTERNAL_REPO")
    )
    if not repo:
        # Fall through gracefully — caller treats as 403 (token-scope).
        return ("forbidden", None)
    host = _env("GITEA_HOST")
    token = _env("GITEA_TOKEN")
    url = f"https://{host}/api/v1/repos/{repo}/issues/{num}"
    req = urllib.request.Request(
        url,
        headers={
            "Authorization": f"token {token}",
            "Accept": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=20) as resp:
            return ("ok", json.loads(resp.read()))
    except urllib.error.HTTPError as e:
        # 404 → issue absent; 401/403 → token scope; anything else → error.
        if e.code == 404:
            return ("not_found", None)
        if e.code in (401, 403):
            return ("forbidden", None)
        return ("error", None)
    except (urllib.error.URLError, TimeoutError, json.JSONDecodeError):
        # Network failure, timeout, or a non-JSON body all degrade to "error".
        return ("error", None)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Locate every continue-on-error: <truthy> in a workflow doc, with line.
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
def find_coe_truthies(
    doc: Any, raw_lines: list[str]
) -> list[tuple[str, int]]:
    """Return (job_key, source_line_1based) for each job-level
    `continue-on-error` whose value is truthy per `_is_truthy_coe`.

    `doc` is the _LineLoader-parsed mapping; line numbers come from its
    `__lines__` annotation, with a raw-text grep as a guard-rail fallback.
    Step-level continue-on-error is deliberately ignored: it only
    suppresses step-level failure rollup, while the masking class this
    lint targets is the job-level rollup.
    """
    found: list[tuple[str, int]] = []
    jobs = doc.get("jobs") if isinstance(doc, dict) else None
    if not isinstance(jobs, dict):
        return found
    for job_key, body in jobs.items():
        # `__lines__` is our own annotation, not a real job.
        if job_key == "__lines__" or not isinstance(body, dict):
            continue
        if "continue-on-error" not in body:
            continue
        if not _is_truthy_coe(body["continue-on-error"]):
            continue
        line_no = body.get("__lines__", {}).get("continue-on-error")
        if not line_no:
            # Line-tracking shouldn't miss, but fall back to grepping raw text.
            line_no = _grep_first_coe_line(raw_lines, job_key) or 1
        found.append((str(job_key), int(line_no)))
    return found
|
|
||||||
|
|
||||||
|
|
||||||
def _grep_first_coe_line(raw_lines: list[str], jkey: str) -> int | None:
|
|
||||||
"""Fallback: find the first `continue-on-error:` line after a `jkey:` line."""
|
|
||||||
saw_job = False
|
|
||||||
for i, line in enumerate(raw_lines, start=1):
|
|
||||||
if re.match(rf"^\s*{re.escape(jkey)}\s*:", line):
|
|
||||||
saw_job = True
|
|
||||||
continue
|
|
||||||
if saw_job and "continue-on-error" in line:
|
|
||||||
return i
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Scan window for tracker comment
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
WINDOW = 2  # lines above OR below the directive's line (inclusive)


def find_tracker_in_window(
    raw_lines: list[str], line_1based: int
) -> tuple[str, int] | None:
    """Return (slug, num) for the first `# mc#NNN` / `# internal#NNN`
    reference within ±WINDOW lines of `line_1based`, else None.

    The directive's own line is scanned too, so an inline trailing
    comment (`continue-on-error: true  # mc#3`) counts.
    """
    first = max(1, line_1based - WINDOW)
    last = min(len(raw_lines), line_1based + WINDOW)
    for idx in range(first - 1, last):
        hit = TRACKER_RE.search(raw_lines[idx])
        if hit:
            return (hit.group("slug"), int(hit.group("num")))
    return None
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Tracker validation
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
def validate_tracker(
    slug: str, num: int, max_age_days: int
) -> tuple[bool, str]:
    """Return (ok?, reason). On 403, ok=True is returned with reason
    explaining graceful-degrade — caller treats 403 as a non-fatal
    skip (same as Tier 2a contract).

    Args:
        slug: "mc" or "internal" — routes the repo lookup in fetch_issue.
        num: issue number.
        max_age_days: inclusive age cap in whole days.

    Failure reasons cover: 404, non-open state, unparseable created_at,
    and age over the cap. Fetch "error" also degrades gracefully (ok=True).
    """
    status, payload = fetch_issue(slug, num)
    if status == "forbidden":
        sys.stderr.write(
            f"::error::issue {slug}#{num} unreadable (HTTP 403 — token "
            f"scope). Cannot validate; skipping this check to avoid "
            f"red-X on every PR. Fix the token, not the lint.\n"
        )
        return (True, "forbidden — skipped")
    if status == "not_found":
        return (False, f"{slug}#{num} does not exist (404)")
    if status == "error":
        # Transient fetch failure — surface loudly but don't block the PR.
        sys.stderr.write(
            f"::error::issue {slug}#{num} fetch errored — treating as "
            f"unverified, skipping this check.\n"
        )
        return (True, "fetch-error — skipped")

    assert payload is not None  # status == "ok" guarantees a payload
    state = payload.get("state", "")
    if state != "open":
        return (False, f"{slug}#{num} state={state!r} (must be open)")

    created = payload.get("created_at", "")
    try:
        # Gitea returns ISO-8601 with timezone; Python 3.11+
        # fromisoformat handles `Z` suffix natively from 3.11. Older
        # runtimes need explicit replace.
        created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
    except ValueError:
        return (False, f"{slug}#{num} created_at unparseable: {created!r}")

    age = datetime.now(timezone.utc) - created_dt
    # Inclusive boundary at MAX_AGE_DAYS: `age.days` truncates to a
    # whole-day floor, so an issue created 14d 0h 5m ago has
    # `age.days == 14` and passes; one created 15d 0h 0m ago has
    # `age.days == 15` and fails. This is the convention specified
    # in internal#350 ("≤14 days old").
    if age.days > max_age_days:
        return (
            False,
            f"{slug}#{num} is {age.days} days old (>{max_age_days}d cap). "
            f"Close-or-renew the tracker.",
        )
    return (True, f"{slug}#{num} open, {age.days}d old, ≤{max_age_days}d")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Driver
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
def _iter_workflow_files(wf_dir: Path) -> list[Path]:
|
|
||||||
return sorted(list(wf_dir.glob("*.yml")) + list(wf_dir.glob("*.yaml")))
|
|
||||||
|
|
||||||
|
|
||||||
def run() -> int:
    """Drive the continue-on-error tracker lint over every workflow file.

    Returns 0 (all trackers valid or gracefully skipped), 1 (at least one
    violation), or 2 (workflows dir missing) — per the module docstring.
    """
    wf_dir = Path(_env("WORKFLOWS_DIR", ".gitea/workflows"))
    max_age = int(_env("MAX_AGE_DAYS", "14"))
    # Defaults for INTERNAL_REPO when unset (best-effort guess based on
    # the convention `mc#` = same repo, `internal#` = molecule-ai/internal).
    if not os.environ.get("INTERNAL_REPO"):
        os.environ["INTERNAL_REPO"] = "molecule-ai/internal"

    if not wf_dir.is_dir():
        sys.stderr.write(
            f"::error::workflows directory not found: {wf_dir}\n"
        )
        return 2

    yml_files = _iter_workflow_files(wf_dir)
    if not yml_files:
        print(f"::notice::no workflow files under {wf_dir}; nothing to lint.")
        return 0

    violations: list[str] = []
    notices: list[str] = []
    # Count of truthy continue-on-error directives seen, for the summary line.
    total_coe_true = 0

    for path in yml_files:
        raw = path.read_text(encoding="utf-8")
        raw_lines = raw.splitlines()
        try:
            # _LineLoader annotates mappings with per-key source lines.
            doc = yaml.load(raw, Loader=_LineLoader)
        except yaml.YAMLError as e:
            sys.stderr.write(
                f"::error file={path}::YAML parse error: {e}. Skipping "
                f"this file (lint-workflow-yaml will catch separately).\n"
            )
            continue

        coe_locs = find_coe_truthies(doc, raw_lines)
        for jkey, line in coe_locs:
            total_coe_true += 1
            tracker = find_tracker_in_window(raw_lines, line)
            if tracker is None:
                # No `# mc#NNN` / `# internal#NNN` near the directive.
                violations.append(
                    f"::error file={path},line={line}::lint-continue-on-error-"
                    f"tracking (Tier 2e): job '{jkey}' has "
                    f"`continue-on-error: true` at line {line} with no "
                    f"`# mc#NNNN` or `# internal#NNNN` tracker comment "
                    f"within {WINDOW} lines. Add a tracker reference so "
                    f"this mask has a forced 14-day renewal cycle. "
                    f"Memory: feedback_chained_defects_in_never_tested_workflows."
                )
                continue
            slug, num = tracker
            ok, reason = validate_tracker(slug, num, max_age)
            if ok:
                notices.append(
                    f"::notice::{path.name} job '{jkey}' (line {line}): "
                    f"{reason}"
                )
            else:
                violations.append(
                    f"::error file={path},line={line}::lint-continue-on-error-"
                    f"tracking (Tier 2e): job '{jkey}' "
                    f"`continue-on-error: true` references {slug}#{num}, "
                    f"but {reason}. FIX: close/fix the underlying defect "
                    f"and flip continue-on-error: false, OR file a fresh "
                    f"tracker and update the comment."
                )

    # Notices first (non-blocking), then the pass/fail summary.
    for n in notices:
        print(n)

    if violations:
        print(
            f"::error::lint-continue-on-error-tracking: "
            f"{len(violations)} violation(s) across {len(yml_files)} "
            f"workflow file(s) (of {total_coe_true} `continue-on-error: "
            f"true` directives in total)."
        )
        for v in violations:
            print(v)
        return 1

    print(
        f"::notice::lint-continue-on-error-tracking: "
        f"all {total_coe_true} `continue-on-error: true` directive(s) "
        f"have valid trackers (open, ≤{max_age}d old)."
    )
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Propagate the lint's exit status (0 / 1 / 2, see module docstring).
    sys.exit(run())
|
|
||||||
@ -1,361 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""lint_mask_pr_atomicity — Tier 2d structural enforcement per internal#350.
|
|
||||||
|
|
||||||
Rule
|
|
||||||
----
|
|
||||||
A PR whose diff touches `.gitea/workflows/ci.yml` AND modifies EITHER:
|
|
||||||
|
|
||||||
- any `continue-on-error:` value, OR
|
|
||||||
- the `all-required` sentinel job's `needs:` block
|
|
||||||
|
|
||||||
must EITHER:
|
|
||||||
|
|
||||||
- Touch BOTH atomically in the same PR (preferred), OR
|
|
||||||
- Cross-link the paired PR via a literal `Paired: #NNN` reference in
|
|
||||||
the PR body OR in any commit message between BASE_SHA and HEAD_SHA.
|
|
||||||
|
|
||||||
The class this prevents
|
|
||||||
-----------------------
|
|
||||||
PR#665 (interim `continue-on-error: true` on `platform-build`) and
|
|
||||||
PR#668 (sentinel-`needs` demotion of the same job) were designed as a
|
|
||||||
pair but merged solo — #665 landed at 04:47Z 2026-05-12, #668 was still
|
|
||||||
open at 05:07Z when the main-red watchdog (#674) fired. Result: ~20
|
|
||||||
minutes of `main` red and a cascade of false-positives on unrelated PRs.
|
|
||||||
|
|
||||||
The lint operates on the YAML AST (PyYAML), not grep, per
|
|
||||||
`feedback_behavior_based_ast_gates`: a refactor that moves `continue-on-error`
|
|
||||||
between job keys, or renames the `all-required` job, would still be
|
|
||||||
detected because we walk the parsed structure.
|
|
||||||
|
|
||||||
Why this works on Gitea 1.22.6
|
|
||||||
------------------------------
|
|
||||||
We don't use any 1.22.6-missing endpoints (no `/actions/runs/*`, no
|
|
||||||
`branch_protections/*` — Tier 2f/g need those; Tier 2d does not). All
|
|
||||||
required inputs come from the workflow `pull_request` event payload
|
|
||||||
(BASE_SHA, HEAD_SHA, PR_BODY) and from local git via `git show`/`git log`.
|
|
||||||
The auto-injected `GITHUB_TOKEN` is enough; we don't need
|
|
||||||
DRIFT_BOT_TOKEN.
|
|
||||||
|
|
||||||
Exit codes
|
|
||||||
----------
|
|
||||||
0 — ci.yml not in diff, OR diff is no-op for the rule predicates,
|
|
||||||
OR atomicity satisfied (both touched), OR a valid `Paired: #NNN`
|
|
||||||
reference is present.
|
|
||||||
1 — exactly ONE of {coe, sentinel-needs} touched AND no valid
|
|
||||||
`Paired: #NNN` reference. The split-pair regression class.
|
|
||||||
2 — env contract violation (BASE_SHA / HEAD_SHA missing) or YAML
|
|
||||||
parse error on either side.
|
|
||||||
|
|
||||||
Env
|
|
||||||
---
|
|
||||||
BASE_SHA — PR base (pull_request.base.sha)
|
|
||||||
HEAD_SHA — PR head (pull_request.head.sha)
|
|
||||||
PR_BODY — pull_request.body (may be empty)
|
|
||||||
CI_WORKFLOW_PATH — defaults to `.gitea/workflows/ci.yml`
|
|
||||||
SENTINEL_JOB_KEY — defaults to `all-required`
|
|
||||||
|
|
||||||
Memory cross-links
|
|
||||||
------------------
|
|
||||||
- internal#350 (the RFC that specs this lint)
|
|
||||||
- PR#665 / PR#668 (the empirical split-pair)
|
|
||||||
- mc#664 (the main-red incident)
|
|
||||||
- feedback_strict_root_only_after_class_a
|
|
||||||
- feedback_behavior_based_ast_gates
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
# Fail fast with exit code 2 (env/parse-contract violation) when PyYAML
# is absent; the ::error:: line surfaces in the Actions log.
try:
    import yaml
except ImportError:
    sys.stderr.write(
        "::error::PyYAML is required. Install with: pip install PyYAML\n"
    )
    sys.exit(2)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# YAML quirk: bare `on:` at the top level becomes Python `True` because
|
|
||||||
# `on` is a YAML 1.1 boolean. Not used here but documented for future
|
|
||||||
# editors who copy from this module.
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
# `Paired: #NNN` reference. `#` is mandatory, NNN must be digits. Any
# surrounding markdown/whitespace is fine. The match is case-sensitive
# on `Paired:` because lower-case `paired:` collides with conversational
# prose ("paired: see comment above") and the convention is the exact
# capitalisation. Searched in both the PR body and commit messages.
PAIRED_RE = re.compile(r"\bPaired:\s*#(?P<num>\d+)\b")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Env contract
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
def _env(key: str, default: str | None = None) -> str:
|
|
||||||
v = os.environ.get(key, default)
|
|
||||||
return v if v is not None else ""
|
|
||||||
|
|
||||||
|
|
||||||
def _require_env(key: str) -> str:
|
|
||||||
v = os.environ.get(key)
|
|
||||||
if not v:
|
|
||||||
sys.stderr.write(f"::error::missing required env var: {key}\n")
|
|
||||||
sys.exit(2)
|
|
||||||
return v
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# git-show helper. Returns None when the path doesn't exist on that side
|
|
||||||
# (new file, deleted file, or rename — git returns exit 128 with "fatal:
|
|
||||||
# path not in tree"). We treat None as "no rule predicate triggered on
|
|
||||||
# that side".
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
def git_show(sha: str, path: str) -> str | None:
    """Contents of `path` at commit `sha` via `git show`, or None when git
    fails (path absent on that side: new file, deleted file, or rename)."""
    proc = subprocess.run(
        ["git", "show", f"{sha}:{path}"],
        capture_output=True,
        text=True,
    )
    return proc.stdout if proc.returncode == 0 else None
|
|
||||||
|
|
||||||
|
|
||||||
def git_log_messages(base_sha: str, head_sha: str) -> str:
    """Concatenated commit messages (%B) for base..head; "" on git failure."""
    proc = subprocess.run(
        ["git", "log", "--format=%B", f"{base_sha}..{head_sha}"],
        capture_output=True,
        text=True,
    )
    return proc.stdout if proc.returncode == 0 else ""
|
|
||||||
|
|
||||||
|
|
||||||
def git_diff_paths(base_sha: str, head_sha: str) -> list[str]:
    """Non-blank changed paths between the two SHAs; [] on git failure."""
    proc = subprocess.run(
        ["git", "diff", "--name-only", f"{base_sha}..{head_sha}"],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        return []
    return [entry for entry in proc.stdout.splitlines() if entry.strip()]
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Predicate 1 — any `continue-on-error` value changed between base and head
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
def _collect_coe(doc: Any) -> dict[str, Any]:
|
|
||||||
"""Walk every job in `jobs.*` and collect its continue-on-error value.
|
|
||||||
|
|
||||||
Returns a dict {job_key: coe_value}. Missing keys are absent from
|
|
||||||
the dict (NOT `False` — distinguishes "added the key" from
|
|
||||||
"unchanged absent"). Job-step `continue-on-error` is NOT considered
|
|
||||||
— only job-level, because that's the value that masks job status
|
|
||||||
rollup, which is the class this lint targets.
|
|
||||||
"""
|
|
||||||
out: dict[str, Any] = {}
|
|
||||||
if not isinstance(doc, dict):
|
|
||||||
return out
|
|
||||||
jobs = doc.get("jobs")
|
|
||||||
if not isinstance(jobs, dict):
|
|
||||||
return out
|
|
||||||
for k, j in jobs.items():
|
|
||||||
if not isinstance(j, dict):
|
|
||||||
continue
|
|
||||||
if "continue-on-error" in j:
|
|
||||||
out[k] = j["continue-on-error"]
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
def coe_changed(base_doc: Any, head_doc: Any) -> tuple[bool, list[str]]:
    """Diff job-level continue-on-error between two workflow docs.

    Returns (changed?, reasons) with one reason line per job whose
    value — or presence — differs. A key missing on one side compares
    as the sentinel string '<absent>'.
    """
    base_map = _collect_coe(base_doc)
    head_map = _collect_coe(head_doc)
    diffs: list[str] = []
    for job_key in sorted(set(base_map) | set(head_map)):
        before = base_map.get(job_key, "<absent>")
        after = head_map.get(job_key, "<absent>")
        if before != after:
            diffs.append(
                f"job '{job_key}' continue-on-error: {before!r} → {after!r}"
            )
    return (bool(diffs), diffs)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Predicate 2 — sentinel job's `needs:` changed
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
def _collect_needs(doc: Any, sentinel_key: str) -> list[str] | None:
|
|
||||||
"""Return the sentinel job's needs list (sorted) or None if absent."""
|
|
||||||
if not isinstance(doc, dict):
|
|
||||||
return None
|
|
||||||
jobs = doc.get("jobs")
|
|
||||||
if not isinstance(jobs, dict):
|
|
||||||
return None
|
|
||||||
j = jobs.get(sentinel_key)
|
|
||||||
if not isinstance(j, dict):
|
|
||||||
return None
|
|
||||||
needs = j.get("needs")
|
|
||||||
if needs is None:
|
|
||||||
return []
|
|
||||||
if isinstance(needs, str):
|
|
||||||
return [needs]
|
|
||||||
if isinstance(needs, list):
|
|
||||||
# Sort because `needs:` is order-insensitive at the engine
|
|
||||||
# level; a reorder is not a semantic change and shouldn't
|
|
||||||
# trip the lint.
|
|
||||||
return sorted(str(x) for x in needs)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def sentinel_needs_changed(
    base_doc: Any, head_doc: Any, sentinel_key: str
) -> tuple[bool, str]:
    """Compare the sentinel job's normalized needs list across the
    diff. Returns (changed?, reason) where reason is "" when equal.
    """
    before = _collect_needs(base_doc, sentinel_key)
    after = _collect_needs(head_doc, sentinel_key)
    if before != after:
        return (
            True,
            f"sentinel '{sentinel_key}'.needs: {before!r} → {after!r}",
        )
    return (False, "")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Predicate 3 — `Paired: #NNN` present in body or any commit message
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
def find_paired_refs(pr_body: str, commit_log: str) -> list[str]:
    """All distinct issue numbers matched by PAIRED_RE in the PR body
    or the commit log, deduped and sorted (as strings, so the order is
    lexicographic). Either input may be None/empty.
    """
    numbers = {
        match.group("num")
        for text in (pr_body, commit_log)
        for match in PAIRED_RE.finditer(text or "")
    }
    return sorted(numbers)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Driver
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
def _parse(content: str | None, label: str) -> Any:
    """safe_load ``content``; None passes straight through (file absent
    on that side of the diff).

    A YAML syntax error is a hard configuration failure: annotate with
    ``::error::`` and exit(2) rather than lint a half-parsed document.
    """
    if content is None:
        return None
    try:
        return yaml.safe_load(content)
    except yaml.YAMLError as exc:
        sys.stderr.write(f"::error::YAML parse error on {label}: {exc}\n")
        sys.exit(2)
|
|
||||||
|
|
||||||
|
|
||||||
def run() -> int:
    """Drive lint-mask-pr-atomicity for one PR; return the exit code.

    0 — lint passed or did not apply (ci.yml untouched / newly added /
        deleted, no relevant change, atomic change, or a Paired ref
        present).
    1 — split-pair violation: exactly one of {continue-on-error,
        sentinel needs} changed and no `Paired: #NNN` reference found.
    (Configuration problems exit(2) inside _require_env/_parse.)
    """
    # Mandatory workflow inputs — exit(2) via _require_env when unset.
    base_sha = _require_env("BASE_SHA")
    head_sha = _require_env("HEAD_SHA")
    # Optional inputs with defaults (supplied by the workflow YAML).
    pr_body = _env("PR_BODY", "")
    ci_path = _env("CI_WORKFLOW_PATH", ".gitea/workflows/ci.yml")
    sentinel_key = _env("SENTINEL_JOB_KEY", "all-required")

    # Step 0 — is ci.yml even in the diff? If not, the lint doesn't apply.
    changed_paths = git_diff_paths(base_sha, head_sha)
    if ci_path not in changed_paths:
        print(
            f"::notice::{ci_path} not in PR diff; lint-mask-pr-atomicity "
            f"skipped (no atomicity risk)."
        )
        return 0

    # Snapshot both sides of the diff (no checkout; git_show reads blobs).
    base_yml = git_show(base_sha, ci_path)
    head_yml = git_show(head_sha, ci_path)

    # _parse exits(2) on malformed YAML; None passes through untouched.
    base_doc = _parse(base_yml, f"{ci_path}@{base_sha}")
    head_doc = _parse(head_yml, f"{ci_path}@{head_sha}")

    # If the file is newly added (no base), no flip is possible — every
    # value is "newly introduced", not "changed". Tier 2e covers the
    # tracking-issue check for new continue-on-error: true. Exit 0.
    if base_doc is None:
        print(
            f"::notice::{ci_path} newly added in this PR; no flip to "
            f"analyse — lint-mask-pr-atomicity skipped."
        )
        return 0

    # If the file is deleted on head, ditto — no atomicity question.
    if head_doc is None:
        print(
            f"::notice::{ci_path} deleted in this PR; "
            f"lint-mask-pr-atomicity skipped."
        )
        return 0

    # The two halves of the pair this lint keeps atomic.
    coe_yes, coe_reasons = coe_changed(base_doc, head_doc)
    needs_yes, needs_reason = sentinel_needs_changed(
        base_doc, head_doc, sentinel_key
    )

    # Neither half touched — ci.yml changed for unrelated reasons.
    if not coe_yes and not needs_yes:
        print(
            f"::notice::{ci_path} touched but neither continue-on-error "
            f"nor sentinel '{sentinel_key}'.needs changed — no atomicity "
            f"risk. OK."
        )
        return 0

    # Both halves touched in the same PR — the atomic (preferred) case.
    if coe_yes and needs_yes:
        print(
            f"::notice::Atomic change detected: both continue-on-error "
            f"AND sentinel '{sentinel_key}'.needs touched in same PR. OK."
        )
        for r in coe_reasons:
            print(f"  - {r}")
        print(f"  - {needs_reason}")
        return 0

    # Exactly one side touched — require Paired: #NNN reference.
    commit_log = git_log_messages(base_sha, head_sha)
    paired = find_paired_refs(pr_body, commit_log)

    # Human-readable labels: the side that changed vs. the missing one.
    one_side = "continue-on-error" if coe_yes else f"sentinel '{sentinel_key}'.needs"
    other_side = (
        f"sentinel '{sentinel_key}'.needs" if coe_yes else "continue-on-error"
    )

    # Split pair, but explicitly cross-referenced — allowed.
    if paired:
        print(
            f"::notice::Split-pair detected ({one_side} changed without "
            f"{other_side}), but Paired reference(s) present: "
            f"{', '.join('#' + n for n in paired)}. OK."
        )
        for r in coe_reasons:
            print(f"  - {r}")
        if needs_reason:
            print(f"  - {needs_reason}")
        return 0

    # The failure mode this lint exists to prevent.
    print(
        f"::error file={ci_path}::lint-mask-pr-atomicity (Tier 2d): "
        f"PR touches {one_side} in {ci_path} but NOT {other_side}, "
        f"and no `Paired: #NNN` reference was found in the PR body or "
        f"in commit messages between {base_sha[:8]}..{head_sha[:8]}. "
        f"This is the PR#665+#668 split-pair regression class "
        f"(see internal#350, mc#664). FIX: either (a) include the "
        f"matching {other_side} change in the same PR (preferred), or "
        f"(b) add `Paired: #NNN` (literal, capital P, with `#`) to the "
        f"PR body or a commit message referencing the paired PR."
    )
    for r in coe_reasons:
        print(f"  - {r}")
    if needs_reason:
        print(f"  - {needs_reason}")
    return 1
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Propagate run()'s exit code (0 ok/skip, 1 finding, 2 config error).
    raise SystemExit(run())
|
|
||||||
@ -1,681 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""lint-pre-flip-continue-on-error — block a PR that flips a job from
|
|
||||||
``continue-on-error: true`` to ``continue-on-error: false`` (or removes
|
|
||||||
the key while the base had it ``true``) without proof that the job's
|
|
||||||
recent runs on the target branch are actually green.
|
|
||||||
|
|
||||||
Empirical class — PR #656 / mc#664:
|
|
||||||
PR #656 (RFC internal#219 Phase 4) flipped 5 ``platform-build``-class
|
|
||||||
jobs ``continue-on-error: true → false`` on the basis of a
|
|
||||||
"verified green on main via combined-status check". But that "green"
|
|
||||||
was the LIE produced by the prior ``continue-on-error: true``:
|
|
||||||
Gitea Quirk #10 (internal#342 + dup #287) — when a step inside a
|
|
||||||
job marked ``continue-on-error: true`` fails, the job-level status
|
|
||||||
is still rolled up as ``success``. So the precondition the PR
|
|
||||||
claimed to verify was structurally fooled by the bug being
|
|
||||||
flipped.
|
|
||||||
|
|
||||||
mc#664 then captured the surfaced defects (2 unrelated, mutually-
|
|
||||||
masked regressions):
|
|
||||||
|
|
||||||
Class 1: sqlmock helper drift since 2f36bb9a (24 days old)
|
|
||||||
Class 2: OFFSEC-001 contract collision since 7d1a189f (1 day old)
|
|
||||||
|
|
||||||
Codified 04:35Z as hongming-pc2 charter §SOP-N rule (e)
|
|
||||||
"run-log-grep-before-flip": pull the actual run log + grep for
|
|
||||||
``--- FAIL`` / ``FAIL\\s`` BEFORE flipping; don't trust the masked
|
|
||||||
combined-status.
|
|
||||||
|
|
||||||
This script structurally enforces that rule at PR time.
|
|
||||||
|
|
||||||
How it works (one PR tick):
|
|
||||||
1. Parse the diff: compare ``.gitea/workflows/*.yml`` at PR base
|
|
||||||
vs PR head. For each file present in both, parse the YAML AST
|
|
||||||
and walk ``jobs.<key>.continue-on-error`` on each side. A
|
|
||||||
"flip" is base ∈ {true} AND head ∈ {false, None/absent}. We
|
|
||||||
coerce truthy/falsy per YAML semantics (PyYAML normalizes
|
|
||||||
``true``/``True``/``yes`` to ``True``).
|
|
||||||
2. For each flipped job, derive its commit-status context name as
|
|
||||||
``"{workflow.name} / {job.name or job.key} (push)"`` — that's
|
|
||||||
how Gitea Actions emits the context for runs on
|
|
||||||
``main``/``staging`` (push event, see also expected_context()
|
|
||||||
in ci-required-drift.py).
|
|
||||||
3. Pull the last N commits of the target branch (PR base), fetch
|
|
||||||
combined commit-status per commit, scan ``statuses[]`` for
|
|
||||||
contexts matching ANY of the flipped jobs. For each match,
|
|
||||||
fetch the actual run log via the web-UI route
|
|
||||||
``{server_url}/{repo}/actions/runs/{run_id}/jobs/{job_idx}/logs``
|
|
||||||
(per memory ``reference_gitea_actions_log_fetch`` — Gitea 1.22.6
|
|
||||||
lacks REST ``/actions/runs/*`` endpoints; the web-UI route is the
|
|
||||||
only working path; see ``reference_gitea_1_22_6_lacks_rest_rerun_endpoints``).
|
|
||||||
4. Grep each log for the Go-test failure markers ``--- FAIL`` /
|
|
||||||
``FAIL\\s+<package>`` AND the bash-step error sentinel
|
|
||||||
``::error::``. If ANY recent log shows any of these AND the
|
|
||||||
status itself reads ``success``, the job was masked. ``::error::``
|
|
||||||
the flip with the offending test name + offending run URL +
|
|
||||||
the regression commit (HEAD of the run).
|
|
||||||
5. Exit 1 if any flips have at least one masked run; exit 0
|
|
||||||
otherwise.
|
|
||||||
|
|
||||||
Halt-on-noise contract:
|
|
||||||
- If a recent log fetch 404s (already-pruned-via-act_runner-gc,
|
|
||||||
transient gitea-web outage): emit ``::warning::`` and treat the
|
|
||||||
run as "log unavailable" — does NOT block the flip; logged so
|
|
||||||
a curious reviewer can re-run.
|
|
||||||
- If a flipped job has ZERO recent runs on the target branch (newly
|
|
||||||
added workflow): emit ``::warning::`` "no run history to verify"
|
|
||||||
and allow the flip. This is the only way a NEW workflow can ever
|
|
||||||
ship with ``continue-on-error: false``; otherwise we'd have a
|
|
||||||
chicken-and-egg.
|
|
||||||
|
|
||||||
Behavior-based AST gate per ``feedback_behavior_based_ast_gates``:
|
|
||||||
- YAML parsed via PyYAML safe_load on BOTH sides of the diff
|
|
||||||
- No grep-by-line — formatting changes (comment churn, key order)
|
|
||||||
don't false-positive a flip
|
|
||||||
- Job-key match — so a rename ``platform-build → core-be-build``
|
|
||||||
appears as a DELETE + an ADD, not a flip (the delete side has no
|
|
||||||
new value to compare against; the add side has no base side).
|
|
||||||
|
|
||||||
Run locally (works against this repo, requires PyYAML + Gitea token
|
|
||||||
that can read combined-commit-status):
|
|
||||||
|
|
||||||
GITEA_TOKEN=... GITEA_HOST=git.moleculesai.app \\
|
|
||||||
REPO=molecule-ai/molecule-core BASE_REF=main \\
|
|
||||||
BASE_SHA=$(git rev-parse origin/main) \\
|
|
||||||
HEAD_SHA=$(git rev-parse HEAD) \\
|
|
||||||
python3 .gitea/scripts/lint_pre_flip_continue_on_error.py \\
|
|
||||||
--dry-run
|
|
||||||
|
|
||||||
Cross-links: PR#656, mc#664, PR#665 (the interim re-mask),
|
|
||||||
Quirk #10 (internal#342 + dup #287), hongming-pc2 charter §SOP-N
|
|
||||||
rule (e), feedback_strict_root_only_after_class_a,
|
|
||||||
feedback_no_shared_persona_token_use.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import urllib.error
|
|
||||||
import urllib.parse
|
|
||||||
import urllib.request
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import yaml # PyYAML 6.0.2 — installed by the workflow before this runs.
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Environment (read at module-import; runtime contract enforced in main())
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def _env(key: str, *, default: str = "") -> str:
|
|
||||||
return os.environ.get(key, default)
|
|
||||||
|
|
||||||
|
|
||||||
# Auth + addressing inputs, read once at import time. Presence is
# enforced at runtime by _require_runtime_env() (see the header note
# "runtime contract enforced in main()"), so importing the module with
# a bare environment does not abort.
GITEA_TOKEN = _env("GITEA_TOKEN")
GITEA_HOST = _env("GITEA_HOST")
REPO = _env("REPO")
BASE_REF = _env("BASE_REF", default="main")
BASE_SHA = _env("BASE_SHA")
HEAD_SHA = _env("HEAD_SHA")
# How many recent commits to scan on the target branch. 5 by default;
# enough to catch a job that only fails intermittently, not so many
# that the script paginates needlessly. Per spec.
RECENT_COMMITS_N = int(_env("RECENT_COMMITS_N", default="5"))

# "owner/name" split; padding with "" tolerates a missing slash.
OWNER, NAME = (REPO.split("/", 1) + [""])[:2] if REPO else ("", "")
# REST base vs web-UI base (the latter is needed for log fetches).
API = f"https://{GITEA_HOST}/api/v1" if GITEA_HOST else ""
WEB = f"https://{GITEA_HOST}" if GITEA_HOST else ""

# Failure markers we grep for in the run log.
# --- FAIL — Go test failure marker
# FAIL\s — `FAIL github.com/x/y` package-level rollup
# ::error:: — bash-step `::error::` lines (the lint-curl-status-capture
#             pattern: a `python3 <<PY` block writing `::error::` then
#             sys.exit(1); also any shell `echo "::error::..."` from
#             jobs that wrap pytest/eslint/etc. and convert
#             non-zero exits into masked-by-CoE status)
# NOTE: "FAIL " (trailing space) will also match any log line merely
# containing that substring — broad by design, per the markers above.
FAIL_PATTERNS = (
    "--- FAIL",
    "FAIL\t",
    "FAIL ",
    "::error::",
)
|
|
||||||
|
|
||||||
|
|
||||||
def _require_runtime_env() -> None:
|
|
||||||
for key in ("GITEA_TOKEN", "GITEA_HOST", "REPO", "BASE_REF", "BASE_SHA", "HEAD_SHA"):
|
|
||||||
if not os.environ.get(key):
|
|
||||||
sys.stderr.write(f"::error::missing required env var: {key}\n")
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Tiny HTTP helper (no requests dependency)
|
|
||||||
# Mirrors the api()/ApiError contract in ci-required-drift.py +
|
|
||||||
# main-red-watchdog.py per feedback_api_helper_must_raise_not_return_dict.
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
class ApiError(RuntimeError):
    """A Gitea API/web call did not verifiably succeed.

    Raised instead of soft-failing on non-2xx: soft failure is the
    duplicate-write bug factory in find-or-create flows (PR #112
    Five-Axis), and here it would let a transient gitea-web 502
    silently allow a flip whose recent runs were never actually
    verified — exactly the regression class this lint exists to close.
    """
|
|
||||||
|
|
||||||
|
|
||||||
def http(
    method: str,
    url: str,
    *,
    body: dict | None = None,
    headers: dict[str, str] | None = None,
    expect_json: bool = True,
    timeout: int = 30,
) -> tuple[int, Any, bytes]:
    """Tiny HTTP helper around urllib.

    Returns (status, parsed_or_None, raw_bytes). Raises ApiError on any
    non-2xx response. ``expect_json=False`` returns raw bytes in the
    parsed slot (for log-fetch from the web-UI which returns text/plain).

    NOTE(review): urllib.error.URLError (DNS/connection failure) is not
    caught here and propagates to the caller — presumably intentional
    (hard infra failure, not an HTTP status); confirm.
    """
    # Token auth on every call; Accept mirrors the expected body type.
    final_headers = {
        "Authorization": f"token {GITEA_TOKEN}",
        "Accept": "application/json" if expect_json else "text/plain",
    }
    if headers:
        final_headers.update(headers)
    data = None
    if body is not None:
        # JSON-encode the request body and mark it as such.
        data = json.dumps(body).encode("utf-8")
        final_headers["Content-Type"] = "application/json"
    req = urllib.request.Request(url, method=method, data=data, headers=final_headers)
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
            status = resp.status
    except urllib.error.HTTPError as e:
        # HTTPError still carries a status + body; normalize both and
        # let the shared non-2xx check below decide whether to raise.
        raw = e.read() or b""
        status = e.code

    if not (200 <= status < 300):
        # Bound the error snippet — error bodies can be large HTML pages.
        snippet = raw[:500].decode("utf-8", errors="replace") if raw else ""
        raise ApiError(f"{method} {url} → HTTP {status}: {snippet}")

    if not expect_json:
        # Raw bytes fill the "parsed" slot for text/plain responses.
        return status, raw, raw
    if not raw:
        # Empty (204-style) body: parsed slot is None, not an error.
        return status, None, raw
    try:
        return status, json.loads(raw), raw
    except json.JSONDecodeError as e:
        raise ApiError(f"{method} {url} → HTTP {status} but body is not JSON: {e}") from e
|
|
||||||
|
|
||||||
|
|
||||||
def api(method: str, path: str, *, body: dict | None = None, query: dict[str, str] | None = None) -> tuple[int, Any]:
    """Call the Gitea REST API at the API-relative ``path``
    (``/repos/...``). ``query``, if given, is URL-encoded onto the
    path. Returns (status, parsed_json); raises ApiError on non-2xx.
    """
    suffix = f"?{urllib.parse.urlencode(query)}" if query else ""
    status, parsed, _raw = http(method, f"{API}{path}{suffix}", body=body, expect_json=True)
    return status, parsed
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# YAML parsing — coerce truthy/falsy for continue-on-error
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def _coerce_coe(val: Any) -> bool:
|
|
||||||
"""Coerce a continue-on-error YAML value to bool.
|
|
||||||
|
|
||||||
PyYAML safe_load normalizes ``true``/``True``/``yes``/``on`` to
|
|
||||||
Python ``True`` and ``false``/``False``/``no``/``off`` / absence
|
|
||||||
to ``False`` (we treat absence/None as False here too — that's the
|
|
||||||
GitHub Actions default semantics).
|
|
||||||
|
|
||||||
Edge cases:
|
|
||||||
- String ``"true"`` (quoted in YAML) — kept as the string
|
|
||||||
``"true"``, falsy under bool() but a flip we DO care about
|
|
||||||
catching. Normalize string forms case-insensitively to bool
|
|
||||||
so the diff is consistent with the runtime behavior of
|
|
||||||
Gitea Actions, which YAML-parses the same way.
|
|
||||||
"""
|
|
||||||
if isinstance(val, bool):
|
|
||||||
return val
|
|
||||||
if val is None:
|
|
||||||
return False
|
|
||||||
if isinstance(val, str):
|
|
||||||
return val.strip().lower() in ("true", "yes", "on", "1")
|
|
||||||
return bool(val)
|
|
||||||
|
|
||||||
|
|
||||||
def jobs_coe_map(workflow_doc: dict) -> dict[str, bool]:
    """``{job_key: continue_on_error_bool}`` for every job in the
    workflow document.

    Job-level ``continue-on-error`` only — per-step values are a
    separate masking class handled by the test suite + reviewer, not
    this gate. Non-dict ``jobs`` sections and non-dict job entries are
    skipped.
    """
    jobs = workflow_doc.get("jobs")
    if not isinstance(jobs, dict):
        return {}
    return {
        key: _coerce_coe(job.get("continue-on-error"))
        for key, job in jobs.items()
        if isinstance(job, dict)
    }
|
|
||||||
|
|
||||||
|
|
||||||
def workflow_name(workflow_doc: dict, *, fallback: str = "") -> str:
    """Stripped top-level ``name:`` of the workflow, or ``fallback``
    when the key is missing, not a string, or blank (mirrors Gitea
    Actions falling back to the filename).
    """
    candidate = workflow_doc.get("name")
    if isinstance(candidate, str) and candidate.strip():
        return candidate.strip()
    return fallback
|
|
||||||
|
|
||||||
|
|
||||||
def job_display_name(workflow_doc: dict, job_key: str) -> str:
    """Display name of a job: ``jobs.<key>.name`` when present and
    non-blank, else the job key itself. Mirrors expected_context() in
    ci-required-drift.py.

    Fix: guard against a non-mapping ``jobs`` value (e.g. a scalar in
    malformed YAML). The previous ``.get("jobs", {}).get(...)`` chain
    raised AttributeError there, while every sibling helper
    isinstance-checks the same structure; now we fall back to the key.
    """
    jobs = workflow_doc.get("jobs")
    job = jobs.get(job_key) if isinstance(jobs, dict) else None
    if isinstance(job, dict):
        name = job.get("name")
        if isinstance(name, str) and name.strip():
            return name.strip()
    return job_key
|
|
||||||
|
|
||||||
|
|
||||||
def context_name(workflow_name_str: str, job_name_str: str, event: str = "push") -> str:
    """Commit-status context exactly as Gitea Actions renders it:
    ``"<workflow> / <job> (<event>)"``.

    The default ``event="push"`` matches recent-runs-on-main lookups;
    pass ``"pull_request"`` for PR-context lookups.
    """
    return "{} / {} ({})".format(workflow_name_str, job_name_str, event)
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Diff detection — flips, not arbitrary changes
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def detect_flips(
    base_workflows: dict[str, str],
    head_workflows: dict[str, str],
) -> list[dict]:
    """Compare per-file CoE maps; return a list of flip records.

    Inputs are ``{path: yaml_text}`` for both sides. Output records
    have the shape::

        {
            "workflow_path": ".gitea/workflows/ci.yml",
            "workflow_name": "CI",
            "job_key": "platform-build",
            "job_name": "Platform (Go)",
            "context": "CI / Platform (Go) (push)",
        }

    A flip is base[CoE] ∈ {True} AND head[CoE] ∈ {False}. Files
    only present on one side are skipped — adding a new workflow
    with ``CoE: false`` is fine (no history to mask), and removing
    a workflow can't possibly flip anything.
    """
    flips: list[dict] = []
    for path, base_text in base_workflows.items():
        # Only files present on BOTH sides can flip.
        if path not in head_workflows:
            continue
        try:
            # `or {}` maps an empty YAML document (None) to a dict.
            base_doc = yaml.safe_load(base_text) or {}
            head_doc = yaml.safe_load(head_workflows[path]) or {}
        except yaml.YAMLError as e:
            # Don't block on a parse error — the YAML lint workflows
            # catch invalid YAML separately. Just warn so the failing
            # file is visible.
            sys.stderr.write(f"::warning file={path}::YAML parse error: {e}\n")
            continue
        # Scalar top-level documents can't carry jobs; skip them.
        if not isinstance(base_doc, dict) or not isinstance(head_doc, dict):
            continue
        base_map = jobs_coe_map(base_doc)
        head_map = jobs_coe_map(head_doc)
        # Workflow name from the HEAD side; filename stem as fallback.
        wf_name = workflow_name(head_doc, fallback=os.path.basename(path).rsplit(".", 1)[0])
        for job_key, base_val in base_map.items():
            if job_key not in head_map:
                continue  # job removed — not a flip
            # Job-key match only: a rename reads as DELETE + ADD, so
            # neither side produces a flip here (by design).
            if base_val is True and head_map[job_key] is False:
                flips.append({
                    "workflow_path": path,
                    "workflow_name": wf_name,
                    "job_key": job_key,
                    "job_name": job_display_name(head_doc, job_key),
                    "context": context_name(wf_name, job_display_name(head_doc, job_key), "push"),
                })
    return flips
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Git: snapshot every .gitea/workflows/*.yml at a SHA (no checkout)
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def _git(*args: str, cwd: str | None = None) -> str:
|
|
||||||
"""Run ``git`` and return stdout (text)."""
|
|
||||||
result = subprocess.run(
|
|
||||||
["git", *args],
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
check=False,
|
|
||||||
cwd=cwd,
|
|
||||||
)
|
|
||||||
if result.returncode != 0:
|
|
||||||
raise RuntimeError(f"git {args!r} failed: {result.stderr.strip()}")
|
|
||||||
return result.stdout
|
|
||||||
|
|
||||||
|
|
||||||
def workflows_at_sha(sha: str, *, repo_dir: str | None = None) -> dict[str, str]:
    """Snapshot every ``.gitea/workflows/*.yml`` / ``*.yaml`` blob at
    ``sha`` as ``{path: text}``.

    Uses ``git ls-tree`` + ``git show`` so the SHA never needs to be
    checked out (the workflow runs on the PR head; the base SHA is
    fetched, not checked out).
    """
    snapshot: dict[str, str] = {}
    listing = _git("ls-tree", "-r", "--name-only", sha, ".gitea/workflows/", cwd=repo_dir)
    for raw_line in listing.splitlines():
        path = raw_line.strip()
        if not path.endswith((".yml", ".yaml")):
            continue
        try:
            snapshot[path] = _git("show", f"{sha}:{path}", cwd=repo_dir)
        except RuntimeError:
            # Symlink or other non-blob entry; skip it.
            continue
    return snapshot
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Gitea: recent commits + per-commit combined status + log fetch
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def recent_commits_on_branch(branch: str, n: int) -> list[str]:
    """Last ``n`` commit SHAs on ``branch`` via the REST ``/commits``
    endpoint (``sha=branch&limit=n``). Callers treat the result as a
    set, so ordering is irrelevant. Raises ApiError when the endpoint
    returns something other than a list.
    """
    _, body = api(
        "GET",
        f"/repos/{OWNER}/{NAME}/commits",
        query={"sha": branch, "limit": str(n)},
    )
    if not isinstance(body, list):
        raise ApiError(f"/commits for {branch} returned non-list: {type(body).__name__}")
    shas: list[str] = []
    for entry in body:
        if not isinstance(entry, dict):
            continue
        sha = entry.get("sha") or (entry.get("commit", {}) or {}).get("id")
        if isinstance(sha, str) and len(sha) >= 7:
            shas.append(sha)
    return shas
|
|
||||||
|
|
||||||
|
|
||||||
def combined_status(sha: str) -> dict:
    """Combined commit status for ``sha`` (same shape as
    ``main-red-watchdog.get_combined_status``). A non-dict body is an
    ApiError, never a silent empty result.
    """
    _, body = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
    if isinstance(body, dict):
        return body
    raise ApiError(f"combined-status for {sha} not a dict")
|
|
||||||
|
|
||||||
|
|
||||||
def _entry_state(s: dict) -> str:
|
|
||||||
"""Per-entry state — Gitea 1.22.6 schema asymmetry: top-level
|
|
||||||
uses ``state``, per-entry uses ``status``. Defensive fallback per
|
|
||||||
main-red-watchdog.py line 233."""
|
|
||||||
return s.get("status") or s.get("state") or ""
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_log(target_url: str) -> str | None:
    """Fetch a job's run log text via its web-UI ``target_url``.

    Gitea 1.22.6 has no REST ``/api/v1/.../actions/runs/*`` endpoints
    (per reference_gitea_1_22_6_lacks_rest_rerun_endpoints), so the
    web-UI job route with ``/logs`` appended is the only working path
    (per reference_gitea_actions_log_fetch). Relative URLs
    ("/owner/repo/...") are resolved against WEB.

    Returns the decoded log on success, None after a ``::warning::``
    when the fetch fails (404 / pruned log / network error) — callers
    treat None as "log unavailable, warn-not-fail".
    """
    if not target_url:
        return None
    # Resolve relative routes; absolute ones are used verbatim.
    url = f"{WEB}{target_url}" if target_url.startswith("/") else target_url
    if not url.endswith("/logs"):
        url = f"{url}/logs"
    try:
        _, payload, _ = http("GET", url, expect_json=False, timeout=60)
    except ApiError as exc:
        sys.stderr.write(f"::warning::log fetch failed for {url}: {exc}\n")
        return None
    if isinstance(payload, bytes):
        return payload.decode("utf-8", errors="replace")
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def grep_fail_markers(log_text: str) -> list[str]:
|
|
||||||
"""Return up to 5 sample matching lines for any FAIL_PATTERNS hit.
|
|
||||||
Empty list = clean log."""
|
|
||||||
matches: list[str] = []
|
|
||||||
for line in log_text.splitlines():
|
|
||||||
for pat in FAIL_PATTERNS:
|
|
||||||
if pat in line:
|
|
||||||
# Truncate to keep error output bounded.
|
|
||||||
matches.append(line.strip()[:240])
|
|
||||||
break
|
|
||||||
if len(matches) >= 5:
|
|
||||||
break
|
|
||||||
return matches
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Verification: for one flip, scan recent runs on BASE_REF
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def verify_flip(flip: dict, branch: str, n: int) -> dict:
    """Scan the last ``n`` commits on ``branch``. For each commit whose
    combined status contains a context matching ``flip["context"]``,
    fetch the run log and grep for FAIL markers.

    Args:
        flip: one detected flip, with at least a ``"context"`` key (the
            status-check context name to verify).
        branch: branch whose recent commits are scanned (the PR base).
        n: how many recent commits to inspect.

    Returns::

        {
            "flip": flip,
            "checked_commits": int,  # how many commits had a matching context
            "masked_runs": [  # runs where log shows FAIL despite status==success
                {"sha": "...", "status": "success", "target_url": "...", "samples": [...]},
                ...
            ],
            "fail_runs": [  # runs where status itself is failure/error
                {"sha": "...", "status": "failure", "target_url": "...", "samples": [...]},
                ...
            ],
            "warnings": [str],  # log-unavailable warnings (not blocking)
        }

    Blocking condition: ``masked_runs`` OR ``fail_runs`` non-empty.
    A ``success`` status with a clean log is the only "OK to flip"
    outcome (per hongming-pc2 §SOP-N rule (e)).
    """
    target_context = flip["context"]
    result = {
        "flip": flip,
        "checked_commits": 0,
        "masked_runs": [],
        "fail_runs": [],
        "warnings": [],
    }

    # No commits at all → cannot verify; warn-not-fail (non-blocking).
    shas = recent_commits_on_branch(branch, n)
    if not shas:
        result["warnings"].append(
            f"no recent commits on {branch} (cannot verify flip)"
        )
        return result

    for sha in shas:
        try:
            status_doc = combined_status(sha)
        except ApiError as e:
            # Per-commit API failure is a warning, not a verdict — keep
            # scanning the remaining commits.
            result["warnings"].append(f"combined-status for {sha}: {e}")
            continue
        statuses = status_doc.get("statuses") or []
        # First entry matching the context name. Newest SHAs come
        # first; one entry per context per SHA is the usual shape.
        for s in statuses:
            if not isinstance(s, dict):
                continue
            if s.get("context") != target_context:
                continue
            result["checked_commits"] += 1
            state = _entry_state(s)
            target_url = s.get("target_url") or ""
            log_text = fetch_log(target_url)
            if log_text is None:
                result["warnings"].append(
                    f"log unavailable for {sha} {target_context}"
                )
                # Still record the status itself if it's red — that's
                # a hard signal that doesn't need log access.
                if state in ("failure", "error"):
                    result["fail_runs"].append({
                        "sha": sha,
                        "status": state,
                        "target_url": target_url,
                        "samples": ["[log unavailable; status itself is " + state + "]"],
                    })
                break
            samples = grep_fail_markers(log_text)
            if state in ("failure", "error"):
                result["fail_runs"].append({
                    "sha": sha,
                    "status": state,
                    "target_url": target_url,
                    "samples": samples or ["[no FAIL markers found but status is " + state + "]"],
                })
            elif samples and state == "success":
                # The bug class: status==success while log shows FAIL.
                # That's exactly Quirk #10 (continue-on-error masking).
                result["masked_runs"].append({
                    "sha": sha,
                    "status": state,
                    "target_url": target_url,
                    "samples": samples,
                })
            # NOTE(review): a "pending" state with FAIL markers in the
            # log lands in neither bucket — presumably intentional
            # (run still in progress); confirm.
            # Either way, we matched one context entry for this SHA;
            # don't keep looping `statuses[]`.
            break

    if result["checked_commits"] == 0:
        result["warnings"].append(
            f"no runs of {target_context!r} found in the last {n} commits on "
            f"{branch} — cannot verify; allowing flip with warning"
        )
    return result
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Report rendering
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def render_flip_report(verdict: dict) -> str:
    """Render one flip's verification verdict as an indented text block
    for the workflow run log.

    Args:
        verdict: shape produced by ``verify_flip`` — keys ``flip``,
            ``checked_commits``, ``fail_runs``, ``masked_runs``,
            ``warnings``.

    Returns:
        Multi-line string (no trailing newline).
    """

    def _absolute(url: str) -> str:
        # target_url may be relative ("/owner/repo/..."); render the
        # absolute form for click-through. Factored out — previously
        # duplicated for fail_runs and masked_runs.
        return f"{WEB}{url}" if url.startswith("/") else url

    flip = verdict["flip"]
    lines = [
        f"job: {flip['job_key']} ({flip['context']})",
        f"  workflow: {flip['workflow_path']}",
        f"  checked_commits: {verdict['checked_commits']}",
    ]
    for run in verdict["fail_runs"]:
        url = _absolute(run["target_url"])
        lines.append(f"  fail run {run['sha'][:10]} (status={run['status']}): {url}")
        for sample in run["samples"]:
            lines.append(f"    | {sample}")
    for run in verdict["masked_runs"]:
        url = _absolute(run["target_url"])
        lines.append(
            f"  MASKED run {run['sha'][:10]} (status=success, log shows FAIL): {url}"
        )
        for sample in run["samples"]:
            lines.append(f"    | {sample}")
    for w in verdict["warnings"]:
        lines.append(f"  warning: {w}")
    return "\n".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Main
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
|
||||||
p = argparse.ArgumentParser(
|
|
||||||
prog="lint-pre-flip-continue-on-error",
|
|
||||||
description="Block a PR that flips continue-on-error true→false "
|
|
||||||
"without proof recent runs are actually green.",
|
|
||||||
)
|
|
||||||
p.add_argument(
|
|
||||||
"--dry-run",
|
|
||||||
action="store_true",
|
|
||||||
help="Detect + print findings to stdout; never exit non-zero. "
|
|
||||||
"Useful for local testing.",
|
|
||||||
)
|
|
||||||
return p.parse_args(argv)
|
|
||||||
|
|
||||||
|
|
||||||
def main(argv: list[str] | None = None) -> int:
    """Entry point: detect continue-on-error true→false flips in this
    PR and verify each against recent runs on the base branch.

    Args:
        argv: CLI args for ``_parse_args`` (None → sys.argv).

    Returns:
        0 when there are no flips, all flips verify safe, or ``--dry-run``
        is set; 1 when at least one flip fails verification.

    Fix vs. previous revision: the all-flips-safe path fell off the end
    of the function and returned None; ``sys.exit(None)`` happened to
    exit 0, but the declared ``-> int`` contract was violated. An
    explicit ``return 0`` now covers that path.
    """
    args = _parse_args(argv)
    _require_runtime_env()

    # Compare workflow definitions at the PR's base and head commits.
    base_workflows = workflows_at_sha(BASE_SHA)
    head_workflows = workflows_at_sha(HEAD_SHA)
    flips = detect_flips(base_workflows, head_workflows)

    if not flips:
        print("::notice::no continue-on-error true→false flips in this PR")
        return 0

    print(f"::notice::detected {len(flips)} continue-on-error true→false flip(s); verifying recent runs on {BASE_REF}")
    bad_flips: list[dict] = []
    for flip in flips:
        verdict = verify_flip(flip, BASE_REF, RECENT_COMMITS_N)
        report = render_flip_report(verdict)
        if verdict["fail_runs"] or verdict["masked_runs"]:
            print(f"::error file={flip['workflow_path']}::flip of {flip['job_key']} "
                  f"({flip['context']}) blocked — recent runs on {BASE_REF} show "
                  f"FAIL markers OR are red. Pull each run log below + grep "
                  f"`--- FAIL` / `FAIL ` / `::error::` — DON'T trust the masked "
                  f"combined-status. See hongming-pc2 charter §SOP-N rule (e). "
                  f"PR#656 / mc#664 reference class.")
            bad_flips.append(verdict)
        else:
            print(f"::notice::flip of {flip['job_key']} ({flip['context']}) is safe — "
                  f"{verdict['checked_commits']} recent run(s), no FAIL markers")
        # Always print the per-flip detail block so the human-readable
        # report is in the run log for both safe and unsafe flips.
        print(f"::group::flip detail: {flip['job_key']}")
        print(report)
        print("::endgroup::")

    if bad_flips and not args.dry_run:
        print(f"::error::{len(bad_flips)}/{len(flips)} flip(s) failed pre-flip verification")
        return 1
    if bad_flips and args.dry_run:
        print(f"::warning::[dry-run] {len(bad_flips)}/{len(flips)} flip(s) WOULD fail; exit 0 forced")
        return 0
    # All flips verified safe: explicit success (was an implicit None).
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Propagate main()'s int return as the process exit status.
    raise SystemExit(main())
|
|
||||||
@ -1,606 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""main-red-watchdog — Option C of the "main NEVER goes red" directive.
|
|
||||||
|
|
||||||
Tracking: molecule-core#420.
|
|
||||||
|
|
||||||
What it does (one cron tick):
|
|
||||||
1. GET /api/v1/repos/{owner}/{repo}/branches/{watch_branch}
|
|
||||||
→ current HEAD SHA on the watched branch.
|
|
||||||
2. GET /api/v1/repos/{owner}/{repo}/commits/{SHA}/status
|
|
||||||
→ combined status + per-context statuses.
|
|
||||||
3. If combined state is `failure` (or any individual status is
|
|
||||||
`failure`): open or PATCH an idempotent
|
|
||||||
`[main-red] {repo}: {SHA[:10]}` issue. Body lists each failed
|
|
||||||
status context with `target_url` + `description`.
|
|
||||||
4. If combined state is `success`: close any open `[main-red]
|
|
||||||
{repo}: ...` issue on a previous SHA with a
|
|
||||||
"main returned to green at SHA {current_SHA}" comment.
|
|
||||||
5. Emit one Loki-shaped JSON line via `logger -t main-red-watchdog`
|
|
||||||
so `reference_obs_stack_phase1`'s Vector → Loki path ingests an
|
|
||||||
alert event (queryable in Grafana as
|
|
||||||
`{tenant="operator-host"} |~ "main-red-watchdog"`).
|
|
||||||
|
|
||||||
What it does NOT do:
|
|
||||||
- Auto-revert anything. Option B is explicitly rejected per
|
|
||||||
`feedback_no_such_thing_as_flakes` + `feedback_fix_root_not_symptom`.
|
|
||||||
- Page on its own failures. If api() raises ApiError (transient
|
|
||||||
Gitea outage), the workflow run fails LOUDLY by re-raise — exactly
|
|
||||||
the contract `feedback_api_helper_must_raise_not_return_dict`
|
|
||||||
enforces. Silent fallthrough would re-introduce the duplicate-issue
|
|
||||||
regression class.
|
|
||||||
- Exit non-zero on RED. The issue IS the alarm; failing the watchdog
|
|
||||||
on red would double-page (red workflow + open issue) and create
|
|
||||||
silent-loop risk if the watchdog itself flakes.
|
|
||||||
|
|
||||||
Idempotency strategy:
|
|
||||||
Title is keyed on `{SHA[:10]}` (commit-scoped), NOT just `main`.
|
|
||||||
Rationale:
|
|
||||||
- A fix-forward changes HEAD → next cron tick sees a new SHA;
|
|
||||||
auto-close logic closes the prior `[main-red] OLD_SHA` issue and
|
|
||||||
(if the new HEAD is also red, e.g. a different test fails) files
|
|
||||||
a fresh `[main-red] NEW_SHA`. Lineage is preserved.
|
|
||||||
- A revert that happens to land back on a previously-red SHA
|
|
||||||
(rare) would refer to a CLOSED issue; the watchdog never reopens.
|
|
||||||
That's a deliberate trade-off — the operator will see the latest
|
|
||||||
open issue's `closed` event in the activity feed.
|
|
||||||
|
|
||||||
This module is import-safe: tests import individual functions without
|
|
||||||
invoking main(), so module-level reads use env-with-default and the
|
|
||||||
runtime contract enforcement lives in `_require_runtime_env()`.
|
|
||||||
|
|
||||||
Run locally (dry-run, no API mutation):
|
|
||||||
GITEA_TOKEN=... GITEA_HOST=git.moleculesai.app REPO=owner/repo \\
|
|
||||||
WATCH_BRANCH=main RED_LABEL=tier:high \\
|
|
||||||
python3 .gitea/scripts/main-red-watchdog.py --dry-run
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import urllib.error
|
|
||||||
import urllib.parse
|
|
||||||
import urllib.request
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Environment
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def _env(key: str, *, default: str = "") -> str:
|
|
||||||
"""Read an env var with a default. Module-import-safe — tests can
|
|
||||||
import this script without setting the full env contract."""
|
|
||||||
return os.environ.get(key, default)
|
|
||||||
|
|
||||||
|
|
||||||
# Env-with-default reads — import-safe (see _env). The hard runtime
# contract lives in _require_runtime_env(), so these may legitimately
# be empty strings at import time (e.g. under test).
GITEA_TOKEN = _env("GITEA_TOKEN")
GITEA_HOST = _env("GITEA_HOST")
REPO = _env("REPO")
WATCH_BRANCH = _env("WATCH_BRANCH", default="main")
RED_LABEL = _env("RED_LABEL", default="tier:high")

# "owner/name" split; padding with "" keeps the unpack safe when REPO
# is empty. NOTE(review): a REPO without a slash yields NAME == "" —
# presumably never happens in CI; _require_runtime_env only checks
# non-emptiness, not the slash. Confirm upstream guarantees the shape.
OWNER, NAME = (REPO.split("/", 1) + [""])[:2] if REPO else ("", "")
# API base URL; empty when GITEA_HOST is unset (import-safe).
API = f"https://{GITEA_HOST}/api/v1" if GITEA_HOST else ""

# Title prefix — kept short and stable so the idempotency search can
# match by exact title without parsing.
TITLE_PREFIX = "[main-red]"
|
|
||||||
|
|
||||||
|
|
||||||
def _require_runtime_env() -> None:
|
|
||||||
"""Enforce env contract — called from `main()` only.
|
|
||||||
|
|
||||||
Tests import individual functions without setting the full env
|
|
||||||
contract. Mirrors the CP `ci-required-drift.py` pattern so the
|
|
||||||
runtime guard is a single chokepoint.
|
|
||||||
"""
|
|
||||||
for key in ("GITEA_TOKEN", "GITEA_HOST", "REPO", "WATCH_BRANCH", "RED_LABEL"):
|
|
||||||
if not os.environ.get(key):
|
|
||||||
sys.stderr.write(f"::error::missing required env var: {key}\n")
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Tiny HTTP helper — raises on non-2xx + on JSON-decode-of-expected-JSON.
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
class ApiError(RuntimeError):
    """A Gitea API call cannot be trusted to have succeeded.

    Raised for any non-2xx HTTP status, and for a 2xx response whose
    body fails to parse as JSON on endpoints documented to return JSON.
    Swallowing this and proceeding risks e.g. creating duplicate
    `[main-red]` issues when a transient 500 hides an existing match.
    Per `feedback_api_helper_must_raise_not_return_dict`: soft-failure
    is opt-in via `expect_json=False`, never the default.
    """
|
|
||||||
|
|
||||||
|
|
||||||
def api(
    method: str,
    path: str,
    *,
    body: dict | None = None,
    query: dict[str, str] | None = None,
    expect_json: bool = True,
) -> tuple[int, Any]:
    """Tiny HTTP helper around urllib.

    Raises ApiError on any non-2xx response, on network-level failure,
    and on JSON-decode failure when `expect_json=True` (the default for
    read-shaped paths). Mirrors the CP ci-required-drift.py contract
    exactly so behaviour is cross-checkable.

    Fix vs. previous revision: `urllib.error.URLError` (DNS failure,
    connection refused, timeout wrapped by urllib) previously escaped
    as a raw exception — only its subclass HTTPError was caught. It is
    now normalized into ApiError so callers have exactly one failure
    type on the error channel, per the helper-raises contract.
    """
    url = f"{API}{path}"
    if query:
        url = f"{url}?{urllib.parse.urlencode(query)}"
    data = None
    headers = {
        "Authorization": f"token {GITEA_TOKEN}",
        "Accept": "application/json",
    }
    if body is not None:
        data = json.dumps(body).encode("utf-8")
        headers["Content-Type"] = "application/json"
    req = urllib.request.Request(url, method=method, data=data, headers=headers)
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read()
            status = resp.status
    except urllib.error.HTTPError as e:
        # HTTP-level error still carries a status + body; fall through
        # to the non-2xx check below for a uniform error message.
        raw = e.read()
        status = e.code
    except urllib.error.URLError as e:
        # Network-level failure (DNS, refused connection, timeout).
        # Must be caught AFTER HTTPError (its subclass). Normalize to
        # ApiError so the call fails loudly on the documented channel.
        raise ApiError(f"{method} {path} → network error: {e.reason}") from e

    if not (200 <= status < 300):
        snippet = raw[:500].decode("utf-8", errors="replace") if raw else ""
        raise ApiError(f"{method} {path} → HTTP {status}: {snippet}")

    if not raw:
        # Empty 2xx body (e.g. 204) — nothing to decode.
        return status, None
    try:
        return status, json.loads(raw)
    except json.JSONDecodeError as e:
        if expect_json:
            raise ApiError(
                f"{method} {path} → HTTP {status} but body is not JSON: {e}"
            ) from e
        # Opt-in raw fallthrough for endpoints with known echo-quirks
        # (`feedback_gitea_create_api_unparseable_response`). Caller
        # MUST verify success via a follow-up GET, not by trusting body.
        return status, {"_raw": raw.decode("utf-8", errors="replace")}
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Gitea reads
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def get_head_sha(branch: str) -> str:
    """Resolve the current HEAD commit SHA of ``branch``.

    Validates the response shape step by step; raises ApiError on a
    non-2xx response (via api()) or on any unexpected body shape.
    """
    _, payload = api("GET", f"/repos/{OWNER}/{NAME}/branches/{branch}")
    if not isinstance(payload, dict):
        raise ApiError(f"branch {branch} response not a JSON object")
    head = payload.get("commit")
    if not isinstance(head, dict):
        raise ApiError(f"branch {branch} response missing `commit` object")
    candidate = head.get("id") or head.get("sha")
    if isinstance(candidate, str) and len(candidate) >= 7:
        return candidate
    raise ApiError(f"branch {branch} response has no usable commit SHA")
|
|
||||||
|
|
||||||
|
|
||||||
def get_combined_status(sha: str) -> dict:
    """Fetch the combined commit status for ``sha``.

    Gitea response shape::

        {
          "state": "success" | "failure" | "pending" | "error",
          "statuses": [
            {"context": "...", "state": "success|failure|pending|error",
             "target_url": "...", "description": "..."},
            ...
          ],
          ...
        }

    Raises ApiError on non-2xx (via api()) or a non-object body.
    """
    _, payload = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
    if isinstance(payload, dict):
        return payload
    raise ApiError(f"status for {sha} response not a JSON object")
|
|
||||||
|
|
||||||
|
|
||||||
def is_red(status: dict) -> tuple[bool, list[dict]]:
    """Classify a combined-status document as red or green.

    A commit is "red" when the combined state is `failure` OR any
    per-context entry is in {`failure`, `error`}. `pending` and
    `success` never trip the watchdog — pending means CI is still
    running, the normal state immediately after a merge.

    Returns (is_red, failed_statuses), where failed_statuses holds the
    per-context entries whose own state is red (used for the issue body).
    """
    red_states = {"failure", "error"}

    # Schema asymmetry: top-level combined uses `state`, but per-entry
    # items in `statuses[]` use `status` in Gitea 1.22.6. Prefer
    # `status`; fall back to `state` defensively. Verified empirically
    # 2026-05-12 03:42Z. Pre-rev4 code only read `state` from per-entry
    # items → failed[] always empty → render_body always showed the
    # "no per-context entries were in a red state" fallback even when
    # the combined-state correctly flagged red. See
    # `feedback_smoke_test_vendor_truth_not_shape_match`.
    def _red_entry(entry: dict) -> bool:
        return (entry.get("status") or entry.get("state") or "") in red_states

    entries = status.get("statuses") or []
    failed = [e for e in entries if isinstance(e, dict) and _red_entry(e)]
    return (status.get("state") in red_states or bool(failed), failed)
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Issue file / update / close
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def title_for(sha: str) -> str:
    """Build the commit-scoped idempotency key:
    ``[main-red] {repo}: {SHA[:10]}``.

    Commit-scoped on purpose — a fix-forward to a new SHA produces a
    new title; the prior issue auto-closes via
    ``close_open_red_issues_for_other_shas``.
    """
    short_sha = sha[:10]
    return f"{TITLE_PREFIX} {REPO}: {short_sha}"
|
|
||||||
|
|
||||||
|
|
||||||
def list_open_red_issues() -> list[dict]:
    """All open issues whose title starts with `[main-red] {repo}: `.

    Per Five-Axis review on CP#112
    (`feedback_api_helper_must_raise_not_return_dict`): api() raises on
    non-2xx; we let it propagate. Returning [] on a transient 500 would
    cause auto-close to skip the cleanup AND the file-or-update path to
    POST a duplicate — exactly the regression class the helper-raises
    contract closes.

    Gitea issue search returns at most 50/page; we only need open
    `[main-red]` issues which are by design ≤ 1 at any time per repo,
    so a single page is enough.
    """
    _, page = api(
        "GET",
        f"/repos/{OWNER}/{NAME}/issues",
        query={"state": "open", "type": "issues", "limit": "50"},
    )
    if not isinstance(page, list):
        raise ApiError(
            f"issue search returned non-list body (got {type(page).__name__})"
        )
    wanted_prefix = f"{TITLE_PREFIX} {REPO}: "
    matches: list[dict] = []
    for item in page:
        if not isinstance(item, dict):
            continue
        title = item.get("title")
        if isinstance(title, str) and title.startswith(wanted_prefix):
            matches.append(item)
    return matches
|
|
||||||
|
|
||||||
|
|
||||||
def find_open_issue_for_sha(sha: str) -> dict | None:
    """Return the existing open `[main-red] {repo}: {SHA[:10]}` issue,
    or None if no such issue is open.

    `None` means "search succeeded, no match" — NOT "search failed".
    api() raises ApiError on any non-2xx; the caller can let that
    propagate so a transient outage fails loudly instead of silently
    duplicating.
    """
    wanted = title_for(sha)
    return next(
        (issue for issue in list_open_red_issues() if issue.get("title") == wanted),
        None,
    )
|
|
||||||
|
|
||||||
|
|
||||||
def render_body(sha: str, failed: list[dict], debug: dict) -> str:
    """Issue body. Markdown. Mirrors CP#112's render_body shape.

    Args:
        sha: full commit SHA the red state was observed at.
        failed: per-context status entries in a red state (from is_red);
            may be empty when only the combined state was red.
        debug: JSON-serializable triage context, embedded verbatim in a
            fenced block at the bottom of the body.
    """
    lines = [
        f"# Main is RED on `{REPO}` at `{sha[:10]}`",
        "",
        f"Commit: <https://{GITEA_HOST}/{REPO}/commit/{sha}>",
        "",
        "Auto-filed by `.gitea/workflows/main-red-watchdog.yml` (Option C "
        "of the [main-never-red directive]"
        f"(https://{GITEA_HOST}/molecule-ai/molecule-core/issues/420)). "
        "Per `feedback_no_such_thing_as_flakes` + "
        "`feedback_fix_root_not_symptom`: investigate the root cause; do "
        "NOT revert as a reflex. The watchdog itself never reverts.",
        "",
        "## Failed status contexts",
        "",
    ]
    if not failed:
        # Combined-state-only red: no per-context entry to link to.
        lines.append(
            "_(Combined state reported `failure`/`error` but no per-context "
            "entries were in a red state. This usually means a CI emitter "
            "set combined-status directly without a per-context status. "
            "Check the most recent workflow run for `main` and trace from "
            "there.)_"
        )
    else:
        for s in failed:
            ctx = s.get("context", "(no context)")
            # Per-entry key is `status` in Gitea 1.22.6, not `state`
            # (see _entry_state in is_red). Fallback for forward-compat.
            state = s.get("status") or s.get("state") or "(no state)"
            url = s.get("target_url") or ""
            desc = (s.get("description") or "").strip()
            entry = f"- **{ctx}** — `{state}`"
            if url:
                entry += f" → [logs]({url})"
            if desc:
                entry += f"\n  - {desc}"
            lines.append(entry)
    lines.extend([
        "",
        "## Resolution path",
        "",
        "1. Read the failed logs (links above).",
        "2. If reproducible locally, fix forward in a PR targeting `main`.",
        "3. If the failure is a real flake — STOP. Per "
        "`feedback_no_such_thing_as_flakes`, intermittent failures are "
        "real bugs. Investigate to root cause; do not mark as flake.",
        "4. If the failure is blocking unrelated work for >1 hour, file a "
        "follow-up issue and assign someone. Do NOT revert without a "
        "human GO per `feedback_prod_apply_needs_hongming_chat_go` "
        "(branch protection is a prod surface).",
        "",
        "## Debug",
        "",
        "```json",
        json.dumps(debug, indent=2, sort_keys=True),
        "```",
        "",
        "_This issue is idempotent: the watchdog runs hourly at `:05` "
        "and edits this body in place. When `main` returns to green, the "
        "watchdog will close this issue automatically with a "
        "\"main returned to green\" comment._",
    ])
    return "\n".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
def emit_loki_event(event_type: str, sha: str, failed_contexts: list[str]) -> None:
    """Emit one machine-readable alert line for the obs stack.

    Always prints a JSON line to stdout so the workflow log captures it
    (Loki query once ingested:
    `{source="gitea-actions"} |~ "main_red_detected"`), then — as
    belt-and-braces for hosts that DO run Vector (e.g. a future
    operator-host cron path per `reference_obs_stack_phase1`) — logs
    the same line via syslog tag `main-red-watchdog` when `logger` is
    on PATH. The Gitea Actions Ubuntu runner has util-linux
    preinstalled; on hosts without it (local dev macOS) the syslog leg
    is simply skipped. Best-effort throughout: a missing or failing
    `logger` never fails the run.
    """
    event = {
        "event_type": event_type,
        "repo": REPO,
        "sha": sha,
        "failed_contexts": failed_contexts,
    }
    line = json.dumps(event, sort_keys=True)
    # stdout first — the workflow log itself is the primary, always-on
    # machine-readable channel.
    print(f"main-red-watchdog event: {line}")
    if not shutil.which("logger"):
        return
    try:
        subprocess.run(
            ["logger", "-t", "main-red-watchdog", line],
            check=False,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        sys.stderr.write(f"::warning::logger call failed: {exc}\n")
|
|
||||||
|
|
||||||
|
|
||||||
def file_or_update_red(
    sha: str,
    failed: list[dict],
    debug: dict,
    *,
    dry_run: bool = False,
) -> None:
    """Open a new `[main-red] {repo}: {SHA[:10]}` issue, or PATCH the
    existing one's body. Idempotent by title.

    Args:
        sha: full commit SHA the red state was observed at.
        failed: red per-context status entries (from is_red).
        debug: JSON-serializable triage context for the issue body.
        dry_run: when True, print the would-be title/body and perform
            no API mutation at all.

    Raises:
        ApiError: propagated from api() on non-2xx for the find/PATCH/
            POST-issue steps; the label-apply tail is best-effort only.
    """
    title = title_for(sha)
    body = render_body(sha, failed, debug)

    if dry_run:
        print(f"::notice::[dry-run] would file/update main-red issue for {sha[:10]}")
        print("::group::[dry-run] title")
        print(title)
        print("::endgroup::")
        print("::group::[dry-run] body")
        print(body)
        print("::endgroup::")
        return

    # Idempotency: PATCH in place when an issue for this SHA is open.
    existing = find_open_issue_for_sha(sha)
    if existing:
        num = existing["number"]
        api("PATCH", f"/repos/{OWNER}/{NAME}/issues/{num}", body={"body": body})
        print(f"::notice::Updated existing main-red issue #{num} for {sha[:10]}")
        return

    _, created = api(
        "POST",
        f"/repos/{OWNER}/{NAME}/issues",
        body={"title": title, "body": body, "labels": []},
    )
    if not isinstance(created, dict):
        raise ApiError("POST issue response not a JSON object")
    new_num = created.get("number")
    print(f"::warning::Filed new main-red issue #{new_num} for {sha[:10]}")

    # Apply RED_LABEL by id. Gitea's add-labels endpoint takes IDs, not
    # names (`feedback_gitea_label_delete_by_id` — same rule for add).
    # Best-effort: label failure is logged but does not fail the run.
    try:
        _, labels = api("GET", f"/repos/{OWNER}/{NAME}/labels")
    except ApiError as e:
        sys.stderr.write(f"::warning::could not list labels: {e}\n")
        return
    label_id = None
    if isinstance(labels, list):
        for lbl in labels:
            if isinstance(lbl, dict) and lbl.get("name") == RED_LABEL:
                label_id = lbl.get("id")
                break
    if label_id is not None and new_num:
        try:
            api(
                "POST",
                f"/repos/{OWNER}/{NAME}/issues/{new_num}/labels",
                body={"labels": [label_id]},
            )
        except ApiError as e:
            sys.stderr.write(
                f"::warning::could not apply label '{RED_LABEL}' to #{new_num}: {e}\n"
            )
    else:
        # NOTE(review): this branch also fires when the label WAS found
        # but `new_num` is falsy, so the "not found" message can be
        # slightly misleading in that edge — confirm whether a missing
        # issue number deserves its own warning text.
        sys.stderr.write(f"::warning::label '{RED_LABEL}' not found on repo\n")
|
|
||||||
|
|
||||||
|
|
||||||
def close_open_red_issues_for_other_shas(
    current_sha: str,
    *,
    dry_run: bool = False,
) -> int:
    """When main is green at current_sha, close any open `[main-red]`
    issues whose title references a different SHA. Returns the number
    of issues closed.

    Args:
        current_sha: HEAD SHA at which the branch was observed green.
        dry_run: when True, only print which issues would be closed;
            no API mutation (the counter still increments).

    Lineage note: we only close issues whose title prefix matches; if
    a human renamed the issue or added a suffix this won't touch it.
    That's intentional — manual editorial state takes precedence.
    """
    target_title = title_for(current_sha)
    open_red = list_open_red_issues()
    closed = 0
    for issue in open_red:
        if issue.get("title") == target_title:
            # Same SHA — caller should not have invoked this if main is
            # green. Skip defensively.
            continue
        num = issue.get("number")
        if not isinstance(num, int):
            # Malformed search result entry; skip rather than guess.
            continue
        comment = (
            f"`main` returned to green at SHA `{current_sha}` "
            f"(<https://{GITEA_HOST}/{REPO}/commit/{current_sha}>). "
            "Closing automatically. If the underlying root cause is "
            "not yet understood, reopen this issue and file a "
            "postmortem — green-by-flake is still a bug per "
            "`feedback_no_such_thing_as_flakes`."
        )
        if dry_run:
            print(f"::notice::[dry-run] would close issue #{num} ({issue.get('title')})")
            closed += 1
            continue
        # Comment first, then close. Order matters: a closed issue can
        # still receive comments, but the activity-feed ordering reads
        # better with the explanation arriving just before the close.
        api(
            "POST",
            f"/repos/{OWNER}/{NAME}/issues/{num}/comments",
            body={"body": comment},
        )
        api(
            "PATCH",
            f"/repos/{OWNER}/{NAME}/issues/{num}",
            body={"state": "closed"},
        )
        print(f"::notice::Closed main-red issue #{num} (green at {current_sha[:10]})")
        closed += 1
    return closed
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Main
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
|
||||||
p = argparse.ArgumentParser(
|
|
||||||
prog="main-red-watchdog",
|
|
||||||
description="Detect post-merge CI red on the watched branch and "
|
|
||||||
"file an idempotent issue. Option C of the main-never-red directive.",
|
|
||||||
)
|
|
||||||
p.add_argument(
|
|
||||||
"--dry-run",
|
|
||||||
action="store_true",
|
|
||||||
help="Detect + print the would-be issue title/body to stdout; do "
|
|
||||||
"NOT POST/PATCH/close any issues. Useful for local testing.",
|
|
||||||
)
|
|
||||||
return p.parse_args(argv)
|
|
||||||
|
|
||||||
|
|
||||||
def run_once(*, dry_run: bool = False) -> int:
|
|
||||||
"""One watchdog tick. Returns 0 on green or red-issue-filed; lets
|
|
||||||
ApiError propagate on transient outage (workflow run fails loudly,
|
|
||||||
which is correct per the helper-raises contract)."""
|
|
||||||
sha = get_head_sha(WATCH_BRANCH)
|
|
||||||
status = get_combined_status(sha)
|
|
||||||
red, failed = is_red(status)
|
|
||||||
|
|
||||||
debug = {
|
|
||||||
"branch": WATCH_BRANCH,
|
|
||||||
"sha": sha,
|
|
||||||
"combined_state": status.get("state"),
|
|
||||||
"failed_contexts": [s.get("context") for s in failed],
|
|
||||||
"all_contexts": [
|
|
||||||
# Per-entry key is `status` in Gitea 1.22.6, not `state`.
|
|
||||||
# Pre-rev4 debug output reported `state: None` for every
|
|
||||||
# context, making run logs useless for triage.
|
|
||||||
{"context": s.get("context"),
|
|
||||||
"state": s.get("status") or s.get("state")}
|
|
||||||
for s in (status.get("statuses") or [])
|
|
||||||
if isinstance(s, dict)
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
if red:
|
|
||||||
failed_ctxs = [s.get("context") for s in failed if s.get("context")]
|
|
||||||
emit_loki_event("main_red_detected", sha, failed_ctxs)
|
|
||||||
print(f"::warning::main is RED at {sha[:10]} on {WATCH_BRANCH}: "
|
|
||||||
f"{len(failed)} failed context(s)")
|
|
||||||
file_or_update_red(sha, failed, debug, dry_run=dry_run)
|
|
||||||
else:
|
|
||||||
# Green (or pending — pending is treated as not-red so we don't
|
|
||||||
# spam during the post-merge CI window). Close any stale issues
|
|
||||||
# from earlier SHAs only when we're actually green; pending
|
|
||||||
# means CI hasn't finished and the prior issue might still be
|
|
||||||
# accurate.
|
|
||||||
if status.get("state") == "success":
|
|
||||||
closed = close_open_red_issues_for_other_shas(sha, dry_run=dry_run)
|
|
||||||
if closed:
|
|
||||||
emit_loki_event(
|
|
||||||
"main_returned_to_green", sha,
|
|
||||||
[],
|
|
||||||
)
|
|
||||||
print(f"::notice::main is GREEN at {sha[:10]} on {WATCH_BRANCH} "
|
|
||||||
f"(closed {closed} stale issue(s))")
|
|
||||||
else:
|
|
||||||
print(f"::notice::main is PENDING at {sha[:10]} on {WATCH_BRANCH} "
|
|
||||||
f"(combined state={status.get('state')!r}; no action)")
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
def main(argv: list[str] | None = None) -> int:
|
|
||||||
args = _parse_args(argv)
|
|
||||||
_require_runtime_env()
|
|
||||||
return run_once(dry_run=args.dry_run)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
sys.exit(main())
|
|
||||||
@ -1,42 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""Extract changed-file list from a Gitea push event's commits JSON array.
|
|
||||||
|
|
||||||
Each commit in a push event has `added`, `removed`, and `modified` file lists.
|
|
||||||
This script aggregates all of them and prints unique filenames one per line.
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
push-commits-diff-files.py < COMMITS_JSON
|
|
||||||
|
|
||||||
Exits 0 always (caller handles empty output as "no files").
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
|
||||||
try:
|
|
||||||
data = json.load(sys.stdin)
|
|
||||||
except Exception:
|
|
||||||
sys.exit(0) # Don't fail the step — treat malformed JSON as empty
|
|
||||||
|
|
||||||
if not isinstance(data, list):
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
files: set[str] = set()
|
|
||||||
for commit in data:
|
|
||||||
if not isinstance(commit, dict):
|
|
||||||
continue
|
|
||||||
for key in ("added", "removed", "modified"):
|
|
||||||
for f in commit.get(key) or []:
|
|
||||||
if isinstance(f, str) and f:
|
|
||||||
files.add(f)
|
|
||||||
|
|
||||||
if files:
|
|
||||||
sys.stdout.write("\n".join(sorted(files)))
|
|
||||||
sys.stdout.write("\n")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@ -1,203 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# review-check — evaluate whether a PR satisfies a single team-review gate.
|
|
||||||
#
|
|
||||||
# RFC#324 Step 1 of 5 — qa-review + security-review check workflows.
|
|
||||||
#
|
|
||||||
# This is the shared evaluator invoked by:
|
|
||||||
# .gitea/workflows/qa-review.yml (TEAM=qa, TEAM_ID=20)
|
|
||||||
# .gitea/workflows/security-review.yml (TEAM=security, TEAM_ID=21)
|
|
||||||
#
|
|
||||||
# Pass condition (per RFC#324 v1.1 addendum):
|
|
||||||
# ≥ 1 review on the PR where:
|
|
||||||
# • state == APPROVED
|
|
||||||
# • review.dismissed == false
|
|
||||||
# • review.user.login != PR.user.login (non-author)
|
|
||||||
# • review.user.login ∈ team-members
|
|
||||||
#
|
|
||||||
# Strict mode (default OFF for v1; see RFC trade-off note):
|
|
||||||
# If REVIEW_CHECK_STRICT=1, additionally require review.commit_id == PR.head.sha.
|
|
||||||
# With dismiss_stale_reviews: true at the protection layer, stale reviews
|
|
||||||
# are already dismissed, so the additional commit_id check is belt-and-
|
|
||||||
# suspenders. Keeping it off in v1 simplifies semantics; flip in a follow-up
|
|
||||||
# PR if reviewer telemetry shows residual stale-APPROVE merges.
|
|
||||||
#
|
|
||||||
# Privilege gate (RFC#324 v1.3 §A1.1 — INFORMATIONAL ONLY):
|
|
||||||
# The /qa-recheck and /security-recheck slash-commands can be triggered
|
|
||||||
# by anyone who can comment on the PR. The workflow's privilege step
|
|
||||||
# logs collaborator-status but does NOT gate execution of this script.
|
|
||||||
# Why this is safe: this evaluator is read-only and idempotent —
|
|
||||||
# reading `pulls/{N}/reviews` and `teams/{id}/members/{u}` can't be
|
|
||||||
# influenced by who triggered the run. If a real team-member APPROVE
|
|
||||||
# exists the gate flips green; otherwise it stays red. A
|
|
||||||
# non-collaborator commenting /qa-recheck cannot manufacture a green
|
|
||||||
# gate. Original (v1.2) design with `if:`-gating of this step was
|
|
||||||
# fail-open (skipped-via-`if:` job still publishes the status as
|
|
||||||
# `success`) — corrected in v1.3 per hongming-pc review 1421.
|
|
||||||
#
|
|
||||||
# Trust boundary (RFC A4):
|
|
||||||
# This script is loaded from the BASE branch (sourced via .gitea/scripts/
|
|
||||||
# on the workflow's checkout-of-base). It does NOT execute any PR-HEAD
|
|
||||||
# code. It only reads PR review state via the Gitea API.
|
|
||||||
#
|
|
||||||
# Token scope (RFC A1-α):
|
|
||||||
# The job's own conclusion (exit 0 / exit 1) is what publishes the
|
|
||||||
# `qa-review / approved` / `security-review / approved` status context.
|
|
||||||
# NO `POST /statuses` call here → NO `write:repository` scope on the
|
|
||||||
# token. `read:organization` (for team-membership probe) and
|
|
||||||
# `read:repository` (for PR + reviews) are enough.
|
|
||||||
#
|
|
||||||
# Required env:
|
|
||||||
# GITEA_TOKEN — least-priv read:repository + read:organization. See note
|
|
||||||
# below about the team-membership API requiring the token
|
|
||||||
# owner to be in the queried team (Gitea 1.22.6 quirk).
|
|
||||||
# GITEA_HOST — e.g. git.moleculesai.app
|
|
||||||
# REPO — owner/name (from github.repository)
|
|
||||||
# PR_NUMBER — int (from github.event.pull_request.number or
|
|
||||||
# github.event.issue.number for issue_comment events)
|
|
||||||
# TEAM — short team name (qa | security) for log lines
|
|
||||||
# TEAM_ID — Gitea team id (20=qa, 21=security at time of writing)
|
|
||||||
#
|
|
||||||
# Optional:
|
|
||||||
# REVIEW_CHECK_DEBUG=1 — per-API-call diagnostic lines
|
|
||||||
# REVIEW_CHECK_STRICT=1 — also require review.commit_id == pr.head.sha
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# jq is required for JSON parsing. It is pre-baked into the runner-base
|
|
||||||
# image (per RFC#268 workflow-smoke), so the only reason we'd not find it
|
|
||||||
# is a broken runner. The previous fallback dance (apt-get + curl to
|
|
||||||
# /usr/local/bin/jq) cannot succeed on a uid-1001 rootless runner
|
|
||||||
# (#391/#402 + feedback_ci_runner_install_needs_writable_path), so it's
|
|
||||||
# dropped. Fail loud with a clear diagnostic rather than attempt an
|
|
||||||
# install that physically cannot work.
|
|
||||||
if ! command -v jq >/dev/null 2>&1; then
|
|
||||||
echo "::error::jq missing from runner-base image — bake it into the runner image (see RFC#268 workflow-smoke / feedback_ci_runner_install_needs_writable_path). This evaluator cannot run without jq."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
: "${GITEA_TOKEN:?GITEA_TOKEN required}"
|
|
||||||
: "${GITEA_HOST:?GITEA_HOST required}"
|
|
||||||
: "${REPO:?REPO required (owner/name)}"
|
|
||||||
: "${PR_NUMBER:?PR_NUMBER required}"
|
|
||||||
: "${TEAM:?TEAM required (qa|security)}"
|
|
||||||
: "${TEAM_ID:?TEAM_ID required (integer)}"
|
|
||||||
|
|
||||||
OWNER="${REPO%%/*}"
|
|
||||||
NAME="${REPO##*/}"
|
|
||||||
API="https://${GITEA_HOST}/api/v1"
|
|
||||||
|
|
||||||
# Token-in-argv fix (#541): write the Authorization header to a mode-600
|
|
||||||
# temp file instead of passing it via curl -H "$AUTH" (which puts the
|
|
||||||
# secret token value in the process table for any process to read via
|
|
||||||
# /proc/<pid>/cmdline or ps -ef). The curl config file is read by curl
|
|
||||||
# itself and never appears in the argv of the curl subprocess.
|
|
||||||
CURL_AUTH_FILE=$(mktemp -p /tmp curl-auth.XXXXXX)
|
|
||||||
chmod 600 "$CURL_AUTH_FILE"
|
|
||||||
printf 'header = "Authorization: token %s"\n' "$GITEA_TOKEN" > "$CURL_AUTH_FILE"
|
|
||||||
|
|
||||||
# Pre-create temp files so cleanup trap can reference them by name
|
|
||||||
# (bash trap 'function' EXIT expands variables at trap-fire time, not def time).
|
|
||||||
PR_JSON=$(mktemp)
|
|
||||||
REVIEWS_JSON=$(mktemp)
|
|
||||||
TEAM_PROBE_TMP=$(mktemp)
|
|
||||||
|
|
||||||
cleanup() {
|
|
||||||
rm -f "$CURL_AUTH_FILE" "$PR_JSON" "$REVIEWS_JSON" "$TEAM_PROBE_TMP"
|
|
||||||
}
|
|
||||||
trap cleanup EXIT
|
|
||||||
|
|
||||||
debug() {
|
|
||||||
if [ "${REVIEW_CHECK_DEBUG:-}" = "1" ]; then
|
|
||||||
echo " [debug] $*" >&2
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
echo "::notice::${TEAM}-review evaluating repo=${OWNER}/${NAME} pr=${PR_NUMBER} team_id=${TEAM_ID}"
|
|
||||||
|
|
||||||
# --- Fetch the PR (for author + head.sha) ---
|
|
||||||
HTTP_CODE=$(curl -sS -o "$PR_JSON" -w '%{http_code}' \
|
|
||||||
-K "$CURL_AUTH_FILE" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}")
|
|
||||||
if [ "$HTTP_CODE" != "200" ]; then
|
|
||||||
echo "::error::GET /pulls/${PR_NUMBER} returned HTTP ${HTTP_CODE} (token scope?)"
|
|
||||||
cat "$PR_JSON" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
PR_AUTHOR=$(jq -r '.user.login // ""' "$PR_JSON")
|
|
||||||
PR_HEAD_SHA=$(jq -r '.head.sha // ""' "$PR_JSON")
|
|
||||||
PR_STATE=$(jq -r '.state // ""' "$PR_JSON")
|
|
||||||
debug "pr_author=${PR_AUTHOR} pr_head=${PR_HEAD_SHA:0:7} pr_state=${PR_STATE}"
|
|
||||||
|
|
||||||
if [ "$PR_STATE" != "open" ]; then
|
|
||||||
echo "::notice::PR ${PR_NUMBER} is ${PR_STATE} — exiting 0 (closed PRs do not gate)"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
if [ -z "$PR_AUTHOR" ] || [ -z "$PR_HEAD_SHA" ]; then
|
|
||||||
echo "::error::PR ${PR_NUMBER} missing user.login or head.sha — webhook payload malformed"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# --- Fetch all reviews on the PR ---
|
|
||||||
HTTP_CODE=$(curl -sS -o "$REVIEWS_JSON" -w '%{http_code}' \
|
|
||||||
-K "$CURL_AUTH_FILE" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}/reviews")
|
|
||||||
if [ "$HTTP_CODE" != "200" ]; then
|
|
||||||
echo "::error::GET /pulls/${PR_NUMBER}/reviews returned HTTP ${HTTP_CODE}"
|
|
||||||
cat "$REVIEWS_JSON" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Filter: state=APPROVED, not-dismissed, non-author. Optionally strict-mode
|
|
||||||
# adds commit_id==head.sha (off by default; see header).
|
|
||||||
JQ_FILTER='.[]
|
|
||||||
| select(.state == "APPROVED")
|
|
||||||
| select(.dismissed != true)
|
|
||||||
| select(.user.login != $author)'
|
|
||||||
if [ "${REVIEW_CHECK_STRICT:-}" = "1" ]; then
|
|
||||||
JQ_FILTER="${JQ_FILTER}
|
|
||||||
| select(.commit_id == \$head)"
|
|
||||||
fi
|
|
||||||
JQ_FILTER="${JQ_FILTER}
|
|
||||||
| .user.login"
|
|
||||||
|
|
||||||
CANDIDATES=$(jq -r --arg author "$PR_AUTHOR" --arg head "$PR_HEAD_SHA" "$JQ_FILTER" "$REVIEWS_JSON" | sort -u)
|
|
||||||
debug "candidate non-author approvers: $(echo "$CANDIDATES" | tr '\n' ' ')"
|
|
||||||
|
|
||||||
if [ -z "$CANDIDATES" ]; then
|
|
||||||
echo "::error::${TEAM}-review awaiting non-author APPROVE from ${TEAM} team (no candidates yet)"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# --- Probe team membership per candidate ---
|
|
||||||
# Endpoint: GET /api/v1/teams/{id}/members/{username}
|
|
||||||
# 200/204 → is member
|
|
||||||
# 403 → token owner is not in this team (Gitea 1.22.6 'Must be a team
|
|
||||||
# member' constraint — see follow-up issue for token-provisioning)
|
|
||||||
# 404 → not a member
|
|
||||||
for U in $CANDIDATES; do
|
|
||||||
CODE=$(curl -sS -o "$TEAM_PROBE_TMP" -w '%{http_code}' \
|
|
||||||
-K "$CURL_AUTH_FILE" "${API}/teams/${TEAM_ID}/members/${U}")
|
|
||||||
debug "probe ${U} in team ${TEAM} (id=${TEAM_ID}) → HTTP ${CODE}"
|
|
||||||
case "$CODE" in
|
|
||||||
200|204)
|
|
||||||
echo "::notice::${TEAM}-review APPROVED by ${U} (team=${TEAM})"
|
|
||||||
exit 0
|
|
||||||
;;
|
|
||||||
403)
|
|
||||||
# Token owner is not in the team being probed; the API refuses to
|
|
||||||
# confirm membership. This is the RFC#324 follow-up token-scope gap.
|
|
||||||
# Fail closed — never grant approval on a 403; surface clearly.
|
|
||||||
echo "::error::team-probe for ${U} in ${TEAM} returned 403 (token owner not in ${TEAM} team — RFC#324 token-scope follow-up). Cannot confirm membership; failing closed."
|
|
||||||
cat "$TEAM_PROBE_TMP" >&2
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
404)
|
|
||||||
debug "${U} not a member of ${TEAM}"
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "::warning::team-probe for ${U} in ${TEAM} returned unexpected HTTP ${CODE}"
|
|
||||||
cat "$TEAM_PROBE_TMP" >&2
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "::error::${TEAM}-review awaiting non-author APPROVE from ${TEAM} team (candidates: $(echo "$CANDIDATES" | tr '\n' ',' | sed 's/,$//') — none are in team)"
|
|
||||||
exit 1
|
|
||||||
@ -1,823 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# sop-checklist-gate — evaluate whether a PR has peer-acked each
|
|
||||||
# SOP-checklist item. Posts a commit-status that branch protection
|
|
||||||
# can require.
|
|
||||||
#
|
|
||||||
# RFC#351 Step 2 of 6 (implementation MVP).
|
|
||||||
#
|
|
||||||
# Invoked by .gitea/workflows/sop-checklist-gate.yml on:
|
|
||||||
# - pull_request_target: [opened, edited, synchronize, reopened]
|
|
||||||
# - issue_comment: [created, edited, deleted]
|
|
||||||
#
|
|
||||||
# Flow:
|
|
||||||
# 1. Load .gitea/sop-checklist-config.yaml (from BASE ref — trusted).
|
|
||||||
# 2. GET /repos/{R}/pulls/{N} — author, head.sha, tier label
|
|
||||||
# 3. GET /repos/{R}/issues/{N}/comments — extract /sop-ack and /sop-revoke
|
|
||||||
# 4. For each checklist item:
|
|
||||||
# a. Is the section marker present in PR body? (author answered)
|
|
||||||
# b. Is there ≥1 unrevoked /sop-ack from a non-author whose
|
|
||||||
# team-membership matches required_teams?
|
|
||||||
# 5. POST /repos/{R}/statuses/{sha} — context
|
|
||||||
# `sop-checklist / all-items-acked (pull_request)`,
|
|
||||||
# state=success | failure | pending, description=`acked: N/M …`.
|
|
||||||
#
|
|
||||||
# Trust boundary (mirrors RFC#324 §A4):
|
|
||||||
# This script is loaded from the BASE branch. The workflow's
|
|
||||||
# actions/checkout step pins ref=base.sha. PR-HEAD code is never
|
|
||||||
# executed. We only HTTP-call the Gitea API.
|
|
||||||
#
|
|
||||||
# Token scope:
|
|
||||||
# - read:repository / read:organization to enumerate PR + comments
|
|
||||||
# + team membership (Gitea 1.22.6 quirk: team-membership endpoint
|
|
||||||
# returns 403 if token owner is not in the team; see review-check.sh
|
|
||||||
# for the same gotcha — we surface the same fail-closed message).
|
|
||||||
# - write:repository for `POST /repos/{R}/statuses/{sha}`. Unlike
|
|
||||||
# RFC#324's pattern (which uses the JOB's own pass/fail as the
|
|
||||||
# status), we POST the status explicitly because the gate posts
|
|
||||||
# a single multi-item status with a richer description than a
|
|
||||||
# bare success/failure context can carry.
|
|
||||||
#
|
|
||||||
# Slug normalization rules (canonical form: kebab-case):
|
|
||||||
# - Lowercase
|
|
||||||
# - Whitespace + underscores → single dash
|
|
||||||
# - Strip non [a-z0-9-] characters
|
|
||||||
# - Collapse adjacent dashes
|
|
||||||
# - Strip leading/trailing dashes
|
|
||||||
# - If the result is a digit string (e.g. "1"), look up via
|
|
||||||
# config.items[*].numeric_alias to get the kebab-case slug.
|
|
||||||
#
|
|
||||||
# Examples:
|
|
||||||
# "Comprehensive_Testing" → "comprehensive-testing"
|
|
||||||
# "comprehensive testing" → "comprehensive-testing"
|
|
||||||
# "1" → "comprehensive-testing"
|
|
||||||
# "Five-Axis-Review" → "five-axis-review"
|
|
||||||
#
|
|
||||||
# Revoke semantics:
|
|
||||||
# /sop-revoke <slug> [reason] — most-recent comment per (slug, user)
|
|
||||||
# wins. So if Alice posts /sop-ack X then later /sop-revoke X, her ack
|
|
||||||
# for X is invalidated. Bob's prior /sop-ack X is unaffected. If Alice
|
|
||||||
# posts /sop-revoke X then later /sop-ack X again, the ack is restored.
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import urllib.error
|
|
||||||
import urllib.parse
|
|
||||||
import urllib.request
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Slug normalization
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
_NORMALIZE_REPLACE_RE = re.compile(r"[\s_]+")
|
|
||||||
_NORMALIZE_STRIP_RE = re.compile(r"[^a-z0-9-]")
|
|
||||||
_NORMALIZE_DASH_RE = re.compile(r"-+")
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_slug(raw: str, numeric_aliases: dict[int, str] | None = None) -> str:
|
|
||||||
"""Normalize a user-supplied slug to canonical kebab-case form.
|
|
||||||
|
|
||||||
See module header for the rules.
|
|
||||||
|
|
||||||
If the input is a pure digit string AND numeric_aliases is provided,
|
|
||||||
the alias mapping is consulted. Unknown digits return "" so the caller
|
|
||||||
can flag the comment as unparseable.
|
|
||||||
"""
|
|
||||||
if raw is None:
|
|
||||||
return ""
|
|
||||||
s = raw.strip().lower()
|
|
||||||
s = _NORMALIZE_REPLACE_RE.sub("-", s)
|
|
||||||
s = _NORMALIZE_STRIP_RE.sub("", s)
|
|
||||||
s = _NORMALIZE_DASH_RE.sub("-", s)
|
|
||||||
s = s.strip("-")
|
|
||||||
if s.isdigit() and numeric_aliases is not None:
|
|
||||||
return numeric_aliases.get(int(s), "")
|
|
||||||
return s
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Comment parsing — /sop-ack and /sop-revoke
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# A directive must be on its own line. Permits leading whitespace.
|
|
||||||
# Optional trailing note after the slug for /sop-ack and required reason
|
|
||||||
# for /sop-revoke (RFC#351 open question 4 — reason is captured but not
|
|
||||||
# yet validated; future iteration may require a min-length).
|
|
||||||
_DIRECTIVE_RE = re.compile(
|
|
||||||
r"^[ \t]*/(sop-ack|sop-revoke)[ \t]+([A-Za-z0-9_\- ]+?)(?:[ \t]+(.*))?[ \t]*$",
|
|
||||||
re.MULTILINE,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_directives(
|
|
||||||
comment_body: str,
|
|
||||||
numeric_aliases: dict[int, str],
|
|
||||||
) -> list[tuple[str, str, str]]:
|
|
||||||
"""Extract /sop-ack and /sop-revoke directives from a comment body.
|
|
||||||
|
|
||||||
Returns a list of (kind, canonical_slug, note) tuples where:
|
|
||||||
kind is "sop-ack" or "sop-revoke"
|
|
||||||
canonical_slug is the normalized form (or "" if unparseable)
|
|
||||||
note is the trailing free-text (may be "")
|
|
||||||
"""
|
|
||||||
out: list[tuple[str, str, str]] = []
|
|
||||||
if not comment_body:
|
|
||||||
return out
|
|
||||||
for m in _DIRECTIVE_RE.finditer(comment_body):
|
|
||||||
kind = m.group(1)
|
|
||||||
raw_slug = (m.group(2) or "").strip()
|
|
||||||
# If the raw match included trailing words, the regex non-greedy
|
|
||||||
# captured only the first token; strip again for safety.
|
|
||||||
# We split on whitespace to keep the FIRST word as the slug, and
|
|
||||||
# everything after as the note.
|
|
||||||
parts = raw_slug.split()
|
|
||||||
if not parts:
|
|
||||||
continue
|
|
||||||
first = parts[0]
|
|
||||||
# If the slug-capture greedily matched multiple words (e.g.
|
|
||||||
# "comprehensive testing"), preserve normalize behavior: join
|
|
||||||
# the WHOLE first-word-token only; trailing words get appended to
|
|
||||||
# the note. The regex limits group(2) to [A-Za-z0-9_\- ] so we
|
|
||||||
# may have multi-word forms here — normalize handles them.
|
|
||||||
if len(parts) > 1:
|
|
||||||
# User wrote "/sop-ack comprehensive testing extra-note"
|
|
||||||
# → treat "comprehensive testing" as the slug source if it
|
|
||||||
# normalizes to a known item; otherwise treat "comprehensive"
|
|
||||||
# as slug and "testing extra-note" as note. We defer the
|
|
||||||
# disambiguation to the caller via the returned canonical
|
|
||||||
# slug. For simplicity: try the WHOLE captured string first.
|
|
||||||
canonical = normalize_slug(raw_slug, numeric_aliases)
|
|
||||||
else:
|
|
||||||
canonical = normalize_slug(first, numeric_aliases)
|
|
||||||
note_from_group = (m.group(3) or "").strip()
|
|
||||||
# If we collapsed multi-word slug into kebab and there's a
|
|
||||||
# trailing-text group too, append it.
|
|
||||||
out.append((kind, canonical, note_from_group))
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# PR body section detection
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def section_marker_present(body: str, marker: str) -> bool:
|
|
||||||
"""Return True if `marker` appears in `body` case-insensitively
|
|
||||||
on a non-empty line (i.e. the author actually filled it in).
|
|
||||||
|
|
||||||
We require the marker substring AND non-whitespace content on the
|
|
||||||
same line OR within the next line — this prevents trivially-empty
|
|
||||||
checklists like:
|
|
||||||
|
|
||||||
## SOP-Checklist
|
|
||||||
- [ ] **Comprehensive testing performed**:
|
|
||||||
- [ ] **Local-postgres E2E run**:
|
|
||||||
|
|
||||||
from auto-passing the section-present check. The peer-ack is still
|
|
||||||
required, but answering with empty content is captured as a soft
|
|
||||||
finding via the section-present test alone.
|
|
||||||
"""
|
|
||||||
if not body or not marker:
|
|
||||||
return False
|
|
||||||
body_lower = body.lower()
|
|
||||||
marker_lower = marker.lower()
|
|
||||||
idx = body_lower.find(marker_lower)
|
|
||||||
if idx < 0:
|
|
||||||
return False
|
|
||||||
# Walk to end of line.
|
|
||||||
line_end = body.find("\n", idx)
|
|
||||||
if line_end < 0:
|
|
||||||
line_end = len(body)
|
|
||||||
line = body[idx + len(marker):line_end]
|
|
||||||
# Strip the colon + checkbox tail patterns; require at least one
|
|
||||||
# non-whitespace, non-punctuation char.
|
|
||||||
stripped = re.sub(r"[\s\*:\-\[\]]+", "", line)
|
|
||||||
if stripped:
|
|
||||||
return True
|
|
||||||
# Fall through: check the NEXT line (multi-line answers).
|
|
||||||
next_line_end = body.find("\n", line_end + 1)
|
|
||||||
if next_line_end < 0:
|
|
||||||
next_line_end = len(body)
|
|
||||||
next_line = body[line_end + 1:next_line_end]
|
|
||||||
stripped_next = re.sub(r"[\s\*:\-\[\]]+", "", next_line)
|
|
||||||
return bool(stripped_next)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Ack-state computation
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def compute_ack_state(
|
|
||||||
comments: list[dict[str, Any]],
|
|
||||||
pr_author: str,
|
|
||||||
items_by_slug: dict[str, dict[str, Any]],
|
|
||||||
numeric_aliases: dict[int, str],
|
|
||||||
team_membership_probe: "callable[[str, list[str]], list[str]]",
|
|
||||||
) -> dict[str, dict[str, Any]]:
|
|
||||||
"""Compute per-item ack state.
|
|
||||||
|
|
||||||
Each comment is processed in chronological order. The most-recent
|
|
||||||
directive per (commenter, slug) wins.
|
|
||||||
|
|
||||||
Returns a dict keyed by canonical slug:
|
|
||||||
{
|
|
||||||
"comprehensive-testing": {
|
|
||||||
"ackers": ["bob"], # non-author, team-verified
|
|
||||||
"rejected_ackers": { # debugging info
|
|
||||||
"self_ack": ["alice"],
|
|
||||||
"unknown_slug": [],
|
|
||||||
"not_in_team": ["eve"],
|
|
||||||
}
|
|
||||||
},
|
|
||||||
...
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
# Step 1: collapse directives per (commenter, slug) — most recent wins.
|
|
||||||
# comments are expected to come in chronological order from the
|
|
||||||
# API (Gitea returns oldest-first by default for issues/{N}/comments).
|
|
||||||
latest_directive: dict[tuple[str, str], str] = {} # (user, slug) → kind
|
|
||||||
unparseable_per_user: dict[str, int] = {}
|
|
||||||
for c in comments:
|
|
||||||
body = c.get("body", "") or ""
|
|
||||||
user = (c.get("user") or {}).get("login", "")
|
|
||||||
if not user:
|
|
||||||
continue
|
|
||||||
for kind, slug, _note in parse_directives(body, numeric_aliases):
|
|
||||||
if not slug:
|
|
||||||
unparseable_per_user[user] = unparseable_per_user.get(user, 0) + 1
|
|
||||||
continue
|
|
||||||
latest_directive[(user, slug)] = kind
|
|
||||||
|
|
||||||
# Step 2: build candidate ackers per slug.
|
|
||||||
# Filter out self-acks and unknown slugs.
|
|
||||||
ackers_per_slug: dict[str, list[str]] = {s: [] for s in items_by_slug}
|
|
||||||
rejected_self: dict[str, list[str]] = {s: [] for s in items_by_slug}
|
|
||||||
rejected_unknown: dict[str, list[str]] = {s: [] for s in items_by_slug}
|
|
||||||
pending_team_check: dict[str, list[str]] = {s: [] for s in items_by_slug}
|
|
||||||
|
|
||||||
for (user, slug), kind in latest_directive.items():
|
|
||||||
if kind != "sop-ack":
|
|
||||||
continue # revokes leave the (user,slug) state as "no ack"
|
|
||||||
if slug not in items_by_slug:
|
|
||||||
# Slug normalized to something not in our config — store
|
|
||||||
# under a synthetic key for diagnostic surfacing. Don't add
|
|
||||||
# to any item.
|
|
||||||
continue
|
|
||||||
if user == pr_author:
|
|
||||||
rejected_self[slug].append(user)
|
|
||||||
continue
|
|
||||||
pending_team_check[slug].append(user)
|
|
||||||
|
|
||||||
# Step 3: team membership probe per slug (batched per slug to keep
|
|
||||||
# API call count down — same user may ack multiple items but the
|
|
||||||
# required_teams differ per item, so we MUST probe per (user, item)).
|
|
||||||
rejected_not_in_team: dict[str, list[str]] = {s: [] for s in items_by_slug}
|
|
||||||
for slug, candidates in pending_team_check.items():
|
|
||||||
if not candidates:
|
|
||||||
continue
|
|
||||||
required = items_by_slug[slug]["required_teams"]
|
|
||||||
approved = team_membership_probe(slug, candidates) # returns subset
|
|
||||||
rejected_not_in_team[slug] = [u for u in candidates if u not in approved]
|
|
||||||
ackers_per_slug[slug] = approved
|
|
||||||
# Stash required teams for description rendering.
|
|
||||||
items_by_slug[slug]["_required_resolved"] = required
|
|
||||||
|
|
||||||
return {
|
|
||||||
slug: {
|
|
||||||
"ackers": ackers_per_slug[slug],
|
|
||||||
"rejected": {
|
|
||||||
"self_ack": rejected_self[slug],
|
|
||||||
"not_in_team": rejected_not_in_team[slug],
|
|
||||||
},
|
|
||||||
}
|
|
||||||
for slug in items_by_slug
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Gitea API client
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class GiteaClient:
|
|
||||||
def __init__(self, host: str, token: str):
|
|
||||||
self.base = f"https://{host}/api/v1"
|
|
||||||
self.token = token
|
|
||||||
# Cache team-name → team-id resolutions per org.
|
|
||||||
self._team_id_cache: dict[tuple[str, str], int | None] = {}
|
|
||||||
|
|
||||||
def _req(
|
|
||||||
self,
|
|
||||||
method: str,
|
|
||||||
path: str,
|
|
||||||
body: dict[str, Any] | None = None,
|
|
||||||
ok_codes: tuple[int, ...] = (200, 201, 204),
|
|
||||||
) -> tuple[int, Any]:
|
|
||||||
url = self.base + path
|
|
||||||
data = None
|
|
||||||
headers = {
|
|
||||||
"Authorization": f"token {self.token}",
|
|
||||||
"Accept": "application/json",
|
|
||||||
}
|
|
||||||
if body is not None:
|
|
||||||
data = json.dumps(body).encode("utf-8")
|
|
||||||
headers["Content-Type"] = "application/json"
|
|
||||||
req = urllib.request.Request(url, method=method, data=data, headers=headers)
|
|
||||||
try:
|
|
||||||
with urllib.request.urlopen(req, timeout=20) as r:
|
|
||||||
raw = r.read()
|
|
||||||
code = r.getcode()
|
|
||||||
except urllib.error.HTTPError as e:
|
|
||||||
code = e.code
|
|
||||||
raw = e.read()
|
|
||||||
try:
|
|
||||||
parsed = json.loads(raw.decode("utf-8")) if raw else None
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
parsed = raw.decode("utf-8", errors="replace") if raw else None
|
|
||||||
return code, parsed
|
|
||||||
|
|
||||||
def get_pr(self, owner: str, repo: str, pr: int) -> dict[str, Any]:
|
|
||||||
code, data = self._req("GET", f"/repos/{owner}/{repo}/pulls/{pr}")
|
|
||||||
if code != 200:
|
|
||||||
raise RuntimeError(f"GET pulls/{pr} → HTTP {code}: {data!r}")
|
|
||||||
return data
|
|
||||||
|
|
||||||
def get_issue_comments(
|
|
||||||
self, owner: str, repo: str, issue: int
|
|
||||||
) -> list[dict[str, Any]]:
|
|
||||||
# Paginate. Gitea default page size 50.
|
|
||||||
out: list[dict[str, Any]] = []
|
|
||||||
page = 1
|
|
||||||
while True:
|
|
||||||
code, data = self._req(
|
|
||||||
"GET",
|
|
||||||
f"/repos/{owner}/{repo}/issues/{issue}/comments?limit=50&page={page}",
|
|
||||||
)
|
|
||||||
if code != 200:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"GET issues/{issue}/comments page={page} → HTTP {code}: {data!r}"
|
|
||||||
)
|
|
||||||
if not data:
|
|
||||||
break
|
|
||||||
out.extend(data)
|
|
||||||
if len(data) < 50:
|
|
||||||
break
|
|
||||||
page += 1
|
|
||||||
return out
|
|
||||||
|
|
||||||
def resolve_team_id(self, org: str, team_name: str) -> int | None:
    """Look up a team's numeric id via the org team-search endpoint.

    Results — including misses (None) — are cached per (org, name).
    Handles both response shapes Gitea may return: {"data": [...]} and
    a bare list.
    """
    cache_key = (org, team_name)
    if cache_key in self._team_id_cache:
        return self._team_id_cache[cache_key]
    status, payload = self._req(
        "GET", f"/orgs/{org}/teams/search?q={urllib.parse.quote(team_name)}"
    )
    found = None
    if status == 200:
        if isinstance(payload, dict):
            candidates = payload.get("data", [])
        elif isinstance(payload, list):
            candidates = payload
        else:
            candidates = []
        found = next(
            (t.get("id") for t in candidates if t.get("name") == team_name),
            None,
        )
    self._team_id_cache[cache_key] = found
    return found
|
|
||||||
|
|
||||||
def is_team_member(self, team_id: int, login: str) -> bool | None:
    """Probe team membership for a login.

    Returns True (member), False (404 — not a member), or None when the
    API refuses to answer (typically 403: the token owner is outside the
    team). Callers must treat None as fail-closed.
    """
    status, _ = self._req(
        "GET", f"/teams/{team_id}/members/{urllib.parse.quote(login)}"
    )
    if status == 404:
        return False
    if status in (200, 204):
        return True
    # Anything else (403 in practice) means "unknown" — fail-closed upstream.
    return None
|
|
||||||
|
|
||||||
def post_status(
    self,
    owner: str,
    repo: str,
    sha: str,
    state: str,
    context: str,
    description: str,
    target_url: str = "",
) -> None:
    """POST a commit status for ``sha``; raise unless Gitea answers 200/201.

    The description is clipped to 140 chars up front (Gitea truncates at
    255, but we stay well inside it).
    """
    payload = {
        "state": state,
        "context": context,
        "description": description[:140],
        "target_url": target_url or "",
    }
    status, reply = self._req(
        "POST",
        f"/repos/{owner}/{repo}/statuses/{sha}",
        body=payload,
        ok_codes=(201,),
    )
    if status in (200, 201):
        return
    raise RuntimeError(
        f"POST statuses/{sha} → HTTP {status}: {reply!r}"
    )
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Config loader (PyYAML-free — config file is intentionally tiny + flat)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def load_config(path: str) -> dict[str, Any]:
    """Load .gitea/sop-checklist-config.yaml.

    Prefers PyYAML when importable; otherwise parses with the built-in
    minimal parser, which suffices because the config shape is kept
    deliberately tiny and flat (avoids an apt install on the runner).
    """
    try:
        import yaml  # type: ignore[import-not-found]
    except ImportError:
        return _load_config_minimal(path)
    with open(path) as f:
        return yaml.safe_load(f)
|
|
||||||
|
|
||||||
|
|
||||||
def _load_config_minimal(path: str) -> dict[str, Any]:
    """Read ``path`` and hand its lines to the minimal YAML-subset parser.

    Supported shapes: top-level scalar:value, top-level map-of-map (e.g.
    tier_failure_mode), top-level list of maps (items:), and inside an
    item map scalars + lists of scalars. Not supported: nested lists,
    YAML anchors, multi-document streams, or flow style.
    """
    with open(path) as handle:
        return _parse_minimal_yaml(handle.readlines())
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_minimal_yaml(lines: list[str]) -> dict[str, Any]:  # noqa: C901
    """Hand-rolled subset parser. See _load_config_minimal docstring.

    Two phases: (1) strip comments and blank lines, recording each
    surviving line's indentation; (2) walk the cleaned (indent, text)
    pairs with a cursor ``i``, dispatching each top-level ``key:`` into a
    list block ("- " children), a sub-map block, or an inline value.
    """
    # Strip comments + blank lines but preserve indentation.
    cleaned: list[tuple[int, str]] = []
    for raw in lines:
        # Don't strip a "#" that is inside a quoted value.
        body = raw.rstrip("\n")
        # Remove trailing comment. A "#" only starts a comment at column 0
        # or when preceded by whitespace, so values like "a#b" survive.
        # NOTE(review): a space-preceded "#" inside a quoted value IS still
        # stripped (e.g. key: "a #b") — confirm the config never quotes "#".
        idx = body.find("#")
        if idx >= 0 and (idx == 0 or body[idx - 1] in " \t"):
            body = body[:idx].rstrip()
        if not body.strip():
            continue
        indent = len(body) - len(body.lstrip(" "))
        cleaned.append((indent, body.strip()))

    root: dict[str, Any] = {}
    i = 0
    n = len(cleaned)

    def parse_scalar(s: str) -> Any:
        # Quoted strings keep their inner text; bare true/yes/false/no
        # become bools; integral text becomes int; everything else stays str.
        s = s.strip()
        if s.startswith('"') and s.endswith('"'):
            return s[1:-1]
        if s.startswith("'") and s.endswith("'"):
            return s[1:-1]
        if s.lower() in ("true", "yes"):
            return True
        if s.lower() in ("false", "no"):
            return False
        try:
            return int(s)
        except ValueError:
            pass
        return s

    def parse_inline_list(s: str) -> list[Any]:
        # "[a, b]" → list of scalars; a non-bracketed value becomes a
        # single-element list. No nesting, no quoting of commas.
        s = s.strip()
        if not (s.startswith("[") and s.endswith("]")):
            return [parse_scalar(s)]
        inner = s[1:-1]
        if not inner.strip():
            return []
        return [parse_scalar(x.strip()) for x in inner.split(",")]

    while i < n:
        indent, line = cleaned[i]
        # Only top-level "key: ..." lines drive the outer loop; anything
        # deeper is consumed by the block handlers below.
        if indent != 0:
            i += 1
            continue
        if ":" not in line:
            i += 1
            continue
        key, _, rest = line.partition(":")
        key = key.strip()
        rest = rest.strip()
        if rest == "":
            # Block — could be map or list.
            i += 1
            # Look ahead for first child.
            if i < n and cleaned[i][1].startswith("- "):
                # List of items.
                items: list[Any] = []
                while i < n and cleaned[i][0] > indent and cleaned[i][1].startswith("- "):
                    item_indent = cleaned[i][0]
                    first_kv = cleaned[i][1][2:].strip()  # strip "- "
                    item: dict[str, Any] = {}
                    if ":" in first_kv:
                        k, _, v = first_kv.partition(":")
                        k = k.strip()
                        v = v.strip()
                        if v == "":
                            item[k] = ""
                        elif v.startswith(">-") or v.startswith(">"):
                            # Folded scalar continues on subsequent indented lines
                            collected: list[str] = []
                            i += 1
                            while i < n and cleaned[i][0] > item_indent:
                                collected.append(cleaned[i][1])
                                i += 1
                            item[k] = " ".join(collected)
                            items.append(item)
                            continue
                        elif v.startswith("["):
                            item[k] = parse_inline_list(v)
                        else:
                            item[k] = parse_scalar(v)
                    i += 1
                    # Subsequent k:v lines at deeper indent belong to this item.
                    while i < n and cleaned[i][0] > item_indent and not cleaned[i][1].startswith("- "):
                        sub_indent, sub_line = cleaned[i]
                        if ":" in sub_line:
                            k, _, v = sub_line.partition(":")
                            k = k.strip()
                            v = v.strip()
                            if v == "":
                                item[k] = ""
                                i += 1
                            elif v.startswith(">-") or v.startswith(">"):
                                # Folded scalar inside an item: join all
                                # deeper-indented lines with single spaces.
                                collected = []
                                i += 1
                                while i < n and cleaned[i][0] > sub_indent:
                                    collected.append(cleaned[i][1])
                                    i += 1
                                item[k] = " ".join(collected)
                            elif v.startswith("["):
                                item[k] = parse_inline_list(v)
                                i += 1
                            else:
                                item[k] = parse_scalar(v)
                                i += 1
                        else:
                            # Non-"k: v" line inside an item — skipped.
                            i += 1
                    items.append(item)
                root[key] = items
            else:
                # Sub-map.
                submap: dict[str, Any] = {}
                while i < n and cleaned[i][0] > indent:
                    sub_indent, sub_line = cleaned[i]
                    if ":" in sub_line:
                        k, _, v = sub_line.partition(":")
                        # Sub-map keys may be quoted (e.g. "tier:low":) —
                        # strip surrounding quotes from the key.
                        k = k.strip().strip('"').strip("'")
                        v = v.strip()
                        if v.startswith("[") and v.endswith("]"):
                            submap[k] = parse_inline_list(v)
                        else:
                            submap[k] = parse_scalar(v)
                    i += 1
                root[key] = submap
        else:
            # Inline scalar or list.
            if rest.startswith("[") and rest.endswith("]"):
                root[key] = parse_inline_list(rest)
            else:
                root[key] = parse_scalar(rest)
            i += 1
    return root
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Main entry point
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def render_status(
    items: list[dict[str, Any]],
    ack_state: dict[str, dict[str, Any]],
    body_state: dict[str, bool],
) -> tuple[str, str]:
    """Build the (state, description) pair for the commit-status POST.

    "success" requires every checklist item to carry at least one valid
    peer-ack; body-section presence is advisory only. The soft-fail
    "pending" state (tier:low) is substituted by the caller, not here.
    """
    total = len(items)
    acked: list[str] = []
    missing: list[str] = []
    for entry in items:
        slug = entry["slug"]
        (acked if ack_state[slug]["ackers"] else missing).append(slug)
    unfilled = [e["slug"] for e in items if not body_state.get(e["slug"], False)]

    parts = [f"acked: {len(acked)}/{total}"]
    if missing:
        # Cap at 3 named slugs so the description stays inside the 140-char budget.
        preview = ", ".join(missing[:3])
        if len(missing) > 3:
            preview += f", +{len(missing) - 3}"
        parts.append(f"missing: {preview}")
    if unfilled:
        parts.append(f"body-unfilled: {len(unfilled)}")
    verdict = "failure" if missing else "success"
    return verdict, " — ".join(parts)
|
|
||||||
|
|
||||||
|
|
||||||
def get_tier_mode(pr: dict[str, Any], cfg: dict[str, Any]) -> str:
    """Map the PR's tier:* label to 'hard'/'soft' via cfg["tier_failure_mode"].

    Falls back to cfg["default_mode"] (default "hard") when no tier label
    matches an entry in the map.
    """
    mode_map = cfg.get("tier_failure_mode") or {}
    for label in pr.get("labels") or []:
        name = label.get("name", "") or ""
        if name.startswith("tier:") and name in mode_map:
            return mode_map[name]
    return cfg.get("default_mode", "hard")
|
|
||||||
|
|
||||||
|
|
||||||
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: evaluate the checklist ack-state for one PR and
    POST the resulting commit status to Gitea.

    Returns a process exit code: 2 on missing token / unusable client,
    1 on malformed PR payload (or, with --exit-on-state, on a failing
    gate), else 0 — the POSTed status, not the exit code, is the gate.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--owner", required=True)
    p.add_argument("--repo", required=True)
    p.add_argument("--pr", type=int, required=True)
    p.add_argument("--config", default=".gitea/sop-checklist-config.yaml")
    p.add_argument("--gitea-host", default="git.moleculesai.app")
    p.add_argument(
        "--dry-run",
        action="store_true",
        help="Compute state but do not POST the status.",
    )
    p.add_argument(
        "--status-context",
        default="sop-checklist / all-items-acked (pull_request)",
    )
    p.add_argument(
        "--exit-on-state",
        action="store_true",
        help=(
            "If set, exit non-zero when state=failure. Default OFF so the "
            "job-level conclusion is independent of ack-state — the only "
            "thing BP sees is the POSTed status. Useful for local debugging."
        ),
    )
    args = p.parse_args(argv)

    # NOTE(review): --dry-run is exempted from the token requirement here,
    # but the client construction below still returns 2 when the token is
    # empty — so dry-run without a token can never proceed. Confirm intent.
    token = os.environ.get("GITEA_TOKEN", "")
    if not token and not args.dry_run:
        print("::error::GITEA_TOKEN env required", file=sys.stderr)
        return 2

    # Config drives the checklist: each item has a slug, required teams,
    # an optional numeric alias, and a PR-body section marker.
    cfg = load_config(args.config)
    items: list[dict[str, Any]] = cfg["items"]
    items_by_slug = {it["slug"]: it for it in items}
    numeric_aliases = {
        int(it["numeric_alias"]): it["slug"] for it in items if it.get("numeric_alias")
    }

    client = GiteaClient(args.gitea_host, token) if token else None
    if not client:
        print("::error::No client (dry-run without token has nothing to do)", file=sys.stderr)
        return 2

    pr = client.get_pr(args.owner, args.repo, args.pr)
    if pr.get("state") != "open":
        print(f"::notice::PR #{args.pr} is {pr.get('state')} — gate is a no-op")
        return 0

    author = (pr.get("user") or {}).get("login", "")
    head_sha = (pr.get("head") or {}).get("sha", "")
    body = pr.get("body", "") or ""

    if not author or not head_sha:
        print("::error::PR payload missing user.login or head.sha", file=sys.stderr)
        return 1

    comments = client.get_issue_comments(args.owner, args.repo, args.pr)

    # Build team-membership probe closure that caches results per
    # (user, team-id) so a user acking multiple items only triggers
    # one membership lookup per team.
    team_member_cache: dict[tuple[str, int], bool | None] = {}

    def probe(slug: str, users: list[str]) -> list[str]:
        # Returns the subset of `users` who belong to at least one of the
        # item's required teams; unresolvable teams make the item fail closed.
        item = items_by_slug[slug]
        team_names: list[str] = item["required_teams"]
        # Resolve names → ids. NOTE: orgs/{org}/teams/search may not be
        # available — fall back to the list endpoint.
        team_ids: list[int] = []
        for tn in team_names:
            tid = client.resolve_team_id(args.owner, tn)
            if tid is None:
                # Try the list endpoint as a fallback.
                code, data = client._req(  # noqa: SLF001
                    "GET", f"/orgs/{args.owner}/teams"
                )
                if code == 200 and isinstance(data, list):
                    for t in data:
                        if t.get("name") == tn:
                            tid = t.get("id")
                            client._team_id_cache[(args.owner, tn)] = tid  # noqa: SLF001
                            break
            if tid is not None:
                team_ids.append(tid)
            else:
                print(
                    f"::warning::could not resolve team-id for '{tn}' "
                    f"in org '{args.owner}' — item '{slug}' will fail closed",
                    file=sys.stderr,
                )
        approved: list[str] = []
        for u in users:
            for tid in team_ids:
                cache_key = (u, tid)
                if cache_key not in team_member_cache:
                    team_member_cache[cache_key] = client.is_team_member(tid, u)
                result = team_member_cache[cache_key]
                if result is True:
                    approved.append(u)
                    break
                if result is None:
                    print(
                        f"::warning::team-probe for {u} in team-id {tid} returned 403 "
                        "(token owner not in that team — fail-closed per RFC#324)",
                        file=sys.stderr,
                    )
                    # Treat as not-in-team for this user/team pair; loop
                    # may still find membership in another team.
        return approved

    ack_state = compute_ack_state(comments, author, items_by_slug, numeric_aliases, probe)
    body_state = {it["slug"]: section_marker_present(body, it["pr_section_marker"]) for it in items}

    state, description = render_status(items, ack_state, body_state)
    mode = get_tier_mode(pr, cfg)
    # Soft tier: downgrade a hard failure to "pending" so BP does not block.
    if state == "failure" and mode == "soft":
        state = "pending"
        description = f"[soft-fail tier:low] {description}"

    # Diagnostics to job log.
    print(f"::notice::PR #{args.pr} author={author} head={head_sha[:7]} mode={mode}")
    for it in items:
        slug = it["slug"]
        ackers = ack_state[slug]["ackers"]
        if ackers:
            print(f"::notice:: [PASS] {slug} — acked by {','.join(ackers)}")
        else:
            r = ack_state[slug]["rejected"]
            extras: list[str] = []
            if r["self_ack"]:
                extras.append(f"self-acks-rejected:{','.join(r['self_ack'])}")
            if r["not_in_team"]:
                extras.append(f"not-in-team:{','.join(r['not_in_team'])}")
            extra = " (" + "; ".join(extras) + ")" if extras else ""
            print(f"::notice:: [WAIT] {slug} — no valid peer-ack yet{extra}")

    print(f"::notice::posting status: state={state} desc={description!r}")

    if args.dry_run:
        print("::notice::--dry-run: not posting status")
        if args.exit_on_state:
            return 0 if state in ("success", "pending") else 1
        return 0

    target_url = f"https://{args.gitea_host}/{args.owner}/{args.repo}/pulls/{args.pr}"
    client.post_status(
        args.owner, args.repo, head_sha,
        state=state, context=args.status_context,
        description=description, target_url=target_url,
    )
    print(f"::notice::status posted: {args.status_context} → {state}")
    # By default exit 0 — the POSTed status IS the gate, NOT the job
    # conclusion. If the job exits 1 BP will see TWO failure signals
    # (one from the job's auto-status, one from our POST), making the
    # description less actionable. --exit-on-state restores the old
    # behavior for local debugging.
    if args.exit_on_state:
        return 0 if state in ("success", "pending") else 1
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
|
|
||||||
@ -1,411 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# sop-tier-check — verify a Gitea PR satisfies the §SOP-6 approval gate.
|
|
||||||
#
|
|
||||||
# Reads the PR's tier label, walks approving reviewers, and checks team
|
|
||||||
# membership against the tier's approval expression. Passes only when
|
|
||||||
# ALL clauses in the expression are satisfied by the set of approving
|
|
||||||
# reviewers (AND-composition; internal#189).
|
|
||||||
#
|
|
||||||
# Expression syntax:
|
|
||||||
# "team-a" — OR-set: any ONE of the comma-separated teams
|
|
||||||
# "team-a AND team-b" — AND: BOTH must each have ≥1 approver
|
|
||||||
# "(a,b,c)" — OR-set wrapped in parens; same as "a,b,c"
|
|
||||||
#
|
|
||||||
# Example: "qa AND security AND (managers,ceo)" means:
|
|
||||||
# ≥1 approver in team "qa" AND
|
|
||||||
# ≥1 approver in team "security" AND
|
|
||||||
# ≥1 approver in team "managers" OR "ceo"
|
|
||||||
#
|
|
||||||
# Per the spec (internal#189), the hard gate here pairs with the
|
|
||||||
# advisory gate of sop-conformance LLM-judge (internal#188): each
|
|
||||||
# required-team click must reflect real verification (visible in review
|
|
||||||
# body or A2A messages), not rubber-stamp APPROVE. Both gates together
|
|
||||||
# close the "teammate clicks APPROVE without verifying" gap.
|
|
||||||
#
|
|
||||||
# Invoked from `.gitea/workflows/sop-tier-check.yml`. The workflow sets
|
|
||||||
# the env vars below; this script does no IO outside of stdout/stderr +
|
|
||||||
# the Gitea API.
|
|
||||||
#
|
|
||||||
# Required env:
|
|
||||||
# GITEA_TOKEN — bot PAT with read:organization,read:user,
|
|
||||||
# read:issue,read:repository scopes
|
|
||||||
# GITEA_HOST — e.g. git.moleculesai.app
|
|
||||||
# REPO — owner/name (from github.repository)
|
|
||||||
# PR_NUMBER — int (from github.event.pull_request.number)
|
|
||||||
# PR_AUTHOR — login (from github.event.pull_request.user.login)
|
|
||||||
#
|
|
||||||
# Optional:
|
|
||||||
# SOP_DEBUG=1 — print per-API-call diagnostic lines. Default: off.
|
|
||||||
# SOP_LEGACY_CHECK=1 — revert to OR-gate (≥1 approver from any eligible
|
|
||||||
# team). Grace window for PRs in-flight when the
|
|
||||||
# new AND-composition was deployed. Expires 2026-05-17
|
|
||||||
# (7-day burn-in window; internal#189 Phase 1).
|
|
||||||
# Set by workflow for PRs merged before the deploy.
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Ensure jq is available. Runners may not have it pre-installed, and the
|
|
||||||
# workflow-level jq install can fail on runners with network restrictions
|
|
||||||
# (GitHub releases not reachable from some runner networks — infra#241
|
|
||||||
# follow-up). This fallback is idempotent — no-op when jq is already on PATH.
|
|
||||||
# SOP_FAIL_OPEN=1 makes this always exit 0 so CI never blocks on jq absence.
|
|
||||||
# Ensure jq is present; try apt-get first, then a pinned GitHub binary.
# FIX: curl now uses -f (--fail) — without it an HTTP 404/5xx error page
# would be saved to /usr/local/bin/jq, chmod would succeed, and a corrupt
# "jq" would shadow the real failure. With -f, curl exits non-zero on any
# HTTP error and the elif condition correctly falls through.
if ! command -v jq >/dev/null 2>&1; then
  echo "::notice::jq not found on PATH — attempting install..."
  _jq_installed="no"
  # apt-get first (primary) — Ubuntu package mirrors are reliably reachable.
  if apt-get update -qq && apt-get install -y -qq jq 2>/dev/null; then
    echo "::notice::jq installed via apt-get: $(jq --version)"
    _jq_installed="yes"
  # GitHub binary as secondary fallback — may fail on restricted networks.
  elif timeout 120 curl -fsSL \
      "https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-amd64" \
      -o /usr/local/bin/jq \
      && chmod +x /usr/local/bin/jq; then
    echo "::notice::jq binary downloaded: $(/usr/local/bin/jq --version)"
    _jq_installed="yes"
  fi
  if ! command -v jq >/dev/null 2>&1; then
    echo "::error::jq installation failed — apt-get and GitHub binary both failed."
    echo "::error::sop-tier-check requires jq for all JSON API parsing."
    # SOP_FAIL_OPEN=1 is set in the workflow step's env — makes script always
    # exit 0 so CI never blocks. The SOP-6 tier review gate remains enforced.
    if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
      echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
      exit 0
    fi
    exit 1
  fi
fi
|
|
||||||
|
|
||||||
# Emit a diagnostic line to stderr — only when SOP_DEBUG=1.
debug() {
  [ "${SOP_DEBUG:-}" = "1" ] || return 0
  printf ' [debug] %s\n' "$*" >&2
}
|
|
||||||
|
|
||||||
# Validate env
|
|
||||||
: "${GITEA_TOKEN:?GITEA_TOKEN required}"
|
|
||||||
: "${GITEA_HOST:?GITEA_HOST required}"
|
|
||||||
: "${REPO:?REPO required (owner/name)}"
|
|
||||||
: "${PR_NUMBER:?PR_NUMBER required}"
|
|
||||||
: "${PR_AUTHOR:?PR_AUTHOR required}"
|
|
||||||
|
|
||||||
OWNER="${REPO%%/*}"
|
|
||||||
NAME="${REPO##*/}"
|
|
||||||
API="https://${GITEA_HOST}/api/v1"
|
|
||||||
AUTH="Authorization: token ${GITEA_TOKEN}"
|
|
||||||
echo "::notice::tier-check start: repo=$OWNER/$NAME pr=$PR_NUMBER author=$PR_AUTHOR"
|
|
||||||
|
|
||||||
# Sanity: token resolves to a user.
|
|
||||||
# Use || true on the jq pipeline so that set -euo pipefail (line 45) does not
|
|
||||||
# cause the script to exit prematurely when the token is empty/invalid — the
|
|
||||||
# if check below handles that case gracefully. Without || true, a 401 from an
|
|
||||||
# empty/invalid token causes jq to exit 1, triggering set -e and exiting the
|
|
||||||
# entire script before SOP_FAIL_OPEN can be evaluated (the check is in the jq-
|
|
||||||
# install block; if jq is already on PATH, that block is skipped entirely).
|
|
||||||
WHOAMI=$(curl -sS -H "$AUTH" "${API}/user" | jq -r '.login // ""') || true
|
|
||||||
if [ -z "$WHOAMI" ]; then
|
|
||||||
echo "::error::GITEA_TOKEN cannot resolve a user via /api/v1/user — check the token scope and that the secret is wired correctly."
|
|
||||||
if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
|
|
||||||
echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "::notice::token resolves to user: $WHOAMI"
|
|
||||||
|
|
||||||
# 1. Read tier label. || true ensures set -euo pipefail does not abort the
#    script if curl or jq fails (e.g. 401 from empty token).
LABELS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/issues/${PR_NUMBER}/labels" | jq -r '.[].name') || true
TIER=""
# Unquoted $LABELS word-splits on newlines/spaces; safe here because the only
# labels we match (tier:low|medium|high) contain no whitespace.
for L in $LABELS; do
  case "$L" in
    tier:low|tier:medium|tier:high)
      # Exactly one tier label may be applied — two is a configuration error.
      if [ -n "$TIER" ]; then
        echo "::error::Multiple tier labels: $TIER + $L. Apply exactly one."
        exit 1
      fi
      TIER="$L"
      ;;
  esac
done
if [ -z "$TIER" ]; then
  echo "::error::PR has no tier:low|tier:medium|tier:high label. Apply one before merge."
  exit 1
fi
debug "tier=$TIER"
|
|
||||||
|
|
||||||
# 2. Tier → required team expression (AND-composition; internal#189)
#
# Expression syntax:
#   clause-a AND clause-b AND ...  — ALL clauses must pass
#   team-a,team-b,team-c           — OR-set: ≥1 approver in ANY of these teams
#   (team-a,team-b)                — same as team-a,team-b (parens optional)
#
# This map is the single source of truth. Update it when the team structure
# or policy changes. Teams referenced here but absent in Gitea are treated
# as unachievable (would always fail) — operators notice the clear error
# and create the missing team.
#
# Current Gitea teams: ceo, engineers, managers
# Future teams (create before removing "???" fallback): qa, security, security-audit
declare -A TIER_EXPR=(
  # tier:low — same as previous OR gate: any engineer, manager, or ceo.
  ["tier:low"]="engineers,managers,ceo"

  # tier:medium — AND of (managers) AND (engineers) AND (qa???,security???)
  # The qa+security clause requires both teams to exist; when not yet
  # created, the PR author is responsible for adding them before requesting
  # approval on a tier:medium PR. Ops: create qa + security Gitea teams
  # and update this map to remove the "???" markers (internal#189 follow-up).
  ["tier:medium"]="managers AND engineers AND qa???,security???"

  # tier:high — ceo only. The AND-composition adds no value for a
  # single-team gate, but the framework is wired for consistency.
  ["tier:high"]="ceo"
)

# ${TIER_EXPR[$TIER]-} — the trailing "-" avoids a set -u error on a miss.
EXPR="${TIER_EXPR[$TIER]-}"
if [ -z "$EXPR" ]; then
  echo "::error::No expression defined for tier $TIER in TIER_EXPR map."
  exit 1
fi
debug "expression=$EXPR"
|
|
||||||
|
|
||||||
# 3. Legacy OR-gate override (7-day burn-in grace window; internal#189 Phase 1)
# NOTE(review): ELIGIBLE is assigned here but not visibly consumed in this
# part of the script — confirm the downstream legacy path actually reads it.
if [ "${SOP_LEGACY_CHECK:-}" = "1" ]; then
  LEGACY_ELIGIBLE=""
  case "$TIER" in
    tier:low) LEGACY_ELIGIBLE="engineers managers ceo" ;;
    tier:medium) LEGACY_ELIGIBLE="managers ceo" ;;
    tier:high) LEGACY_ELIGIBLE="ceo" ;;
  esac
  echo "::notice::SOP_LEGACY_CHECK=1 — using OR-gate ({$LEGACY_ELIGIBLE}) for this PR."
  ELIGIBLE="$LEGACY_ELIGIBLE"
fi
|
|
||||||
|
|
||||||
# 4. Resolve all team names → IDs
#    /orgs/{org}/teams/{slug}/... endpoints don't exist on Gitea 1.22;
#    we use /teams/{id}.
# set +e prevents set -e from aborting the script if curl fails (e.g. empty token).
# The team list is cached in a temp file so every subsequent jq lookup reads it.
ORG_TEAMS_FILE=$(mktemp)
trap 'rm -f "$ORG_TEAMS_FILE"' EXIT
set +e
HTTP_CODE=$(curl -sS -o "$ORG_TEAMS_FILE" -w '%{http_code}' -H "$AUTH" \
  "${API}/orgs/${OWNER}/teams")
_HTTP_EXIT=$?
set -e
debug "teams-list HTTP=$HTTP_CODE (curl exit=$_HTTP_EXIT) size=$(wc -c <"$ORG_TEAMS_FILE")"
if [ "${SOP_DEBUG:-}" = "1" ]; then
  echo " [debug] teams-list body (first 300 chars):" >&2
  head -c 300 "$ORG_TEAMS_FILE" >&2; echo >&2
fi
if [ "$_HTTP_EXIT" -ne 0 ] || [ "$HTTP_CODE" != "200" ]; then
  echo "::error::GET /orgs/${OWNER}/teams failed (curl exit=$_HTTP_EXIT HTTP=$HTTP_CODE) — token may lack read:org scope or be invalid."
  if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
    echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
    exit 0
  fi
  exit 1
fi
|
|
||||||
|
|
||||||
# Collect every team name that appears in the expression.
# Bash word-splitting on $EXPR splits on spaces, so "AND" appears as a
# token. We skip it explicitly.
declare -A TEAM_ID
_all_teams=""
for _raw_clause in $EXPR; do
  # Strip parens and split on comma.
  _clause=${_raw_clause//[()]/}
  for _t in $(echo "$_clause" | tr ',' '\n'); do
    _t=$(echo "$_t" | tr -d '[:space:]')
    [ -z "$_t" ] && continue
    # Skip AND / OR operator tokens (bash word-split produced them from
    # spaces in the expression string).
    # Precedence: `a || b && c` groups left-to-right as `(a || b) && c`,
    # so `continue` runs exactly when _t is "AND" or "OR" — intended.
    [ "$_t" = "AND" ] || [ "$_t" = "OR" ] && continue
    # Skip if already in set (space-delimited membership test).
    case " $_all_teams " in
      *" $_t "*) ;;  # already present
      *) _all_teams="${_all_teams} $_t " ;;
    esac
  done
done
|
|
||||||
|
|
||||||
# Map each collected team name to its numeric id from the cached team list.
for _t in $_all_teams; do
  _t=$(echo "$_t" | tr -d ' ')
  [ -z "$_t" ] && continue
  _id=$(jq -r --arg t "$_t" '.[] | select(.name==$t) | .id' <"$ORG_TEAMS_FILE" | head -1)
  if [ -z "$_id" ] || [ "$_id" = "null" ]; then
    # "???" suffix marks teams that don't exist yet (tier:medium qa/security).
    # Treat as permanently failing clause; clear error message guides ops.
    if [[ "$_t" == *"???" ]]; then
      debug "team \"$_t\" not found (expected — pending team creation per internal#189)"
      continue
    fi
    # Unexpected miss: surface the teams the token CAN see to aid debugging.
    _visible=$(jq -r '.[]?.name? // empty' <"$ORG_TEAMS_FILE" 2>/dev/null | tr '\n' ' ')
    echo "::error::Team \"$_t\" referenced in tier $TIER expression but not found in org $OWNER. Teams visible: $_visible"
    exit 1
  fi
  TEAM_ID[$_t]="$_id"
  debug "team-id: $_t → $_id"
done
|
|
||||||
|
|
||||||
# 5. Read approving reviewers. set +e disables set -e temporarily so that curl
#    failures (e.g. empty/invalid token → HTTP 401) do not abort the script before
#    SOP_FAIL_OPEN is evaluated. set -e is restored immediately after.
set +e
REVIEWS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}/reviews")
_REVIEWS_EXIT=$?
set -e
if [ $_REVIEWS_EXIT -ne 0 ] || [ -z "$REVIEWS" ]; then
  echo "::error::Failed to fetch reviews (curl exit=$_REVIEWS_EXIT) — token may be invalid or unreachable."
  if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
    echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
    exit 0
  fi
  exit 1
fi
# Dedup logins of APPROVED reviews; || true guards the jq pipeline under
# set -e/pipefail when REVIEWS is not valid JSON.
APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]') || true
if [ -z "$APPROVERS" ]; then
  echo "::error::No approving reviews on this PR. Set SOP_DEBUG=1 and re-run for diagnostics."
  exit 1
fi
debug "approvers: $(echo "$APPROVERS" | tr '\n' ' ')"
|
|
||||||
|
|
||||||
# 6. For each approver: skip self-review, then probe team membership by id.
# Builds APPROVER_TEAMS[<user>] = space-padded team names (e.g. " managers ")
# so downstream case patterns can anchor on the surrounding spaces.
#
# FALLBACK: when no team probe succeeds (e.g. every probe 403s because the
# token lacks read:org), fall back to /orgs/{org}/members/{user}. A 204
# means org membership — a superset of team membership — and the user is
# credited for every queried team. Accepted because: (a) org membership is
# a prerequisite for each eligible team; (b) the AND-composition of
# internal#189 still demands multiple independent approvers; (c) any token
# with read:repository can already see approving reviews, so a bypass
# needs a colluding approver.
declare -A APPROVER_TEAMS
for U in $APPROVERS; do
  if [ "$U" = "$PR_AUTHOR" ]; then
    debug "skip self-review by $U"
    continue
  fi
  _any_team_success="no"
  for T in "${!TEAM_ID[@]}"; do
    ID="${TEAM_ID[$T]}"
    CODE=$(curl -sS -o /dev/null -w '%{http_code}' -H "$AUTH" \
      "${API}/teams/${ID}/members/${U}")
    debug "probe: $U in team $T (id=$ID) → HTTP $CODE"
    case "$CODE" in
      200|204)
        APPROVER_TEAMS[$U]="${APPROVER_TEAMS[$U]:- } ${APPROVER_TEAMS[$U]:+ }$T "
        debug "$U qualifies for team $T"
        _any_team_success="yes"
        ;;
    esac
  done
  # Org-membership fallback — only when every per-team probe failed.
  # "??" teams never resolved to IDs, so they never entered the loop.
  if [ "$_any_team_success" = "no" ]; then
    ORG_CODE=$(curl -sS -o /dev/null -w '%{http_code}' -H "$AUTH" \
      "${API}/orgs/${OWNER}/members/${U}")
    debug "probe: $U in org $OWNER (fallback) → HTTP $ORG_CODE"
    if [ "$ORG_CODE" = "204" ]; then
      for T in "${!TEAM_ID[@]}"; do
        APPROVER_TEAMS[$U]="${APPROVER_TEAMS[$U]:- } ${APPROVER_TEAMS[$U]:+ }$T "
      done
      debug "$U credited as org member for all queried teams (fallback — token may lack read:org)"
    fi
  fi
done
|
|
||||||
|
|
||||||
# 7. Evaluate the tier expression.
#
# Legacy OR-gate: the simplified loop from before internal#189 — any one
# approver in any eligible team passes the tier.
if [ -n "${LEGACY_ELIGIBLE:-}" ]; then
  OK=""
  for _u in "${!APPROVER_TEAMS[@]}"; do
    for _t2 in $LEGACY_ELIGIBLE; do
      # FIX: anchor the match on the surrounding spaces. APPROVER_TEAMS
      # values are space-padded (e.g. " managers "), which exists exactly
      # so patterns can be delimited; the previous bare *${_t2}* pattern
      # also matched substrings, so an eligible team "eng" would have been
      # satisfied by a member of "engineers".
      case "${APPROVER_TEAMS[$_u]}" in
        *" ${_t2} "*)
          echo "::notice::approver $_u is in team $_t2 (eligible for $TIER)"
          OK="yes"
          break
          ;;
      esac
    done
    [ -n "$OK" ] && break
  done
  if [ -z "$OK" ]; then
    echo "::error::Tier $TIER requires approval from a non-author member of {$LEGACY_ELIGIBLE}. Set SOP_DEBUG=1 to see per-probe HTTP codes."
    exit 1
  fi
  echo "::notice::sop-tier-check passed: $TIER (legacy OR-gate)"
  exit 0
fi
|
|
||||||
|
|
||||||
# AND-gate: evaluate the expression clause by clause.
# _passed_clauses and _failed_clauses accumulate for the status description.
_passed_clauses=""
_failed_clauses=""

for _raw_clause in $EXPR; do
  # Normalise: strip parens and turn commas into spaces so word-splitting
  # yields one token per OR-set member. (The earlier tr-based pipeline
  # collapsed "engineers,managers,ceo" into one concatenated token and
  # never matched — fixed in #229.)
  _no_parens=${_raw_clause//[()]/}
  _clause=${_no_parens//,/ }
  _clause_passed="no"
  _clause_names=""
  for _t in $_clause; do
    # Append (don't overwrite) to the human-readable accumulator so the
    # FAIL message lists every team in the OR-set, not just the last one.
    _clause_names="${_clause_names}${_clause_names:+, }${_t}"
    # Skip teams not yet in Gitea (qa??? / security??? placeholders).
    [[ "$_t" == *"???" ]] && debug "clause \"$_t\": skipped (team pending creation)" && continue
    [ -z "${TEAM_ID[$_t]:-}" ] && debug "clause \"$_t\": no ID resolved, skipping" && continue
    for _u in "${!APPROVER_TEAMS[@]}"; do
      # FIX: APPROVER_TEAMS values are space-padded (e.g. " managers "),
      # which exists so patterns can anchor on the delimiters. The previous
      # bare *${_t}* pattern matched substrings too, so a clause team "eng"
      # would have been satisfied by a member of "engineers". Anchor on the
      # surrounding spaces instead.
      case "${APPROVER_TEAMS[$_u]}" in
        *" ${_t} "*)
          _clause_passed="yes"
          debug "clause \"$_t\": satisfied by $_u"
          break
          ;;
      esac
    done
  done

  # Label for display: strip "???" from pending teams.
  _label=$(echo "$_raw_clause" | tr -d '()' | tr ',' '/' | tr -d '[:space:]' | sed 's/???//g')

  if [ "$_clause_passed" = "yes" ]; then
    _passed_clauses="${_passed_clauses}${_passed_clauses:+, }$_label"
    echo "::notice::clause [$_label]: PASS — satisfied by approving reviewer(s)"
  else
    _failed_clauses="${_failed_clauses}${_failed_clauses:+, }$_label"
    echo "::error::clause [$_label]: FAIL — no approving reviewer belongs to any of these teams (${_clause_names}). Set SOP_DEBUG=1 to see per-team probe results."
  fi
done

if [ -n "$_failed_clauses" ]; then
  echo ""
  echo "::error::sop-tier-check FAILED for $TIER."
  echo " Passed :${_passed_clauses}"
  echo " Missing:${_failed_clauses}"
  echo " All clauses must be satisfied. Each missing team needs an APPROVED review from one of its members."
  exit 1
fi

echo "::notice::sop-tier-check PASSED: $TIER — all required clauses satisfied [${_passed_clauses}]"
|
|
||||||
@ -1,172 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# sop-tier-refire — re-evaluate sop-tier-check and POST status to PR head SHA.
|
|
||||||
#
|
|
||||||
# Invoked from `.gitea/workflows/sop-tier-refire.yml` when a repo
|
|
||||||
# MEMBER/OWNER/COLLABORATOR comments `/refire-tier-check` on a PR.
|
|
||||||
#
|
|
||||||
# Behavior:
|
|
||||||
#
|
|
||||||
# 1. Resolve PR head SHA + author from PR_NUMBER.
|
|
||||||
# 2. Rate-limit: if the sop-tier-check context has been POSTed in the
|
|
||||||
# last 30 seconds, skip (prevents comment-spam status thrash).
|
|
||||||
# 3. Invoke `.gitea/scripts/sop-tier-check.sh` with the same env the
|
|
||||||
# canonical workflow provides. This is DRY: we re-use the exact AND-
|
|
||||||
# composition gate logic, not a watered-down approving-count check.
|
|
||||||
# 4. POST the resulting status (success on exit 0, failure on non-zero)
|
|
||||||
# to `/repos/.../statuses/{HEAD_SHA}` with context
|
|
||||||
# "sop-tier-check / tier-check (pull_request)" — the same context name
|
|
||||||
# branch protection requires.
|
|
||||||
#
|
|
||||||
# Required env (set by sop-tier-refire.yml):
|
|
||||||
# GITEA_TOKEN — org-level SOP_TIER_CHECK_TOKEN (read:org/user/issue/repo)
|
|
||||||
# GITEA_HOST — e.g. git.moleculesai.app
|
|
||||||
# REPO — owner/name
|
|
||||||
# PR_NUMBER — PR number from issue_comment payload
|
|
||||||
# COMMENT_AUTHOR — login of the commenter (logged for audit)
|
|
||||||
#
|
|
||||||
# Optional:
|
|
||||||
# SOP_DEBUG=1 — verbose per-API-call diagnostics
|
|
||||||
# SOP_REFIRE_RATE_LIMIT_SEC — override the 30s rate-limit (default 30)
|
|
||||||
# SOP_REFIRE_DISABLE_RATE_LIMIT=1 — for tests; skips the rate-limit check
|
|
||||||
|
|
||||||
set -euo pipefail

# Emit a diagnostic line to stderr when SOP_DEBUG=1. Always returns 0 so
# callers can invoke it bare under `set -e`.
debug() {
  [ "${SOP_DEBUG:-}" = "1" ] || return 0
  echo " [debug] $*" >&2
}
|
|
||||||
|
|
||||||
# Env contract — fail fast with a usable message on anything missing.
: "${GITEA_TOKEN:?GITEA_TOKEN required}"
: "${GITEA_HOST:?GITEA_HOST required}"
: "${REPO:?REPO required (owner/name)}"
: "${PR_NUMBER:?PR_NUMBER required}"
: "${COMMENT_AUTHOR:=unknown}"

# Derived values shared by every API call below.
OWNER="${REPO%%/*}"
NAME="${REPO##*/}"
API="https://${GITEA_HOST}/api/v1"
AUTH="Authorization: token ${GITEA_TOKEN}"
# Same context name branch protection requires.
CONTEXT="sop-tier-check / tier-check (pull_request)"
RATE_LIMIT_SEC="${SOP_REFIRE_RATE_LIMIT_SEC:-30}"

echo "::notice::sop-tier-refire start: repo=$OWNER/$NAME pr=$PR_NUMBER commenter=$COMMENT_AUTHOR"
|
|
||||||
|
|
||||||
# 1. Fetch PR details — need head.sha (status target) and user.login
#    (self-review exclusion in the tier check).
PR_FILE=$(mktemp)
trap 'rm -f "$PR_FILE"' EXIT
PR_HTTP=$(curl -sS -o "$PR_FILE" -w '%{http_code}' -H "$AUTH" \
  "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}")
if [[ "$PR_HTTP" != "200" ]]; then
  echo "::error::GET /pulls/$PR_NUMBER returned HTTP $PR_HTTP (body $(head -c 200 "$PR_FILE"))"
  exit 1
fi
HEAD_SHA=$(jq -r '.head.sha' <"$PR_FILE")
PR_AUTHOR=$(jq -r '.user.login' <"$PR_FILE")
PR_STATE=$(jq -r '.state' <"$PR_FILE")
if [[ -z "$HEAD_SHA" || "$HEAD_SHA" == "null" ]]; then
  echo "::error::Could not resolve head.sha from PR #$PR_NUMBER response"
  exit 1
fi
debug "head_sha=$HEAD_SHA pr_author=$PR_AUTHOR state=$PR_STATE"

# Refire is meaningless on a merged/closed PR — succeed without posting.
if [[ "$PR_STATE" != "open" ]]; then
  echo "::notice::PR #$PR_NUMBER state is $PR_STATE; refire is a no-op on closed PRs."
  exit 0
fi
|
|
||||||
|
|
||||||
# 2. Rate-limit: skip if our context was updated within the last
#    $RATE_LIMIT_SEC seconds (prevents comment-spam status thrash).
#    The statuses endpoint returns newest first; we inspect the most
#    recent entry for our context name. Best-effort: a non-200 listing
#    simply skips the rate-limit and proceeds to the check.
if [ "${SOP_REFIRE_DISABLE_RATE_LIMIT:-}" != "1" ]; then
  STATUSES_FILE=$(mktemp)
  trap 'rm -f "$PR_FILE" "$STATUSES_FILE"' EXIT
  ST_HTTP=$(curl -sS -o "$STATUSES_FILE" -w '%{http_code}' -H "$AUTH" \
    "${API}/repos/${OWNER}/${NAME}/statuses/${HEAD_SHA}?limit=50&sort=newest")
  debug "statuses-list HTTP=$ST_HTTP"
  if [ "$ST_HTTP" = "200" ]; then
    LAST_UPDATED=$(jq -r --arg c "$CONTEXT" \
      '[.[] | select(.context == $c)] | first | .updated_at // ""' \
      <"$STATUSES_FILE")
    if [ -n "$LAST_UPDATED" ] && [ "$LAST_UPDATED" != "null" ]; then
      # RFC3339 -> epoch via python3 for portability (BSD and GNU date -d
      # disagree; the runner is Ubuntu today but containers vary).
      # On parse failure fall back to 0, which makes AGE huge and
      # therefore never rate-limits.
      LAST_EPOCH=$(python3 -c "import sys,datetime;print(int(datetime.datetime.fromisoformat(sys.argv[1].replace('Z','+00:00')).timestamp()))" "$LAST_UPDATED" 2>/dev/null || echo "0")
      NOW_EPOCH=$(date -u +%s)
      AGE=$((NOW_EPOCH - LAST_EPOCH))
      debug "last status update: $LAST_UPDATED ($AGE seconds ago)"
      if [ "$AGE" -ge 0 ] && [ "$AGE" -lt "$RATE_LIMIT_SEC" ]; then
        echo "::notice::sop-tier-refire rate-limited — last status update was ${AGE}s ago (<${RATE_LIMIT_SEC}s window). Try again shortly."
        exit 0
      fi
    fi
  fi
fi
|
|
||||||
|
|
||||||
# 3. Locate the canonical tier-check script. We re-run the SAME gate the
#    workflow uses (tier label + approver walk + AND-composition), not a
#    watered-down approving-count check.
#
#    SOP_REFIRE_TIER_CHECK_SCRIPT lets tests substitute a mock:
#    sop-tier-check.sh needs bash 4+ associative arrays, which trip a
#    known bash 3.2 parser bug (`tier: unbound variable` from declare -A
#    under `set -u`). Linux runners ship bash 4/5, so production is fine;
#    the override exists so a bash 3.2 dev box can still exercise the
#    refire glue end-to-end.
SCRIPT="${SOP_REFIRE_TIER_CHECK_SCRIPT:-$(dirname "$0")/sop-tier-check.sh}"
if [ ! -f "$SCRIPT" ]; then
  echo "::error::sop-tier-check.sh not found at $SCRIPT — refire requires the canonical script"
  exit 1
fi
|
|
||||||
|
|
||||||
# Re-invoke the canonical check, passing through the env it expects.
# stdout/stderr are not redirected so the runner log shows the decision
# inline. errexit is suspended so a failing check reaches the status POST.
set +e
GITEA_TOKEN="$GITEA_TOKEN" \
  GITEA_HOST="$GITEA_HOST" \
  REPO="$REPO" \
  PR_NUMBER="$PR_NUMBER" \
  PR_AUTHOR="$PR_AUTHOR" \
  SOP_DEBUG="${SOP_DEBUG:-0}" \
  SOP_LEGACY_CHECK="${SOP_LEGACY_CHECK:-0}" \
  bash "$SCRIPT"
TIER_EXIT=$?
set -e
debug "sop-tier-check.sh exit=$TIER_EXIT"
|
|
||||||
|
|
||||||
# 4. POST the resulting status (success on exit 0, failure otherwise) to
#    the PR head SHA under the branch-protection context name.
if [ "$TIER_EXIT" -eq 0 ]; then
  STATE="success"
  DESCRIPTION="Refired via /refire-tier-check by $COMMENT_AUTHOR"
else
  STATE="failure"
  DESCRIPTION="Refired via /refire-tier-check; tier-check failed (see workflow log)"
fi

# target_url: a runner-log deep link (SERVER_URL + RUN_ID + JOB_ID) is not
# trivially constructible from bash env on Gitea 1.22.6, so point curious
# reviewers at the PR itself.
TARGET_URL="https://${GITEA_HOST}/${OWNER}/${NAME}/pulls/${PR_NUMBER}"

POST_BODY=$(jq -nc \
  --arg state "$STATE" \
  --arg context "$CONTEXT" \
  --arg description "$DESCRIPTION" \
  --arg target_url "$TARGET_URL" \
  '{state:$state, context:$context, description:$description, target_url:$target_url}')

POST_FILE=$(mktemp)
trap 'rm -f "$PR_FILE" "${STATUSES_FILE:-}" "$POST_FILE"' EXIT
POST_HTTP=$(curl -sS -o "$POST_FILE" -w '%{http_code}' \
  -X POST -H "$AUTH" -H "Content-Type: application/json" \
  -d "$POST_BODY" \
  "${API}/repos/${OWNER}/${NAME}/statuses/${HEAD_SHA}")
case "$POST_HTTP" in
  200|201)
    : ;;
  *)
    echo "::error::POST /statuses/$HEAD_SHA returned HTTP $POST_HTTP (body $(head -c 200 "$POST_FILE"))"
    exit 1
    ;;
esac

echo "::notice::sop-tier-refire posted state=$STATE for context=\"$CONTEXT\" on sha=$HEAD_SHA"
# Propagate the tier-check verdict as the workflow exit code.
exit "$TIER_EXIT"
|
|
||||||
@ -1,699 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""status-reaper — Option B compensating-status POST for Gitea 1.22.6's
|
|
||||||
hardcoded `(push)` suffix on default-branch commit statuses.
|
|
||||||
|
|
||||||
Tracking: this PR (workflow + script + tests + audit issue). Sibling
|
|
||||||
bots: internal#327 (publish-runtime-bot), internal#328 (mc-drift-bot).
|
|
||||||
Upstream RFC: internal#80. Persona provisioned by sub-agent aefaac1b
|
|
||||||
(2026-05-11 21:39Z; Gitea uid 94, scope=write:repository).
|
|
||||||
|
|
||||||
What this script does, per `.gitea/workflows/status-reaper.yml` invocation:
|
|
||||||
|
|
||||||
1. Walk `.gitea/workflows/*.yml`. For each file, build the workflow_id
|
|
||||||
using this resolution (per hongming-pc 22:08Z review):
|
|
||||||
- If YAML has top-level `name:` → use that.
|
|
||||||
- Else → use filename stem (basename minus `.yml`).
|
|
||||||
Fail-LOUD on:
|
|
||||||
- Two workflows resolving to the SAME identifier (collision).
|
|
||||||
- Any identifier containing `/` (it would break context parsing
|
|
||||||
downstream — Gitea uses ` / ` as the workflow/job separator).
|
|
||||||
Classify each by whether `on:` contains a `push:` trigger.
|
|
||||||
|
|
||||||
2. List the last N (=30, rev3 — widened from 10) commits on
|
|
||||||
WATCH_BRANCH via GET /repos/{o}/{r}/commits?sha={branch}&limit={N}.
|
|
||||||
rev2 sweeps N commits per tick instead of HEAD only — schedule
|
|
||||||
workflows post `failure` to whatever SHA was HEAD when they
|
|
||||||
COMPLETED, so by the next */5 tick main has often moved forward
|
|
||||||
and the red gets stranded on a stale commit. rev3 widens the
|
|
||||||
window from 10 → 30 because schedule workflows post `failure`
|
|
||||||
RETROACTIVELY (5-15 min after their merge); a 10-commit window
|
|
||||||
is narrower than the merge-cadence during a burst, so reds land
|
|
||||||
OUTSIDE the window before reaper sees them (Phase 1+2 evidence:
|
|
||||||
rev2 run 17057 at 02:46Z saw 185/0 contexts on 10 SHAs; direct
|
|
||||||
probe ~30min later showed ~25 fails on those same 10 SHAs).
|
|
||||||
|
|
||||||
3. For EACH SHA in the list:
|
|
||||||
- GET combined commit status. Per-SHA error isolation
|
|
||||||
(refinement #7): if this call raises ApiError or any 5xx,
|
|
||||||
LOG `::warning::` + continue to the next SHA. Different from
|
|
||||||
the single-HEAD pre-rev2 path where fail-loud was correct;
|
|
||||||
the sweep is best-effort across historical commits, so one
|
|
||||||
transient blip on a stale SHA must not strand reds on the
|
|
||||||
OTHER stale SHAs.
|
|
||||||
- If combined.state == "success": skip — cost optimization
|
|
||||||
(refinement #2), common case (most commits are green).
|
|
||||||
- Otherwise iterate per-context entries. For each entry where:
|
|
||||||
state == "failure" AND context.endswith(" (push)")
|
|
||||||
Parse context as `<workflow_name> / <job_name> (push)`.
|
|
||||||
Look up workflow_name in the trigger map:
|
|
||||||
- missing → log ::notice:: and skip (conservative).
|
|
||||||
- has_push_trigger=True → preserve (real defect signal).
|
|
||||||
- has_push_trigger=False → POST a compensating
|
|
||||||
`state=success` status to /statuses/{sha} with the same
|
|
||||||
context (Gitea de-dups by context) and a description
|
|
||||||
documenting the workaround + this script's path.
|
|
||||||
|
|
||||||
4. Exit 0. Re-running is idempotent — Gitea's commit-status table
|
|
||||||
stores the LATEST state-per-context, so the success POST sticks
|
|
||||||
even if another tick happens before the runner finishes.
|
|
||||||
|
|
||||||
What it does NOT do:
|
|
||||||
- Touch any context NOT ending in ` (push)`. The required-checks on
|
|
||||||
main (verified 2026-05-11) all have ` (pull_request)` suffixes;
|
|
||||||
they CANNOT be reached by this code path.
|
|
||||||
- Compensate `error`/`pending` states. Only `failure` — the only one
|
|
||||||
Gitea emits for the hardcoded-suffix bug.
|
|
||||||
- Write to non-default branches. WATCH_BRANCH is sourced from
|
|
||||||
`github.event.repository.default_branch` in the workflow.
|
|
||||||
- Mutate workflows or runs. The Actions UI still shows the
|
|
||||||
underlying schedule-triggered run as failed; this script edits
|
|
||||||
the commit-status surface only.
|
|
||||||
|
|
||||||
Halt conditions (script-level — orchestrator-level halts are in the
|
|
||||||
workflow comments):
|
|
||||||
- PyYAML missing → fail-loud at import (no fallback parse).
|
|
||||||
- Workflow `name:` collision → exit 1 with ::error:: message.
|
|
||||||
- Workflow `name:` containing `/` → exit 1 with ::error:: message.
|
|
||||||
- Ambiguous `on:` shape (e.g. neither str/list/dict) → treat as
|
|
||||||
"has_push_trigger=True" and log ::notice:: (preserve, never
|
|
||||||
compensate the unknown).
|
|
||||||
- api() non-2xx → raise ApiError, fail the workflow run loudly so
|
|
||||||
a subsequent tick retries (per
|
|
||||||
`feedback_api_helper_must_raise_not_return_dict`).
|
|
||||||
|
|
||||||
Local dry-run (no network):
|
|
||||||
GITEA_TOKEN=... GITEA_HOST=git.moleculesai.app REPO=owner/repo \\
|
|
||||||
WATCH_BRANCH=main WORKFLOWS_DIR=.gitea/workflows \\
|
|
||||||
python3 .gitea/scripts/status-reaper.py --dry-run
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import urllib.error
|
|
||||||
import urllib.parse
|
|
||||||
import urllib.request
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import yaml # PyYAML 6.0.2 — installed by the workflow before this runs.
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Environment
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def _env(key: str, *, default: str = "") -> str:
|
|
||||||
"""Read an env var with a default. Module-import-safe — tests can
|
|
||||||
import this script without setting the full env contract."""
|
|
||||||
return os.environ.get(key, default)
|
|
||||||
|
|
||||||
|
|
||||||
GITEA_TOKEN = _env("GITEA_TOKEN")
GITEA_HOST = _env("GITEA_HOST")
REPO = _env("REPO")
WATCH_BRANCH = _env("WATCH_BRANCH", default="main")
WORKFLOWS_DIR = _env("WORKFLOWS_DIR", default=".gitea/workflows")

# Split "owner/name"; degrade to empty strings when REPO is unset so the
# module stays importable without the env contract.
OWNER, NAME = (REPO.split("/", 1) + [""])[:2] if REPO else ("", "")
API = f"https://{GITEA_HOST}/api/v1" if GITEA_HOST else ""

# Description attached to compensating statuses — the audit marker that
# lets a human tell at a glance the green was synthetic, not a real CI
# pass. Kept stable; downstream tooling (e.g. main-red-watchdog visual
# diff) MAY key on it.
COMPENSATION_DESCRIPTION = (
    "Compensated by status-reaper (workflow has no push: trigger; "
    "Gitea 1.22.6 hardcoded-suffix bug — see .gitea/scripts/status-reaper.py)"
)

# Context suffix the reaper acts on. Gitea hardcodes it for ALL
# default-branch workflow runs.
PUSH_SUFFIX = " (push)"
|
|
||||||
|
|
||||||
|
|
||||||
def _require_runtime_env() -> None:
|
|
||||||
"""Enforce env contract — called from `main()` only.
|
|
||||||
|
|
||||||
Tests import individual functions without setting the full env
|
|
||||||
contract. Mirrors `main-red-watchdog.py`/`ci-required-drift.py`.
|
|
||||||
"""
|
|
||||||
for key in ("GITEA_TOKEN", "GITEA_HOST", "REPO", "WATCH_BRANCH", "WORKFLOWS_DIR"):
|
|
||||||
if not os.environ.get(key):
|
|
||||||
sys.stderr.write(f"::error::missing required env var: {key}\n")
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Tiny HTTP helper — raises on non-2xx + on JSON-decode-of-expected-JSON.
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
class ApiError(RuntimeError):
    """A Gitea API call cannot be trusted to have succeeded.

    Raising (rather than returning ``{}`` on non-2xx) is deliberate — per
    ``feedback_api_helper_must_raise_not_return_dict``, soft failure is
    opt-in via ``expect_json=False``, never the default. A helper that
    swallowed non-2xx would both skip the compensating POST during a
    transient outage and silently drop the failed-status enumeration,
    painting main green by omission.
    """
|
||||||
|
|
||||||
|
|
||||||
def api(
    method: str,
    path: str,
    *,
    body: dict | None = None,
    query: dict[str, str] | None = None,
    expect_json: bool = True,
) -> tuple[int, Any]:
    """Minimal urllib wrapper returning ``(status, parsed_body)``.

    Same contract as ``main-red-watchdog.py`` / ``ci-required-drift.py``
    so behaviour is cross-checkable. Raises :class:`ApiError` on any
    non-2xx status, and — unless ``expect_json=False`` — on a 2xx body
    that is not valid JSON. An empty 2xx body yields ``(status, None)``.
    """
    url = f"{API}{path}"
    if query:
        url = f"{url}?{urllib.parse.urlencode(query)}"
    headers = {
        "Authorization": f"token {GITEA_TOKEN}",
        "Accept": "application/json",
    }
    data = None
    if body is not None:
        headers["Content-Type"] = "application/json"
        data = json.dumps(body).encode("utf-8")

    req = urllib.request.Request(url, method=method, data=data, headers=headers)
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw, status = resp.read(), resp.status
    except urllib.error.HTTPError as e:
        # HTTPError carries the response; capture it and fall through to
        # the shared non-2xx handling below.
        raw, status = e.read(), e.code

    if not (200 <= status < 300):
        snippet = raw[:500].decode("utf-8", errors="replace") if raw else ""
        raise ApiError(f"{method} {path} -> HTTP {status}: {snippet}")

    if not raw:
        return status, None
    try:
        return status, json.loads(raw)
    except json.JSONDecodeError as e:
        if expect_json:
            raise ApiError(
                f"{method} {path} -> HTTP {status} but body is not JSON: {e}"
            ) from e
        return status, {"_raw": raw.decode("utf-8", errors="replace")}
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Workflow scan + classification
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def _on_block(doc: dict) -> Any:
|
|
||||||
"""Extract the `on:` block from a parsed YAML doc.
|
|
||||||
|
|
||||||
PyYAML parses bareword `on:` as Python `True` (YAML 1.1 boolean
|
|
||||||
spec — `on/off/yes/no` are booleans). The actual key in the dict
|
|
||||||
is therefore `True`, NOT the string `"on"`. We accept both for
|
|
||||||
forward-compat with YAML 1.2 loaders (which keep it as `"on"`).
|
|
||||||
"""
|
|
||||||
if True in doc:
|
|
||||||
return doc[True]
|
|
||||||
return doc.get("on")
|
|
||||||
|
|
||||||
|
|
||||||
def _has_push_trigger(on_block: Any, workflow_id: str) -> bool:
|
|
||||||
"""Return True if `on:` block declares a `push` trigger.
|
|
||||||
|
|
||||||
Accepts the three common shapes:
|
|
||||||
- str: `on: push` → True only if == "push"
|
|
||||||
- list: `on: [push, pull_request]` → True if "push" in list
|
|
||||||
- dict: `on: { push: {...}, schedule: ... }` → True if "push" key
|
|
||||||
|
|
||||||
Defensive: for anything else (including None/empty), return True
|
|
||||||
so we preserve rather than over-compensate. Logged via ::notice::.
|
|
||||||
"""
|
|
||||||
if isinstance(on_block, str):
|
|
||||||
return on_block == "push"
|
|
||||||
if isinstance(on_block, list):
|
|
||||||
return "push" in on_block
|
|
||||||
if isinstance(on_block, dict):
|
|
||||||
return "push" in on_block
|
|
||||||
# None or unexpected shape — preserve, log.
|
|
||||||
print(
|
|
||||||
f"::notice::ambiguous on: for {workflow_id}; preserving "
|
|
||||||
f"(value={on_block!r}, type={type(on_block).__name__})"
|
|
||||||
)
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def scan_workflows(workflows_dir: str) -> dict[str, bool]:
    """Walk ``workflows_dir`` and return ``{workflow_id: has_push_trigger}``.

    Workflow ID resolution (per hongming-pc 22:08Z review): top-level
    ``name:`` when present and non-blank, else the filename stem.

    Fail-LOUD (``sys.exit(1)`` with ``::error::``) on:
      - two workflows resolving to the same ID (collision), or
      - an ID containing ``/`` (breaks ` / `-separated context parsing).

    Malformed YAML / non-dict docs are warned about and skipped — the
    reaper should still compensate the OTHER workflows. A missing dir
    yields an empty map, which is safe: every status lookup then hits the
    "unknown workflow; skip" branch (i.e. we preserve).
    """
    root = Path(workflows_dir)
    if not root.is_dir():
        print(f"::warning::workflows dir not found: {workflows_dir}")
        return {}

    out: dict[str, bool] = {}
    sources: dict[str, str] = {}  # workflow_id -> file, for collision msgs

    for yml in sorted(root.glob("*.yml")):
        try:
            with yml.open() as handle:
                doc = yaml.safe_load(handle)
        except yaml.YAMLError as e:
            print(f"::warning::yaml parse failed for {yml.name}: {e}; skip")
            continue
        if not isinstance(doc, dict):
            print(f"::warning::workflow {yml.name} not a dict; skip")
            continue

        # Resolve the workflow's identifier.
        name_field = doc.get("name")
        if isinstance(name_field, str) and name_field.strip():
            workflow_id = name_field.strip()
        else:
            workflow_id = yml.stem  # basename minus .yml

        if "/" in workflow_id:
            sys.stderr.write(
                f"::error::workflow name contains '/' which breaks "
                f"context parsing: {workflow_id} (file={yml.name})\n"
            )
            sys.exit(1)
        if workflow_id in out:
            sys.stderr.write(
                f"::error::workflow name collision detected: {workflow_id} "
                f"(files: {sources[workflow_id]} + {yml.name})\n"
            )
            sys.exit(1)

        out[workflow_id] = _has_push_trigger(_on_block(doc), workflow_id)
        sources[workflow_id] = yml.name

    return out
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Gitea reads
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def get_head_sha(branch: str) -> str:
    """Resolve the HEAD commit SHA of ``branch`` via the branches API.

    Raises :class:`ApiError` on non-2xx (via ``api``) and on any
    unexpected response shape, so callers never operate on a bogus SHA.
    """
    _, body = api("GET", f"/repos/{OWNER}/{NAME}/branches/{branch}")
    if not isinstance(body, dict):
        raise ApiError(f"branch {branch} response not a JSON object")
    commit = body.get("commit")
    if not isinstance(commit, dict):
        raise ApiError(f"branch {branch} response missing `commit` object")
    # Gitea has used both `id` and `sha` for this field across versions.
    sha = commit.get("id") or commit.get("sha")
    if not isinstance(sha, str) or len(sha) < 7:
        raise ApiError(f"branch {branch} response has no usable commit SHA")
    return sha
|
|
||||||
|
|
||||||
|
|
||||||
def get_combined_status(sha: str) -> dict:
|
|
||||||
"""Combined commit status for `sha`. Gitea returns:
|
|
||||||
{
|
|
||||||
"state": "success" | "failure" | "pending" | "error",
|
|
||||||
"statuses": [
|
|
||||||
{"context": "...", "state": "...", "target_url": "...",
|
|
||||||
"description": "..."},
|
|
||||||
...
|
|
||||||
],
|
|
||||||
...
|
|
||||||
}
|
|
||||||
Raises ApiError on non-2xx.
|
|
||||||
"""
|
|
||||||
_, body = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
|
|
||||||
if not isinstance(body, dict):
|
|
||||||
raise ApiError(f"status for {sha} response not a JSON object")
|
|
||||||
return body
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Context parsing
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def parse_push_context(context: str) -> tuple[str, str] | None:
|
|
||||||
"""Parse `<workflow_name> / <job_name> (push)` into
|
|
||||||
(workflow_name, job_name).
|
|
||||||
|
|
||||||
Returns None if the context doesn't match the shape (caller skips).
|
|
||||||
Strict: requires the trailing ` (push)` and at least one ` / `
|
|
||||||
separator. Anything else is left alone.
|
|
||||||
"""
|
|
||||||
if not context.endswith(PUSH_SUFFIX):
|
|
||||||
return None
|
|
||||||
head = context[: -len(PUSH_SUFFIX)] # strip " (push)"
|
|
||||||
if " / " not in head:
|
|
||||||
# No workflow/job separator — not the bug shape we compensate.
|
|
||||||
return None
|
|
||||||
workflow_name, job_name = head.split(" / ", 1)
|
|
||||||
return workflow_name, job_name
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Compensating POST
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def post_compensating_status(
|
|
||||||
sha: str,
|
|
||||||
context: str,
|
|
||||||
target_url: str | None,
|
|
||||||
*,
|
|
||||||
dry_run: bool = False,
|
|
||||||
) -> None:
|
|
||||||
"""POST a `state=success` to /repos/{o}/{r}/statuses/{sha} with the
|
|
||||||
given context. Gitea de-dups by context (latest write wins).
|
|
||||||
|
|
||||||
Description references this script so the compensation is
|
|
||||||
self-documenting on the commit's status view.
|
|
||||||
"""
|
|
||||||
payload: dict[str, Any] = {
|
|
||||||
"context": context,
|
|
||||||
"state": "success",
|
|
||||||
"description": COMPENSATION_DESCRIPTION,
|
|
||||||
}
|
|
||||||
# Echo the original target_url when present so a human auditing
|
|
||||||
# the (now-green) compensated status can still reach the run logs
|
|
||||||
# that produced the original red.
|
|
||||||
if target_url:
|
|
||||||
payload["target_url"] = target_url
|
|
||||||
|
|
||||||
if dry_run:
|
|
||||||
print(
|
|
||||||
f"::notice::[dry-run] would compensate {context!r} on {sha[:10]} "
|
|
||||||
f"with state=success"
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
api("POST", f"/repos/{OWNER}/{NAME}/statuses/{sha}", body=payload)
|
|
||||||
print(f"::notice::compensated {context!r} on {sha[:10]} (state=success)")
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Main reap loop
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def reap(
|
|
||||||
workflow_trigger_map: dict[str, bool],
|
|
||||||
combined: dict,
|
|
||||||
sha: str,
|
|
||||||
*,
|
|
||||||
dry_run: bool = False,
|
|
||||||
) -> dict[str, Any]:
|
|
||||||
"""Walk `combined.statuses[]` and compensate where appropriate.
|
|
||||||
|
|
||||||
Per-SHA worker. The multi-SHA orchestrator (`reap_branch`) calls
|
|
||||||
this once per stale main commit each tick.
|
|
||||||
|
|
||||||
Returns counters for observability:
|
|
||||||
{compensated, preserved_real_push, preserved_unknown,
|
|
||||||
preserved_non_failure, preserved_non_push_suffix,
|
|
||||||
preserved_unparseable,
|
|
||||||
compensated_contexts: [<context>, ...]}
|
|
||||||
|
|
||||||
`compensated_contexts` is rev2-added so `reap_branch` can build
|
|
||||||
`compensated_per_sha` without re-deriving it from the POST stream.
|
|
||||||
"""
|
|
||||||
counters: dict[str, Any] = {
|
|
||||||
"compensated": 0,
|
|
||||||
"preserved_real_push": 0,
|
|
||||||
"preserved_unknown": 0,
|
|
||||||
"preserved_non_failure": 0,
|
|
||||||
"preserved_non_push_suffix": 0,
|
|
||||||
"preserved_unparseable": 0,
|
|
||||||
"compensated_contexts": [],
|
|
||||||
}
|
|
||||||
|
|
||||||
statuses = combined.get("statuses") or []
|
|
||||||
for s in statuses:
|
|
||||||
if not isinstance(s, dict):
|
|
||||||
continue
|
|
||||||
context = s.get("context") or ""
|
|
||||||
# Schema asymmetry: Gitea 1.22.6 returns the TOP-LEVEL combined
|
|
||||||
# aggregate as `combined.state` but each per-context entry in
|
|
||||||
# `combined.statuses[]` uses the key `status`, NOT `state`.
|
|
||||||
# Prefer `status`; fall back to `state` so a future Gitea
|
|
||||||
# version (or a test fixture written against the wrong key)
|
|
||||||
# still flows through the compensation path. Verified empirically
|
|
||||||
# via direct API probe 2026-05-12 03:42Z:
|
|
||||||
# /repos/.../commits/{sha}/status entries → key is "status".
|
|
||||||
# Pre-rev4 code read "state" only → returned "" → bypassed the
|
|
||||||
# `state != "failure"` guard → compensation path unreachable.
|
|
||||||
# See `feedback_smoke_test_vendor_truth_not_shape_match`.
|
|
||||||
state = s.get("status") or s.get("state") or ""
|
|
||||||
|
|
||||||
# Only `failure` is the bug shape. `error`/`pending`/`success`
|
|
||||||
# left alone — they have other meanings.
|
|
||||||
if state != "failure":
|
|
||||||
counters["preserved_non_failure"] += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Only `(push)`-suffix contexts hit the hardcoded-suffix bug.
|
|
||||||
# Branch-protection required checks (e.g. `Secret scan / Scan
|
|
||||||
# diff (pull_request)`) are NOT reachable from this path.
|
|
||||||
if not context.endswith(PUSH_SUFFIX):
|
|
||||||
counters["preserved_non_push_suffix"] += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
parsed = parse_push_context(context)
|
|
||||||
if parsed is None:
|
|
||||||
# Has ` (push)` suffix but missing ` / ` separator — not
|
|
||||||
# the bug shape. Preserve.
|
|
||||||
counters["preserved_unparseable"] += 1
|
|
||||||
continue
|
|
||||||
workflow_name, _job_name = parsed
|
|
||||||
|
|
||||||
if workflow_name not in workflow_trigger_map:
|
|
||||||
# Real workflow but renamed/deleted/external — we can't
|
|
||||||
# tell if it has push trigger. Conservative: preserve.
|
|
||||||
print(f"::notice::unknown workflow {workflow_name!r}; skip")
|
|
||||||
counters["preserved_unknown"] += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
if workflow_trigger_map[workflow_name]:
|
|
||||||
# Real push trigger → real defect signal. Preserve.
|
|
||||||
counters["preserved_real_push"] += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Class-O: schedule/dispatch/etc.-only workflow with a fake
|
|
||||||
# (push) status from Gitea's hardcoded-suffix bug. Compensate.
|
|
||||||
post_compensating_status(
|
|
||||||
sha, context, s.get("target_url"), dry_run=dry_run
|
|
||||||
)
|
|
||||||
counters["compensated"] += 1
|
|
||||||
counters["compensated_contexts"].append(context)
|
|
||||||
|
|
||||||
return counters
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# rev2: multi-SHA sweep over the last N commits on WATCH_BRANCH
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# How many main commits to sweep per tick. Sized to cover a burst-merge
|
|
||||||
# window where multiple PRs land in the 5-min interval between reaper
|
|
||||||
# ticks. Older reds falling off the window is acceptable — they were
|
|
||||||
# already stale enough that the schedule-run that posted them has long
|
|
||||||
# since been overwritten by a real push trigger. See `reference_post_
|
|
||||||
# suspension_pipeline` for the merge-cadence baseline.
|
|
||||||
#
|
|
||||||
# rev3 (2026-05-12, hongming-pc2 GO 03:25Z): widened from 10 → 30.
|
|
||||||
# rev2 (limit=10) shipped 01:48Z and ran 6/6 ticks post-merge with
|
|
||||||
# `compensated:0` despite ~25 stranded reds visible on those same 10
|
|
||||||
# SHAs ~30min later. Root cause: schedule workflows post `failure`
|
|
||||||
# RETROACTIVELY 5-15 min after their merge, so by the time reaper's
|
|
||||||
# next */5 tick lands, the stranded red is on a SHA that has already
|
|
||||||
# fallen out of a 10-commit window during a burst-merge period.
|
|
||||||
# Trades window-width-cheap for cadence-loady (per hongming-pc2):
|
|
||||||
# kept `*/5` cron unchanged; only the window-N is widened.
|
|
||||||
DEFAULT_SWEEP_LIMIT = 30
|
|
||||||
|
|
||||||
|
|
||||||
def list_recent_commit_shas(branch: str, limit: int) -> list[str]:
|
|
||||||
"""List the most recent `limit` commit SHAs on `branch`, newest
|
|
||||||
first.
|
|
||||||
|
|
||||||
Wraps GET /repos/{o}/{r}/commits?sha={branch}&limit={limit}. Gitea
|
|
||||||
1.22.6 returns a JSON list of commit objects each with a `sha` key
|
|
||||||
(verified via vendor-truth probe 2026-05-11 against
|
|
||||||
git.moleculesai.app — `feedback_smoke_test_vendor_truth_not_shape_match`).
|
|
||||||
|
|
||||||
Raises ApiError on non-2xx OR on unexpected response shape. This is
|
|
||||||
a HARD halt — without the commit list the sweep can't proceed. (The
|
|
||||||
per-SHA error isolation downstream is a different concern: tolerating
|
|
||||||
a transient 5xx on ONE commit's status is best-effort; losing the
|
|
||||||
commit list itself means we don't even know which commits to try.)
|
|
||||||
"""
|
|
||||||
_, body = api(
|
|
||||||
"GET",
|
|
||||||
f"/repos/{OWNER}/{NAME}/commits",
|
|
||||||
query={"sha": branch, "limit": str(limit)},
|
|
||||||
)
|
|
||||||
if not isinstance(body, list):
|
|
||||||
raise ApiError(
|
|
||||||
f"commits listing for {branch} not a JSON array "
|
|
||||||
f"(got {type(body).__name__})"
|
|
||||||
)
|
|
||||||
shas: list[str] = []
|
|
||||||
for entry in body:
|
|
||||||
if not isinstance(entry, dict):
|
|
||||||
continue
|
|
||||||
sha = entry.get("sha")
|
|
||||||
if isinstance(sha, str) and len(sha) >= 7:
|
|
||||||
shas.append(sha)
|
|
||||||
if not shas:
|
|
||||||
raise ApiError(
|
|
||||||
f"commits listing for {branch} returned no usable SHAs"
|
|
||||||
)
|
|
||||||
return shas
|
|
||||||
|
|
||||||
|
|
||||||
def reap_branch(
|
|
||||||
workflow_trigger_map: dict[str, bool],
|
|
||||||
branch: str,
|
|
||||||
*,
|
|
||||||
limit: int = DEFAULT_SWEEP_LIMIT,
|
|
||||||
dry_run: bool = False,
|
|
||||||
) -> dict[str, Any]:
|
|
||||||
"""Sweep the last `limit` commits on `branch`, applying `reap()`
|
|
||||||
to each (with per-SHA error isolation).
|
|
||||||
|
|
||||||
Returns aggregated counters PLUS rev2 observability fields:
|
|
||||||
- scanned_shas: how many SHAs we actually iterated
|
|
||||||
- compensated_per_sha: {<sha_full>: [<context>, ...]} — only
|
|
||||||
SHAs that actually got at least one compensation are included
|
|
||||||
"""
|
|
||||||
shas = list_recent_commit_shas(branch, limit)
|
|
||||||
|
|
||||||
aggregate: dict[str, Any] = {
|
|
||||||
"scanned_shas": 0,
|
|
||||||
"compensated": 0,
|
|
||||||
"preserved_real_push": 0,
|
|
||||||
"preserved_unknown": 0,
|
|
||||||
"preserved_non_failure": 0,
|
|
||||||
"preserved_non_push_suffix": 0,
|
|
||||||
"preserved_unparseable": 0,
|
|
||||||
"compensated_per_sha": {},
|
|
||||||
}
|
|
||||||
|
|
||||||
for sha in shas:
|
|
||||||
aggregate["scanned_shas"] += 1
|
|
||||||
|
|
||||||
# Per-SHA error isolation (refinement #7). One transient blip
|
|
||||||
# on a historical commit must NOT abort the whole tick — the
|
|
||||||
# OTHER stale SHAs may still hold strandable reds.
|
|
||||||
try:
|
|
||||||
combined = get_combined_status(sha)
|
|
||||||
except ApiError as e:
|
|
||||||
print(
|
|
||||||
f"::warning::get_combined_status({sha[:10]}) failed; "
|
|
||||||
f"skipping this SHA: {e}"
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Cost optimization (refinement #2): the common case is a green
|
|
||||||
# commit. Skip the per-context loop entirely when combined is
|
|
||||||
# already success — saves a tight loop over ~20 statuses per SHA
|
|
||||||
# on green commits, the dominant majority.
|
|
||||||
if combined.get("state") == "success":
|
|
||||||
continue
|
|
||||||
|
|
||||||
per_sha = reap(
|
|
||||||
workflow_trigger_map, combined, sha, dry_run=dry_run
|
|
||||||
)
|
|
||||||
|
|
||||||
# Aggregate scalar counters.
|
|
||||||
for key in (
|
|
||||||
"compensated",
|
|
||||||
"preserved_real_push",
|
|
||||||
"preserved_unknown",
|
|
||||||
"preserved_non_failure",
|
|
||||||
"preserved_non_push_suffix",
|
|
||||||
"preserved_unparseable",
|
|
||||||
):
|
|
||||||
aggregate[key] += per_sha[key]
|
|
||||||
|
|
||||||
# Record per-SHA compensated contexts (only when non-empty —
|
|
||||||
# keep the summary readable when most SHAs are no-ops).
|
|
||||||
contexts = per_sha.get("compensated_contexts") or []
|
|
||||||
if contexts:
|
|
||||||
aggregate["compensated_per_sha"][sha] = list(contexts)
|
|
||||||
|
|
||||||
return aggregate
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
|
|
||||||
parser = argparse.ArgumentParser(description=__doc__)
|
|
||||||
parser.add_argument(
|
|
||||||
"--dry-run",
|
|
||||||
action="store_true",
|
|
||||||
help="Skip the compensating POST; print what would be done.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--limit",
|
|
||||||
type=int,
|
|
||||||
default=DEFAULT_SWEEP_LIMIT,
|
|
||||||
help=(
|
|
||||||
"How many recent commits on WATCH_BRANCH to sweep per tick "
|
|
||||||
f"(default: {DEFAULT_SWEEP_LIMIT})."
|
|
||||||
),
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
_require_runtime_env()
|
|
||||||
|
|
||||||
workflow_trigger_map = scan_workflows(WORKFLOWS_DIR)
|
|
||||||
print(
|
|
||||||
f"::notice::scanned {len(workflow_trigger_map)} workflows; "
|
|
||||||
f"push-triggered={sum(1 for v in workflow_trigger_map.values() if v)}, "
|
|
||||||
f"class-O candidates={sum(1 for v in workflow_trigger_map.values() if not v)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
counters = reap_branch(
|
|
||||||
workflow_trigger_map,
|
|
||||||
WATCH_BRANCH,
|
|
||||||
limit=args.limit,
|
|
||||||
dry_run=args.dry_run,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Observability: print one JSON line summarising the tick. Loki
|
|
||||||
# ingestion via the runner's stdout (`source="gitea-actions"`).
|
|
||||||
print(
|
|
||||||
"status-reaper summary: "
|
|
||||||
+ json.dumps(
|
|
||||||
{
|
|
||||||
"branch": WATCH_BRANCH,
|
|
||||||
"dry_run": args.dry_run,
|
|
||||||
"limit": args.limit,
|
|
||||||
**counters,
|
|
||||||
},
|
|
||||||
sort_keys=True,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
sys.exit(main())
|
|
||||||
@ -1,28 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Mock sop-tier-check.sh for sop-tier-refire tests.
|
|
||||||
#
|
|
||||||
# Exits 0 ("PASS") if $MOCK_TIER_RESULT == "pass", else exits 1.
|
|
||||||
# This lets the refire tests cover the success + failure status-POST
|
|
||||||
# paths without invoking the real sop-tier-check.sh (which uses bash 4+
|
|
||||||
# associative arrays — known parser bug on macOS bash 3.2 dev box).
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
case "${MOCK_TIER_RESULT:-pass}" in
|
|
||||||
pass)
|
|
||||||
echo "::notice::mock tier-check: PASS"
|
|
||||||
exit 0
|
|
||||||
;;
|
|
||||||
fail_no_label)
|
|
||||||
echo "::error::mock tier-check: no tier label"
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
fail_no_approvals)
|
|
||||||
echo "::error::mock tier-check: no approving reviews"
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "::error::mock tier-check: unknown MOCK_TIER_RESULT=${MOCK_TIER_RESULT:-}"
|
|
||||||
exit 2
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
@ -1,208 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""Stub Gitea API for sop-tier-refire test scenarios.
|
|
||||||
|
|
||||||
Reads $FIXTURE_STATE_DIR/scenario to decide what to return for each
|
|
||||||
endpoint the sop-tier-refire.sh + sop-tier-check.sh scripts call.
|
|
||||||
Captures every POST to /statuses/{sha} into posted_statuses.jsonl so
|
|
||||||
the test can assert what the script tried to write.
|
|
||||||
|
|
||||||
Scenarios:
|
|
||||||
T1_success — tier:low + APPROVED by engineer → tier-check passes
|
|
||||||
T2_no_tier_label — no tier label → tier-check exits 1 before POST
|
|
||||||
T3_no_approvals — tier:low but zero approving reviews → exits 1
|
|
||||||
T4_closed — PR state=closed → refire is a no-op
|
|
||||||
T5_rate_limited — last status update 5 seconds ago → skip
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
FIXTURE_STATE_DIR=/tmp/x python3 _refire_fixture.py 8080
|
|
||||||
"""
|
|
||||||
|
|
||||||
import datetime
|
|
||||||
import http.server
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import urllib.parse
|
|
||||||
|
|
||||||
|
|
||||||
STATE_DIR = os.environ["FIXTURE_STATE_DIR"]
|
|
||||||
|
|
||||||
|
|
||||||
def scenario() -> str:
|
|
||||||
p = os.path.join(STATE_DIR, "scenario")
|
|
||||||
if not os.path.isfile(p):
|
|
||||||
return "T1_success"
|
|
||||||
with open(p) as f:
|
|
||||||
return f.read().strip()
|
|
||||||
|
|
||||||
|
|
||||||
def now_iso() -> str:
|
|
||||||
return datetime.datetime.now(datetime.timezone.utc).isoformat()
|
|
||||||
|
|
||||||
|
|
||||||
def append_post(body: dict) -> None:
|
|
||||||
with open(os.path.join(STATE_DIR, "posted_statuses.jsonl"), "a") as f:
|
|
||||||
f.write(json.dumps(body) + "\n")
|
|
||||||
|
|
||||||
|
|
||||||
def pr_payload() -> dict:
|
|
||||||
sc = scenario()
|
|
||||||
state = "closed" if sc == "T4_closed" else "open"
|
|
||||||
return {
|
|
||||||
"number": 999,
|
|
||||||
"state": state,
|
|
||||||
"head": {"sha": "deadbeef0000111122223333444455556666"},
|
|
||||||
"user": {"login": "feature-author"},
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def labels_payload() -> list:
|
|
||||||
sc = scenario()
|
|
||||||
if sc == "T2_no_tier_label":
|
|
||||||
return [{"name": "bug"}]
|
|
||||||
# All other scenarios use tier:low
|
|
||||||
return [{"name": "tier:low"}, {"name": "ci"}]
|
|
||||||
|
|
||||||
|
|
||||||
def reviews_payload() -> list:
|
|
||||||
sc = scenario()
|
|
||||||
if sc == "T3_no_approvals":
|
|
||||||
return []
|
|
||||||
# All other scenarios have one APPROVED review by an engineer
|
|
||||||
return [
|
|
||||||
{
|
|
||||||
"state": "APPROVED",
|
|
||||||
"user": {"login": "reviewer-engineer"},
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def teams_payload() -> list:
|
|
||||||
# Mirror the real molecule-ai org teams referenced in TIER_EXPR
|
|
||||||
return [
|
|
||||||
{"id": 5, "name": "ceo"},
|
|
||||||
{"id": 2, "name": "engineers"},
|
|
||||||
{"id": 6, "name": "managers"},
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def statuses_payload() -> list:
|
|
||||||
sc = scenario()
|
|
||||||
if sc == "T5_rate_limited":
|
|
||||||
recent = (
|
|
||||||
datetime.datetime.now(datetime.timezone.utc)
|
|
||||||
- datetime.timedelta(seconds=5)
|
|
||||||
).isoformat()
|
|
||||||
return [
|
|
||||||
{
|
|
||||||
"context": "sop-tier-check / tier-check (pull_request)",
|
|
||||||
"state": "failure",
|
|
||||||
"updated_at": recent,
|
|
||||||
}
|
|
||||||
]
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
def user_payload() -> dict:
|
|
||||||
# Mirrors the WHOAMI probe in sop-tier-check.sh
|
|
||||||
return {"login": "sop-tier-bot-fixture"}
|
|
||||||
|
|
||||||
|
|
||||||
class Handler(http.server.BaseHTTPRequestHandler):
|
|
||||||
# Quiet — keep stdout for explicit logs only.
|
|
||||||
def log_message(self, *args, **kwargs): # noqa: D401
|
|
||||||
pass
|
|
||||||
|
|
||||||
def _json(self, code: int, body) -> None:
|
|
||||||
payload = json.dumps(body).encode()
|
|
||||||
self.send_response(code)
|
|
||||||
self.send_header("Content-Type", "application/json")
|
|
||||||
self.send_header("Content-Length", str(len(payload)))
|
|
||||||
self.end_headers()
|
|
||||||
self.wfile.write(payload)
|
|
||||||
|
|
||||||
def _empty(self, code: int) -> None:
|
|
||||||
self.send_response(code)
|
|
||||||
self.send_header("Content-Length", "0")
|
|
||||||
self.end_headers()
|
|
||||||
|
|
||||||
def do_GET(self): # noqa: N802
|
|
||||||
u = urllib.parse.urlparse(self.path)
|
|
||||||
path = u.path
|
|
||||||
|
|
||||||
if path == "/_ping":
|
|
||||||
return self._json(200, {"ok": True})
|
|
||||||
if path == "/api/v1/user":
|
|
||||||
return self._json(200, user_payload())
|
|
||||||
|
|
||||||
# /api/v1/repos/{owner}/{name}/pulls/{n}
|
|
||||||
m = re.match(r"^/api/v1/repos/[^/]+/[^/]+/pulls/(\d+)$", path)
|
|
||||||
if m:
|
|
||||||
return self._json(200, pr_payload())
|
|
||||||
|
|
||||||
# /api/v1/repos/{owner}/{name}/issues/{n}/labels
|
|
||||||
if re.match(r"^/api/v1/repos/[^/]+/[^/]+/issues/\d+/labels$", path):
|
|
||||||
return self._json(200, labels_payload())
|
|
||||||
|
|
||||||
# /api/v1/repos/{owner}/{name}/pulls/{n}/reviews
|
|
||||||
if re.match(r"^/api/v1/repos/[^/]+/[^/]+/pulls/\d+/reviews$", path):
|
|
||||||
return self._json(200, reviews_payload())
|
|
||||||
|
|
||||||
# /api/v1/orgs/{owner}/teams
|
|
||||||
if re.match(r"^/api/v1/orgs/[^/]+/teams$", path):
|
|
||||||
return self._json(200, teams_payload())
|
|
||||||
|
|
||||||
# /api/v1/teams/{id}/members/{login} → 204 if user is an engineer
|
|
||||||
m = re.match(r"^/api/v1/teams/(\d+)/members/([^/]+)$", path)
|
|
||||||
if m:
|
|
||||||
team_id, login = m.group(1), m.group(2)
|
|
||||||
# In our fixture reviewer-engineer ∈ engineers (id=2)
|
|
||||||
if team_id == "2" and login == "reviewer-engineer":
|
|
||||||
return self._empty(204)
|
|
||||||
return self._empty(404)
|
|
||||||
|
|
||||||
# /api/v1/orgs/{owner}/members/{login} — fallback path used when
|
|
||||||
# team-member probes all 403. We don't need it for these tests.
|
|
||||||
if re.match(r"^/api/v1/orgs/[^/]+/members/[^/]+$", path):
|
|
||||||
return self._empty(404)
|
|
||||||
|
|
||||||
# /api/v1/repos/{owner}/{name}/statuses/{sha}
|
|
||||||
if re.match(r"^/api/v1/repos/[^/]+/[^/]+/statuses/[^/]+$", path):
|
|
||||||
return self._json(200, statuses_payload())
|
|
||||||
|
|
||||||
return self._json(404, {"path": path, "msg": "fixture: no route"})
|
|
||||||
|
|
||||||
def do_POST(self): # noqa: N802
|
|
||||||
u = urllib.parse.urlparse(self.path)
|
|
||||||
path = u.path
|
|
||||||
length = int(self.headers.get("Content-Length") or 0)
|
|
||||||
raw = self.rfile.read(length) if length else b""
|
|
||||||
try:
|
|
||||||
body = json.loads(raw) if raw else {}
|
|
||||||
except Exception:
|
|
||||||
body = {"_raw": raw.decode(errors="replace")}
|
|
||||||
|
|
||||||
if re.match(r"^/api/v1/repos/[^/]+/[^/]+/statuses/[^/]+$", path):
|
|
||||||
append_post(body)
|
|
||||||
# Echo back something status-shaped — script only checks HTTP code.
|
|
||||||
return self._json(
|
|
||||||
201,
|
|
||||||
{
|
|
||||||
"context": body.get("context"),
|
|
||||||
"state": body.get("state"),
|
|
||||||
"created_at": now_iso(),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
return self._json(404, {"path": path, "msg": "fixture: no route"})
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
port = int(sys.argv[1])
|
|
||||||
srv = http.server.ThreadingHTTPServer(("127.0.0.1", port), Handler)
|
|
||||||
srv.serve_forever()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@ -1,140 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""Stub Gitea API for review-check.sh test scenarios.
|
|
||||||
|
|
||||||
Reads $FIXTURE_STATE_DIR/scenario to decide what to return for each
|
|
||||||
endpoint the review-check.sh script calls.
|
|
||||||
Reads $FIXTURE_STATE_DIR/token_owner_in_teams to decide whether
|
|
||||||
the team membership probe returns 200/204 (member) or 403 (not in team).
|
|
||||||
|
|
||||||
Scenarios:
|
|
||||||
T1_pr_open — open PR, author=alice, sha=deadbeef → continue
|
|
||||||
T2_pr_closed — closed PR → script exits 0 (no-op)
|
|
||||||
T3_reviews_approved_non_author — one APPROVED from non-author → candidates exist
|
|
||||||
T4_reviews_empty — zero APPROVED non-author → exit 1 (no candidates)
|
|
||||||
T5_reviews_only_author — only author reviews → exit 1 (no candidates)
|
|
||||||
T6_reviews_dismissed — dismissed APPROVED → treated as no approval
|
|
||||||
T7_team_member — team membership → 204 (member) → exit 0
|
|
||||||
T8_team_not_member — team membership → 404 (not a member) → exit 1
|
|
||||||
T9_team_403 — team membership → 403 (token not in team) → exit 1
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
FIXTURE_STATE_DIR=/tmp/x python3 _review_check_fixture.py 8080
|
|
||||||
"""
|
|
||||||
|
|
||||||
import http.server
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import urllib.parse
|
|
||||||
|
|
||||||
|
|
||||||
STATE_DIR = os.environ.get("FIXTURE_STATE_DIR", "/tmp")
|
|
||||||
|
|
||||||
|
|
||||||
def scenario() -> str:
|
|
||||||
p = os.path.join(STATE_DIR, "scenario")
|
|
||||||
if not os.path.isfile(p):
|
|
||||||
return "T1_pr_open"
|
|
||||||
with open(p) as f:
|
|
||||||
return f.read().strip()
|
|
||||||
|
|
||||||
|
|
||||||
class Handler(http.server.BaseHTTPRequestHandler):
|
|
||||||
def log_message(self, *args, **kwargs):
|
|
||||||
pass # keep stdout for explicit logs only
|
|
||||||
|
|
||||||
def _json(self, code: int, body: dict) -> None:
|
|
||||||
payload = json.dumps(body).encode()
|
|
||||||
self.send_response(code)
|
|
||||||
self.send_header("Content-Type", "application/json")
|
|
||||||
self.send_header("Content-Length", str(len(payload)))
|
|
||||||
self.end_headers()
|
|
||||||
self.wfile.write(payload)
|
|
||||||
|
|
||||||
def _empty(self, code: int) -> None:
|
|
||||||
self.send_response(code)
|
|
||||||
self.send_header("Content-Length", "0")
|
|
||||||
self.end_headers()
|
|
||||||
|
|
||||||
def _text(self, code: int, body: str) -> None:
|
|
||||||
payload = body.encode()
|
|
||||||
self.send_response(code)
|
|
||||||
self.send_header("Content-Type", "text/plain")
|
|
||||||
self.send_header("Content-Length", str(len(payload)))
|
|
||||||
self.end_headers()
|
|
||||||
self.wfile.write(payload)
|
|
||||||
|
|
||||||
def do_GET(self):
|
|
||||||
u = urllib.parse.urlparse(self.path)
|
|
||||||
path = u.path
|
|
||||||
sc = scenario()
|
|
||||||
|
|
||||||
if path == "/_ping":
|
|
||||||
return self._json(200, {"ok": True})
|
|
||||||
|
|
||||||
# GET /repos/{owner}/{name}/pulls/{pr_number}
|
|
||||||
m = re.match(r"^/api/v1/repos/([^/]+)/([^/]+)/pulls/(\d+)$", path)
|
|
||||||
if m:
|
|
||||||
owner, name, pr_num = m.group(1), m.group(2), m.group(3)
|
|
||||||
if sc == "T2_pr_closed":
|
|
||||||
return self._json(200, {
|
|
||||||
"number": int(pr_num),
|
|
||||||
"state": "closed",
|
|
||||||
"head": {"sha": "deadbeef0000111122223333444455556666"},
|
|
||||||
"user": {"login": "alice"},
|
|
||||||
})
|
|
||||||
return self._json(200, {
|
|
||||||
"number": int(pr_num),
|
|
||||||
"state": "open",
|
|
||||||
"head": {"sha": "deadbeef0000111122223333444455556666"},
|
|
||||||
"user": {"login": "alice"},
|
|
||||||
})
|
|
||||||
|
|
||||||
# GET /repos/{owner}/{name}/pulls/{pr_number}/reviews
|
|
||||||
m = re.match(r"^/api/v1/repos/([^/]+)/([^/]+)/pulls/(\d+)/reviews$", path)
|
|
||||||
if m:
|
|
||||||
if sc in ("T4_reviews_empty", "T5_reviews_only_author"):
|
|
||||||
return self._json(200, [])
|
|
||||||
if sc == "T6_reviews_dismissed":
|
|
||||||
return self._json(200, [{
|
|
||||||
"state": "APPROVED",
|
|
||||||
"dismissed": True,
|
|
||||||
"user": {"login": "core-devops"},
|
|
||||||
"commit_id": "abc1234",
|
|
||||||
}])
|
|
||||||
if sc == "T3_reviews_approved_non_author":
|
|
||||||
return self._json(200, [
|
|
||||||
{"state": "CHANGES_REQUESTED", "dismissed": False, "user": {"login": "bob"}, "commit_id": "abc1234"},
|
|
||||||
{"state": "APPROVED", "dismissed": False, "user": {"login": "core-devops"}, "commit_id": "abc1234"},
|
|
||||||
])
|
|
||||||
# Default: one non-author APPROVED
|
|
||||||
return self._json(200, [
|
|
||||||
{"state": "APPROVED", "dismissed": False, "user": {"login": "core-devops"}, "commit_id": "abc1234"},
|
|
||||||
])
|
|
||||||
|
|
||||||
# GET /teams/{team_id}/members/{username}
|
|
||||||
m = re.match(r"^/api/v1/teams/(\d+)/members/([^/]+)$", path)
|
|
||||||
if m:
|
|
||||||
team_id, login = m.group(1), m.group(2)
|
|
||||||
if sc == "T8_team_not_member":
|
|
||||||
return self._empty(404)
|
|
||||||
if sc == "T9_team_403":
|
|
||||||
return self._empty(403)
|
|
||||||
# T7_team_member: member
|
|
||||||
return self._empty(204)
|
|
||||||
|
|
||||||
return self._json(404, {"path": path, "msg": "fixture: no route"})
|
|
||||||
|
|
||||||
def do_POST(self):
|
|
||||||
self._json(404, {"path": self.path, "msg": "fixture: no POST routes"})
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
port = int(sys.argv[1])
|
|
||||||
srv = http.server.ThreadingHTTPServer(("127.0.0.1", port), Handler)
|
|
||||||
srv.serve_forever()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@ -1,505 +0,0 @@
|
|||||||
"""Unit tests for .gitea/scripts/lint_pre_flip_continue_on_error.py.
|
|
||||||
|
|
||||||
These tests pin the pure-logic surface (flip detection + per-flip
|
|
||||||
verdict aggregation) without making real HTTP calls. The end-to-end
|
|
||||||
git ls-tree + Gitea API path is exercised by running the workflow
|
|
||||||
against real PRs.
|
|
||||||
|
|
||||||
Run locally::
|
|
||||||
|
|
||||||
python3 -m unittest .gitea/scripts/tests/test_lint_pre_flip_continue_on_error.py -v
|
|
||||||
|
|
||||||
Mirrors the pattern in scripts/ops/test_check_migration_collisions.py
|
|
||||||
+ scripts/test_build_runtime_package.py.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import importlib.util
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import unittest
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
# Load the script as a module without invoking main(). Tests must NOT
|
|
||||||
# depend on the full runtime env contract (GITEA_TOKEN etc.), so we
|
|
||||||
# import individual functions and stub the network surface explicitly.
|
|
||||||
SCRIPT_PATH = Path(__file__).resolve().parent.parent / "lint_pre_flip_continue_on_error.py"
|
|
||||||
spec = importlib.util.spec_from_file_location("lpfc", SCRIPT_PATH)
|
|
||||||
lpfc = importlib.util.module_from_spec(spec)
|
|
||||||
spec.loader.exec_module(lpfc)
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# Fixtures: minimal valid workflow YAML on each side of a "diff"
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
CI_YML_BASE = """\
|
|
||||||
name: CI
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main]
|
|
||||||
jobs:
|
|
||||||
platform-build:
|
|
||||||
name: Platform (Go)
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
continue-on-error: true
|
|
||||||
steps:
|
|
||||||
- run: echo platform
|
|
||||||
canvas-build:
|
|
||||||
name: Canvas (Next.js)
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
continue-on-error: true
|
|
||||||
steps:
|
|
||||||
- run: echo canvas
|
|
||||||
all-required:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
continue-on-error: true
|
|
||||||
needs: [platform-build, canvas-build]
|
|
||||||
steps:
|
|
||||||
- run: echo ok
|
|
||||||
"""
|
|
||||||
|
|
||||||
CI_YML_HEAD_FLIPPED = """\
|
|
||||||
name: CI
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main]
|
|
||||||
jobs:
|
|
||||||
platform-build:
|
|
||||||
name: Platform (Go)
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
continue-on-error: false
|
|
||||||
steps:
|
|
||||||
- run: echo platform
|
|
||||||
canvas-build:
|
|
||||||
name: Canvas (Next.js)
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
continue-on-error: false
|
|
||||||
steps:
|
|
||||||
- run: echo canvas
|
|
||||||
all-required:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
continue-on-error: true
|
|
||||||
needs: [platform-build, canvas-build]
|
|
||||||
steps:
|
|
||||||
- run: echo ok
|
|
||||||
"""
|
|
||||||
|
|
||||||
CI_YML_HEAD_NO_DIFF = CI_YML_BASE # identical to base, no flip
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# 1. CoE coercion (truthy/falsy/quoted/absent)
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
class TestCoerceCoE(unittest.TestCase):
|
|
||||||
def test_python_bool_true(self):
|
|
||||||
self.assertTrue(lpfc._coerce_coe(True))
|
|
||||||
|
|
||||||
def test_python_bool_false(self):
|
|
||||||
self.assertFalse(lpfc._coerce_coe(False))
|
|
||||||
|
|
||||||
def test_none_is_false(self):
|
|
||||||
# GitHub Actions default: absent == false.
|
|
||||||
self.assertFalse(lpfc._coerce_coe(None))
|
|
||||||
|
|
||||||
def test_string_true_lowercase(self):
|
|
||||||
# Quoted "true" in YAML — Gitea Actions normalizes to True.
|
|
||||||
self.assertTrue(lpfc._coerce_coe("true"))
|
|
||||||
|
|
||||||
def test_string_True_titlecase(self):
|
|
||||||
self.assertTrue(lpfc._coerce_coe("True"))
|
|
||||||
|
|
||||||
def test_string_yes(self):
|
|
||||||
# YAML 1.1 truthy form.
|
|
||||||
self.assertTrue(lpfc._coerce_coe("yes"))
|
|
||||||
|
|
||||||
def test_string_false(self):
|
|
||||||
self.assertFalse(lpfc._coerce_coe("false"))
|
|
||||||
|
|
||||||
def test_string_random_falsy(self):
|
|
||||||
# An unrecognized string is treated as falsy — safer than
|
|
||||||
# silently coercing "maybe" to True and false-positiving a
|
|
||||||
# flip.
|
|
||||||
self.assertFalse(lpfc._coerce_coe("maybe"))
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# 2. Diff detection — flips, not arbitrary changes
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
class TestDetectFlips(unittest.TestCase):
|
|
||||||
def test_no_flip_in_diff_passes(self):
|
|
||||||
# Acceptance test #1: PR doesn't flip continue-on-error → 0 flips.
|
|
||||||
flips = lpfc.detect_flips(
|
|
||||||
{".gitea/workflows/ci.yml": CI_YML_BASE},
|
|
||||||
{".gitea/workflows/ci.yml": CI_YML_HEAD_NO_DIFF},
|
|
||||||
)
|
|
||||||
self.assertEqual(flips, [])
|
|
||||||
|
|
||||||
def test_flip_detected_in_one_file(self):
|
|
||||||
flips = lpfc.detect_flips(
|
|
||||||
{".gitea/workflows/ci.yml": CI_YML_BASE},
|
|
||||||
{".gitea/workflows/ci.yml": CI_YML_HEAD_FLIPPED},
|
|
||||||
)
|
|
||||||
# Two jobs flipped: platform-build, canvas-build. all-required
|
|
||||||
# is still true on both sides.
|
|
||||||
self.assertEqual(len(flips), 2)
|
|
||||||
keys = sorted(f["job_key"] for f in flips)
|
|
||||||
self.assertEqual(keys, ["canvas-build", "platform-build"])
|
|
||||||
|
|
||||||
def test_context_name_render(self):
|
|
||||||
flips = lpfc.detect_flips(
|
|
||||||
{".gitea/workflows/ci.yml": CI_YML_BASE},
|
|
||||||
{".gitea/workflows/ci.yml": CI_YML_HEAD_FLIPPED},
|
|
||||||
)
|
|
||||||
platform = next(f for f in flips if f["job_key"] == "platform-build")
|
|
||||||
self.assertEqual(platform["context"], "CI / Platform (Go) (push)")
|
|
||||||
self.assertEqual(platform["workflow_name"], "CI")
|
|
||||||
|
|
||||||
def test_context_falls_back_to_job_key_when_no_name(self):
|
|
||||||
base = "name: WF\njobs:\n foo:\n continue-on-error: true\n runs-on: x\n steps: []\n"
|
|
||||||
head = "name: WF\njobs:\n foo:\n continue-on-error: false\n runs-on: x\n steps: []\n"
|
|
||||||
flips = lpfc.detect_flips({"a.yml": base}, {"a.yml": head})
|
|
||||||
self.assertEqual(len(flips), 1)
|
|
||||||
self.assertEqual(flips[0]["context"], "WF / foo (push)")
|
|
||||||
|
|
||||||
def test_no_flip_when_only_one_side_has_file(self):
|
|
||||||
# Newly added workflow file — head has CoE:false, base has no
|
|
||||||
# file. Adding a new workflow with CoE:false is fine; there's
|
|
||||||
# nothing to mask.
|
|
||||||
flips = lpfc.detect_flips(
|
|
||||||
{}, # base has no workflow files
|
|
||||||
{".gitea/workflows/new.yml": CI_YML_HEAD_FLIPPED},
|
|
||||||
)
|
|
||||||
self.assertEqual(flips, [])
|
|
||||||
|
|
||||||
def test_no_flip_when_job_removed(self):
|
|
||||||
# Job exists on base, not on head — a removal, not a flip.
|
|
||||||
head = """\
|
|
||||||
name: CI
|
|
||||||
jobs:
|
|
||||||
canvas-build:
|
|
||||||
name: Canvas (Next.js)
|
|
||||||
continue-on-error: true
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps: []
|
|
||||||
"""
|
|
||||||
flips = lpfc.detect_flips(
|
|
||||||
{".gitea/workflows/ci.yml": CI_YML_BASE},
|
|
||||||
{".gitea/workflows/ci.yml": head},
|
|
||||||
)
|
|
||||||
self.assertEqual(flips, [])
|
|
||||||
|
|
||||||
def test_no_flip_when_job_added_with_false(self):
|
|
||||||
# New job on head with CoE:false — no base side; not a flip.
|
|
||||||
head_with_new = CI_YML_BASE.replace(
|
|
||||||
" all-required:",
|
|
||||||
" newjob:\n name: New Job\n continue-on-error: false\n"
|
|
||||||
" runs-on: x\n steps: []\n"
|
|
||||||
" all-required:",
|
|
||||||
)
|
|
||||||
flips = lpfc.detect_flips(
|
|
||||||
{".gitea/workflows/ci.yml": CI_YML_BASE},
|
|
||||||
{".gitea/workflows/ci.yml": head_with_new},
|
|
||||||
)
|
|
||||||
self.assertEqual(flips, [])
|
|
||||||
|
|
||||||
def test_yaml_parse_error_warns_not_raises(self):
|
|
||||||
# Malformed YAML on head — should warn (stderr) and skip,
|
|
||||||
# not raise.
|
|
||||||
bad_head = "name: CI\njobs:\n :::\n"
|
|
||||||
# Capture stderr so the test isn't noisy.
|
|
||||||
with mock.patch.object(sys, "stderr"):
|
|
||||||
flips = lpfc.detect_flips(
|
|
||||||
{".gitea/workflows/ci.yml": CI_YML_BASE},
|
|
||||||
{".gitea/workflows/ci.yml": bad_head},
|
|
||||||
)
|
|
||||||
self.assertEqual(flips, [])
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# 3. grep_fail_markers — the regex / substring matcher
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
class TestGrepFailMarkers(unittest.TestCase):
|
|
||||||
def test_clean_log_returns_empty(self):
|
|
||||||
log = "===== test run starting =====\nPASS\nok example.com/foo 1.234s\n"
|
|
||||||
self.assertEqual(lpfc.grep_fail_markers(log), [])
|
|
||||||
|
|
||||||
def test_go_minus_minus_minus_fail_caught(self):
|
|
||||||
log = "ok example.com/foo 1.234s\n--- FAIL: TestBar (0.01s)\n bar_test.go:42:\n"
|
|
||||||
matches = lpfc.grep_fail_markers(log)
|
|
||||||
self.assertEqual(len(matches), 1)
|
|
||||||
self.assertIn("FAIL: TestBar", matches[0])
|
|
||||||
|
|
||||||
def test_go_package_fail_caught(self):
|
|
||||||
log = "FAIL\texample.com/baz\t1.234s\n"
|
|
||||||
matches = lpfc.grep_fail_markers(log)
|
|
||||||
self.assertEqual(len(matches), 1)
|
|
||||||
self.assertIn("FAIL", matches[0])
|
|
||||||
|
|
||||||
def test_bash_error_directive_caught(self):
|
|
||||||
# `lint-curl-status-capture` pattern: a python heredoc inside a
|
|
||||||
# bash step that prints `::error::` then sys.exit(1). With
|
|
||||||
# continue-on-error:true the job rolls up as success despite
|
|
||||||
# this line. THAT's the masking we're trying to catch.
|
|
||||||
log = "Running scan...\n::error::Found 3 curl-status-capture pollution site(s):\n"
|
|
||||||
matches = lpfc.grep_fail_markers(log)
|
|
||||||
self.assertEqual(len(matches), 1)
|
|
||||||
self.assertIn("::error::", matches[0])
|
|
||||||
|
|
||||||
def test_caps_matches_at_max_5(self):
|
|
||||||
log = "\n".join(["--- FAIL: T%d" % i for i in range(20)])
|
|
||||||
matches = lpfc.grep_fail_markers(log)
|
|
||||||
self.assertEqual(len(matches), 5)
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# 4. verify_flip — single-flip verdict assembly (network surface stubbed)
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
def _stub_status(context: str, state: str, target_url: str = "/owner/repo/actions/runs/1/jobs/0") -> dict:
|
|
||||||
"""Build a single-context combined-status response."""
|
|
||||||
return {
|
|
||||||
"state": state,
|
|
||||||
"statuses": [
|
|
||||||
{"context": context, "status": state, "target_url": target_url, "description": ""}
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
FLIP_FIXTURE = {
|
|
||||||
"workflow_path": ".gitea/workflows/ci.yml",
|
|
||||||
"workflow_name": "CI",
|
|
||||||
"job_key": "platform-build",
|
|
||||||
"job_name": "Platform (Go)",
|
|
||||||
"context": "CI / Platform (Go) (push)",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class TestVerifyFlip(unittest.TestCase):
|
|
||||||
def test_flip_with_clean_history_passes(self):
|
|
||||||
# Acceptance test #2: flip detected, last 5 runs clean → exit 0.
|
|
||||||
with mock.patch.object(lpfc, "recent_commits_on_branch", return_value=["sha1", "sha2", "sha3"]):
|
|
||||||
with mock.patch.object(
|
|
||||||
lpfc, "combined_status",
|
|
||||||
side_effect=[_stub_status(FLIP_FIXTURE["context"], "success") for _ in range(3)],
|
|
||||||
):
|
|
||||||
with mock.patch.object(lpfc, "fetch_log", return_value="ok example.com/foo 1s\nPASS\n"):
|
|
||||||
verdict = lpfc.verify_flip(FLIP_FIXTURE, "main", 5)
|
|
||||||
self.assertEqual(verdict["fail_runs"], [])
|
|
||||||
self.assertEqual(verdict["masked_runs"], [])
|
|
||||||
self.assertEqual(verdict["checked_commits"], 3)
|
|
||||||
self.assertEqual(verdict["warnings"], [])
|
|
||||||
|
|
||||||
def test_flip_with_recent_fail_blocks(self):
|
|
||||||
# Acceptance test #3: flip detected, recent run has --- FAIL → exit 1.
|
|
||||||
# Setup: 3 commits, the most recent run's log shows --- FAIL
|
|
||||||
# but the STATUS is success (Quirk #10 mask). That's the
|
|
||||||
# masked_runs case.
|
|
||||||
log_with_fail = "ok example.com/foo 1s\n--- FAIL: TestSqlmock (0.01s)\n sqlmock_test.go:42:\n"
|
|
||||||
with mock.patch.object(lpfc, "recent_commits_on_branch", return_value=["sha1", "sha2", "sha3"]):
|
|
||||||
with mock.patch.object(
|
|
||||||
lpfc, "combined_status",
|
|
||||||
side_effect=[_stub_status(FLIP_FIXTURE["context"], "success") for _ in range(3)],
|
|
||||||
):
|
|
||||||
with mock.patch.object(lpfc, "fetch_log", side_effect=[log_with_fail, "PASS\n", "PASS\n"]):
|
|
||||||
verdict = lpfc.verify_flip(FLIP_FIXTURE, "main", 5)
|
|
||||||
self.assertEqual(len(verdict["masked_runs"]), 1)
|
|
||||||
self.assertEqual(verdict["masked_runs"][0]["sha"], "sha1")
|
|
||||||
self.assertTrue(any("TestSqlmock" in s for s in verdict["masked_runs"][0]["samples"]))
|
|
||||||
self.assertEqual(verdict["fail_runs"], [])
|
|
||||||
|
|
||||||
def test_red_status_alone_blocks(self):
|
|
||||||
# Status itself is `failure` — block without needing log
|
|
||||||
# markers. (Belt-and-braces: even with a clean log, a `failure`
|
|
||||||
# status means the job's exit code was non-zero.)
|
|
||||||
with mock.patch.object(lpfc, "recent_commits_on_branch", return_value=["sha1"]):
|
|
||||||
with mock.patch.object(
|
|
||||||
lpfc, "combined_status",
|
|
||||||
return_value=_stub_status(FLIP_FIXTURE["context"], "failure"),
|
|
||||||
):
|
|
||||||
with mock.patch.object(lpfc, "fetch_log", return_value="some unrelated text\n"):
|
|
||||||
verdict = lpfc.verify_flip(FLIP_FIXTURE, "main", 5)
|
|
||||||
self.assertEqual(len(verdict["fail_runs"]), 1)
|
|
||||||
self.assertEqual(verdict["fail_runs"][0]["status"], "failure")
|
|
||||||
|
|
||||||
def test_unreadable_log_warns_not_blocks(self):
|
|
||||||
# Acceptance test #5: log fetch 404 (None) → warn, not block.
|
|
||||||
# Status is `success`, log is None — we can't tell, so we warn
|
|
||||||
# and allow.
|
|
||||||
with mock.patch.object(lpfc, "recent_commits_on_branch", return_value=["sha1"]):
|
|
||||||
with mock.patch.object(
|
|
||||||
lpfc, "combined_status",
|
|
||||||
return_value=_stub_status(FLIP_FIXTURE["context"], "success"),
|
|
||||||
):
|
|
||||||
with mock.patch.object(lpfc, "fetch_log", return_value=None):
|
|
||||||
verdict = lpfc.verify_flip(FLIP_FIXTURE, "main", 5)
|
|
||||||
self.assertEqual(verdict["fail_runs"], [])
|
|
||||||
self.assertEqual(verdict["masked_runs"], [])
|
|
||||||
self.assertTrue(any("log unavailable" in w for w in verdict["warnings"]))
|
|
||||||
|
|
||||||
def test_unreadable_log_with_failure_status_still_blocks(self):
|
|
||||||
# Edge case: log fetch fails BUT the status itself is `failure`.
|
|
||||||
# We can still block — the status alone is sufficient signal,
|
|
||||||
# we don't need the log to confirm.
|
|
||||||
with mock.patch.object(lpfc, "recent_commits_on_branch", return_value=["sha1"]):
|
|
||||||
with mock.patch.object(
|
|
||||||
lpfc, "combined_status",
|
|
||||||
return_value=_stub_status(FLIP_FIXTURE["context"], "failure"),
|
|
||||||
):
|
|
||||||
with mock.patch.object(lpfc, "fetch_log", return_value=None):
|
|
||||||
verdict = lpfc.verify_flip(FLIP_FIXTURE, "main", 5)
|
|
||||||
self.assertEqual(len(verdict["fail_runs"]), 1)
|
|
||||||
self.assertIn("log unavailable", verdict["fail_runs"][0]["samples"][0])
|
|
||||||
|
|
||||||
def test_zero_runs_history_warns_allows(self):
|
|
||||||
# No commits with a matching context — newly added workflow.
|
|
||||||
# Allow with warning.
|
|
||||||
with mock.patch.object(lpfc, "recent_commits_on_branch", return_value=["sha1", "sha2"]):
|
|
||||||
with mock.patch.object(
|
|
||||||
lpfc, "combined_status",
|
|
||||||
return_value={"state": "success", "statuses": []}, # no matching context
|
|
||||||
):
|
|
||||||
verdict = lpfc.verify_flip(FLIP_FIXTURE, "main", 5)
|
|
||||||
self.assertEqual(verdict["checked_commits"], 0)
|
|
||||||
self.assertEqual(verdict["fail_runs"], [])
|
|
||||||
self.assertEqual(verdict["masked_runs"], [])
|
|
||||||
self.assertTrue(any("no runs of" in w for w in verdict["warnings"]))
|
|
||||||
|
|
||||||
def test_zero_commits_warns_allows(self):
|
|
||||||
# Empty branch (newly created repo, e.g.). Allow with warning.
|
|
||||||
with mock.patch.object(lpfc, "recent_commits_on_branch", return_value=[]):
|
|
||||||
verdict = lpfc.verify_flip(FLIP_FIXTURE, "main", 5)
|
|
||||||
self.assertEqual(verdict["checked_commits"], 0)
|
|
||||||
self.assertEqual(verdict["fail_runs"], [])
|
|
||||||
self.assertEqual(verdict["masked_runs"], [])
|
|
||||||
self.assertTrue(any("no recent commits" in w for w in verdict["warnings"]))
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# 5. Multiple-flip aggregation in main()
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
class TestMainAggregation(unittest.TestCase):
|
|
||||||
"""Tests that `main()` aggregates multiple flips and exits 1 when
|
|
||||||
ANY one of them has a masked or red recent run. Acceptance test #4.
|
|
||||||
|
|
||||||
We stub at the verify_flip + workflows_at_sha + _require_runtime_env
|
|
||||||
boundary so we don't need real git or HTTP.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def setUp(self):
|
|
||||||
# The actual env values are irrelevant — _require_runtime_env
|
|
||||||
# is stubbed out — but the module reads OWNER/NAME at import
|
|
||||||
# time. Patch the runtime env contract to a no-op for the
|
|
||||||
# duration of each test.
|
|
||||||
self._patches = [
|
|
||||||
mock.patch.object(lpfc, "_require_runtime_env", return_value=None),
|
|
||||||
mock.patch.object(lpfc, "BASE_REF", "main"),
|
|
||||||
mock.patch.object(lpfc, "BASE_SHA", "deadbeefcafe"),
|
|
||||||
mock.patch.object(lpfc, "HEAD_SHA", "feedfaceabad"),
|
|
||||||
mock.patch.object(lpfc, "RECENT_COMMITS_N", 5),
|
|
||||||
]
|
|
||||||
for p in self._patches:
|
|
||||||
p.start()
|
|
||||||
self.addCleanup(lambda: [p.stop() for p in self._patches])
|
|
||||||
|
|
||||||
def test_multiple_flips_aggregated_one_bad_blocks(self):
|
|
||||||
# PR flips 3 jobs; 1 has a recent fail → exit 1, naming that job.
|
|
||||||
flips = [
|
|
||||||
{"workflow_path": ".gitea/workflows/ci.yml", "workflow_name": "CI",
|
|
||||||
"job_key": "platform-build", "job_name": "Platform (Go)",
|
|
||||||
"context": "CI / Platform (Go) (push)"},
|
|
||||||
{"workflow_path": ".gitea/workflows/ci.yml", "workflow_name": "CI",
|
|
||||||
"job_key": "canvas-build", "job_name": "Canvas (Next.js)",
|
|
||||||
"context": "CI / Canvas (Next.js) (push)"},
|
|
||||||
{"workflow_path": ".gitea/workflows/ci.yml", "workflow_name": "CI",
|
|
||||||
"job_key": "python-lint", "job_name": "Python Lint & Test",
|
|
||||||
"context": "CI / Python Lint & Test (push)"},
|
|
||||||
]
|
|
||||||
clean = {"flip": flips[0], "checked_commits": 5, "masked_runs": [],
|
|
||||||
"fail_runs": [], "warnings": []}
|
|
||||||
bad = {"flip": flips[1], "checked_commits": 5,
|
|
||||||
"masked_runs": [{"sha": "abc1234567", "status": "success",
|
|
||||||
"target_url": "/x/y/actions/runs/1/jobs/0",
|
|
||||||
"samples": ["--- FAIL: TestSqlmock"]}],
|
|
||||||
"fail_runs": [], "warnings": []}
|
|
||||||
also_clean = {"flip": flips[2], "checked_commits": 5, "masked_runs": [],
|
|
||||||
"fail_runs": [], "warnings": []}
|
|
||||||
|
|
||||||
with mock.patch.object(lpfc, "workflows_at_sha", return_value={}):
|
|
||||||
with mock.patch.object(lpfc, "detect_flips", return_value=flips):
|
|
||||||
with mock.patch.object(lpfc, "verify_flip",
|
|
||||||
side_effect=[clean, bad, also_clean]):
|
|
||||||
# Capture stdout to assert on naming.
|
|
||||||
captured = []
|
|
||||||
with mock.patch("builtins.print", side_effect=lambda *a, **k: captured.append(" ".join(str(x) for x in a))):
|
|
||||||
rc = lpfc.main([])
|
|
||||||
self.assertEqual(rc, 1)
|
|
||||||
# The blocking error message must name the failing job.
|
|
||||||
joined = "\n".join(captured)
|
|
||||||
self.assertIn("canvas-build", joined)
|
|
||||||
# And it must mention the empirical class so a reviewer can
|
|
||||||
# cross-link the right RFC.
|
|
||||||
self.assertTrue("mc#664" in joined or "PR#656" in joined)
|
|
||||||
|
|
||||||
def test_no_flips_in_diff_exits_zero(self):
|
|
||||||
# Acceptance test #1 at main() level: empty flips → exit 0.
|
|
||||||
with mock.patch.object(lpfc, "workflows_at_sha", return_value={}):
|
|
||||||
with mock.patch.object(lpfc, "detect_flips", return_value=[]):
|
|
||||||
rc = lpfc.main([])
|
|
||||||
self.assertEqual(rc, 0)
|
|
||||||
|
|
||||||
def test_all_flips_clean_exits_zero(self):
|
|
||||||
flips = [{"workflow_path": ".gitea/workflows/ci.yml", "workflow_name": "CI",
|
|
||||||
"job_key": "platform-build", "job_name": "Platform (Go)",
|
|
||||||
"context": "CI / Platform (Go) (push)"}]
|
|
||||||
clean = {"flip": flips[0], "checked_commits": 5, "masked_runs": [],
|
|
||||||
"fail_runs": [], "warnings": []}
|
|
||||||
with mock.patch.object(lpfc, "workflows_at_sha", return_value={}):
|
|
||||||
with mock.patch.object(lpfc, "detect_flips", return_value=flips):
|
|
||||||
with mock.patch.object(lpfc, "verify_flip", return_value=clean):
|
|
||||||
rc = lpfc.main([])
|
|
||||||
self.assertEqual(rc, 0)
|
|
||||||
|
|
||||||
def test_dry_run_forces_exit_zero_even_with_bad_flip(self):
|
|
||||||
# --dry-run never fails, even when verification finds masked runs.
|
|
||||||
flips = [{"workflow_path": ".gitea/workflows/ci.yml", "workflow_name": "CI",
|
|
||||||
"job_key": "platform-build", "job_name": "Platform (Go)",
|
|
||||||
"context": "CI / Platform (Go) (push)"}]
|
|
||||||
bad = {"flip": flips[0], "checked_commits": 5,
|
|
||||||
"masked_runs": [{"sha": "abc1234567", "status": "success",
|
|
||||||
"target_url": "/x/y/actions/runs/1/jobs/0",
|
|
||||||
"samples": ["--- FAIL: TestSqlmock"]}],
|
|
||||||
"fail_runs": [], "warnings": []}
|
|
||||||
with mock.patch.object(lpfc, "workflows_at_sha", return_value={}):
|
|
||||||
with mock.patch.object(lpfc, "detect_flips", return_value=flips):
|
|
||||||
with mock.patch.object(lpfc, "verify_flip", return_value=bad):
|
|
||||||
rc = lpfc.main(["--dry-run"])
|
|
||||||
self.assertEqual(rc, 0)
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
# 6. Context-name rendering (the format Gitea Actions actually emits)
|
|
||||||
# --------------------------------------------------------------------------
|
|
||||||
class TestContextName(unittest.TestCase):
|
|
||||||
def test_push_event(self):
|
|
||||||
self.assertEqual(
|
|
||||||
lpfc.context_name("CI", "Platform (Go)", "push"),
|
|
||||||
"CI / Platform (Go) (push)",
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_pull_request_event(self):
|
|
||||||
self.assertEqual(
|
|
||||||
lpfc.context_name("CI", "Platform (Go)", "pull_request"),
|
|
||||||
"CI / Platform (Go) (pull_request)",
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_workflow_name_falls_back_to_filename(self):
|
|
||||||
# No top-level `name:` → falls back to filename minus extension.
|
|
||||||
doc = {"jobs": {"foo": {"continue-on-error": True}}}
|
|
||||||
self.assertEqual(
|
|
||||||
lpfc.workflow_name(doc, fallback="my-workflow"),
|
|
||||||
"my-workflow",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
unittest.main()
|
|
||||||
@ -1,332 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Regression tests for .gitea/scripts/review-check.sh (RFC#324 Step 1).
|
|
||||||
#
|
|
||||||
# Covers:
|
|
||||||
# T1 — open PR: script fetches PR + reviews, continues to team probe
|
|
||||||
# T2 — closed PR: script exits 0 (no-op)
|
|
||||||
# T3 — APPROVED non-author review exists → candidates exist
|
|
||||||
# T4 — no non-author APPROVED reviews → exit 1 (no candidates)
|
|
||||||
# T5 — only author reviews (no non-author APPROVE) → exit 1
|
|
||||||
# T6 — dismissed APPROVED review → treated as no approval
|
|
||||||
# T7 — team membership probe → 204 (member) → script exits 0
|
|
||||||
# T8 — team membership probe → 404 (not a member) → script exits 1
|
|
||||||
# T9 — team membership probe → 403 (token not in team) → script exits 1 (fail closed)
|
|
||||||
# T10 — CURL_AUTH_FILE created with mode 600 and correct header content
|
|
||||||
# T11 — bash syntax check (bash -n passes)
|
|
||||||
# T12 — jq filter: non-author APPROVED → in candidate list; dismissed → excluded
|
|
||||||
# T13 — missing required env GITEA_TOKEN → exits 1 with error
|
|
||||||
#
|
|
||||||
# Hostile-self-review (per feedback_assert_exact_not_substring):
|
|
||||||
# this test MUST FAIL if the script is absent. Verified by running
|
|
||||||
# the test before the file exists (covered in the PR body).
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
THIS_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
||||||
SCRIPT_DIR="$(cd "$THIS_DIR/.." && pwd)"
|
|
||||||
SCRIPT="$SCRIPT_DIR/review-check.sh"
|
|
||||||
|
|
||||||
PASS=0
|
|
||||||
FAIL=0
|
|
||||||
FAILED_TESTS=""
|
|
||||||
|
|
||||||
assert_eq() {
|
|
||||||
local label="$1"
|
|
||||||
local expected="$2"
|
|
||||||
local got="$3"
|
|
||||||
if [ "$expected" = "$got" ]; then
|
|
||||||
echo " PASS $label"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
else
|
|
||||||
echo " FAIL $label"
|
|
||||||
echo " expected: <$expected>"
|
|
||||||
echo " got: <$got>"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} ${label}"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
assert_contains() {
|
|
||||||
local label="$1"
|
|
||||||
local needle="$2"
|
|
||||||
local haystack="$3"
|
|
||||||
if printf '%s' "$haystack" | grep -qF "$needle"; then
|
|
||||||
echo " PASS $label"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
else
|
|
||||||
echo " FAIL $label"
|
|
||||||
echo " needle: <$needle>"
|
|
||||||
echo " haystack: <$(printf '%s' "$haystack" | head -c 200)>"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} ${label}"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
assert_file_mode() {
|
|
||||||
local label="$1"
|
|
||||||
local path="$2"
|
|
||||||
local expected_mode="$3"
|
|
||||||
if [ ! -f "$path" ]; then
|
|
||||||
echo " FAIL $label (file not found: $path)"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} ${label}"
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
local got_mode
|
|
||||||
got_mode=$(stat -c '%a' "$path" 2>/dev/null || echo "000")
|
|
||||||
if [ "$expected_mode" = "$got_mode" ]; then
|
|
||||||
echo " PASS $label (mode=$got_mode)"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
else
|
|
||||||
echo " FAIL $label (expected mode=$expected_mode, got=$got_mode)"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} ${label}"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
assert_file_contains() {
|
|
||||||
local label="$1"
|
|
||||||
local path="$2"
|
|
||||||
local needle="$3"
|
|
||||||
if [ ! -f "$path" ]; then
|
|
||||||
echo " FAIL $label (file not found: $path)"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} ${label}"
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
if grep -qF "$needle" "$path"; then
|
|
||||||
echo " PASS $label"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
else
|
|
||||||
echo " FAIL $label (needle not found: <$needle>)"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} ${label}"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Existence check (foundation)
|
|
||||||
echo
|
|
||||||
echo "== existence =="
|
|
||||||
if [ -f "$SCRIPT" ]; then
|
|
||||||
echo " PASS script exists: $SCRIPT"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
else
|
|
||||||
echo " FAIL script not found: $SCRIPT"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} script_exists"
|
|
||||||
echo
|
|
||||||
echo "------"
|
|
||||||
echo "PASS=$PASS FAIL=$FAIL (existence)"
|
|
||||||
echo "Cannot proceed without the script."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# T11 — bash syntax check
|
|
||||||
echo
|
|
||||||
echo "== T11 bash syntax =="
|
|
||||||
if bash -n "$SCRIPT" 2>&1; then
|
|
||||||
echo " PASS T11 bash -n passes"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
else
|
|
||||||
echo " FAIL T11 bash -n failed"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} T11"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# T13 — missing required env
|
|
||||||
echo
|
|
||||||
echo "== T13 missing GITEA_TOKEN =="
|
|
||||||
set +e
|
|
||||||
T13_OUT=$(PATH="/tmp:$PATH" GITEA_TOKEN= GITEA_HOST=git.example.com REPO=x/y PR_NUMBER=1 TEAM=qa TEAM_ID=1 bash "$SCRIPT" 2>&1 || true)
|
|
||||||
set -e
|
|
||||||
assert_contains "T13 exits non-zero when GITEA_TOKEN missing" "GITEA_TOKEN required" "$T13_OUT"
|
|
||||||
|
|
||||||
# Start fixture HTTP server
|
|
||||||
echo
|
|
||||||
echo "== fixture setup =="
|
|
||||||
FIXTURE_DIR=$(mktemp -d)
|
|
||||||
trap 'rm -rf "$FIXTURE_DIR"; [ -n "${FIX_PID:-}" ] && kill "$FIX_PID" 2>/dev/null || true' EXIT
|
|
||||||
FIXTURE_PY="$THIS_DIR/_review_check_fixture.py"
|
|
||||||
if [ ! -f "$FIXTURE_PY" ]; then
|
|
||||||
echo "::error::fixture server $FIXTURE_PY missing"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
FIX_LOG="$FIXTURE_DIR/fixture.log"
|
|
||||||
FIX_STATE_DIR="$FIXTURE_DIR/state"
|
|
||||||
mkdir -p "$FIX_STATE_DIR"
|
|
||||||
|
|
||||||
# Find an unused port
|
|
||||||
FIX_PORT=$(python3 -c 'import socket;s=socket.socket();s.bind(("127.0.0.1",0));print(s.getsockname()[1]);s.close()')
|
|
||||||
|
|
||||||
FIXTURE_STATE_DIR="$FIX_STATE_DIR" python3 "$FIXTURE_PY" "$FIX_PORT" \
|
|
||||||
>"$FIX_LOG" 2>&1 &
|
|
||||||
FIX_PID=$!
|
|
||||||
|
|
||||||
# Wait for fixture readiness
|
|
||||||
for _ in $(seq 1 50); do
|
|
||||||
if curl -fsS "http://127.0.0.1:${FIX_PORT}/_ping" >/dev/null 2>&1; then
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
sleep 0.1
|
|
||||||
done
|
|
||||||
if ! curl -fsS "http://127.0.0.1:${FIX_PORT}/_ping" >/dev/null 2>&1; then
|
|
||||||
echo "::error::fixture server failed to start. Log:"
|
|
||||||
cat "$FIX_LOG"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo " fixture running on port $FIX_PORT"
|
|
||||||
|
|
||||||
# Install a curl shim that rewrites https://fixture.local/* -> http://127.0.0.1:$FIX_PORT/*
|
|
||||||
# Use double-quoted heredoc so FIX_PORT is expanded into the shim at creation time.
|
|
||||||
mkdir -p "$FIXTURE_DIR/bin"
|
|
||||||
cat >"$FIXTURE_DIR/bin/curl" <<"CURL_SHIM"
|
|
||||||
#!/usr/bin/env bash
|
|
||||||
# Shim: rewrite https://fixture.local/* -> http://127.0.0.1:FIXPORT/*
|
|
||||||
# Generated at test-run time; FIXPORT is substituted when this file is written.
|
|
||||||
new_args=()
|
|
||||||
for a in "$@"; do
|
|
||||||
if [[ "$a" == https://fixture.local/* ]]; then
|
|
||||||
rest="${a#https://fixture.local}"
|
|
||||||
a="http://127.0.0.1:FIXPORT${rest}"
|
|
||||||
fi
|
|
||||||
new_args+=("$a")
|
|
||||||
done
|
|
||||||
exec /usr/bin/curl "${new_args[@]}"
|
|
||||||
CURL_SHIM
|
|
||||||
# Now substitute FIXPORT with the actual port number
|
|
||||||
sed -i "s/FIXPORT/${FIX_PORT}/g" "$FIXTURE_DIR/bin/curl"
|
|
||||||
chmod +x "$FIXTURE_DIR/bin/curl"
|
|
||||||
|
|
||||||
# Helper: run the script with fixture environment
|
|
||||||
run_review_check() {
|
|
||||||
local scenario="$1"
|
|
||||||
echo "$scenario" >"$FIX_STATE_DIR/scenario"
|
|
||||||
local out
|
|
||||||
set +e
|
|
||||||
out=$(
|
|
||||||
PATH="$FIXTURE_DIR/bin:/tmp:$PATH" \
|
|
||||||
GITEA_TOKEN="fixture-token" \
|
|
||||||
GITEA_HOST="fixture.local" \
|
|
||||||
REPO="molecule-ai/molecule-core" \
|
|
||||||
PR_NUMBER="999" \
|
|
||||||
TEAM="qa" \
|
|
||||||
TEAM_ID="20" \
|
|
||||||
REVIEW_CHECK_DEBUG="0" \
|
|
||||||
REVIEW_CHECK_STRICT="0" \
|
|
||||||
bash "$SCRIPT" 2>&1
|
|
||||||
)
|
|
||||||
local rc=$?
|
|
||||||
set -e
|
|
||||||
echo "$out" >"$FIX_STATE_DIR/last_run.log"
|
|
||||||
echo "$rc" >"$FIX_STATE_DIR/last_rc"
|
|
||||||
echo "$out"
|
|
||||||
}
|
|
||||||
|
|
||||||
# T1 — open PR: script fetches PR and continues
|
|
||||||
echo
|
|
||||||
echo "== T1 open PR =="
|
|
||||||
T1_OUT=$(run_review_check "T1_pr_open")
|
|
||||||
T1_RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
assert_eq "T1 exit code 0 (approver exists + team member)" "0" "$T1_RC"
|
|
||||||
assert_contains "T1 qa-review APPROVED by core-devops" "APPROVED by core-devops" "$T1_OUT"
|
|
||||||
|
|
||||||
# T2 — closed PR: exits 0 immediately (no-op)
|
|
||||||
echo
|
|
||||||
echo "== T2 closed PR =="
|
|
||||||
T2_OUT=$(run_review_check "T2_pr_closed")
|
|
||||||
T2_RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
assert_eq "T2 exit code 0 (closed PR no-op)" "0" "$T2_RC"
|
|
||||||
|
|
||||||
# T3 — APPROVED non-author reviews exist
|
|
||||||
echo
|
|
||||||
echo "== T3 approved non-author reviews =="
|
|
||||||
T3_OUT=$(run_review_check "T3_reviews_approved_non_author")
|
|
||||||
T3_RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
assert_eq "T3 exit code 0 (candidates + team member)" "0" "$T3_RC"
|
|
||||||
|
|
||||||
# T4 — no non-author APPROVED reviews → exit 1
|
|
||||||
echo
|
|
||||||
echo "== T4 no non-author APPROVED reviews =="
|
|
||||||
T4_OUT=$(run_review_check "T4_reviews_empty")
|
|
||||||
T4_RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
assert_eq "T4 exit code 1 (no candidates)" "1" "$T4_RC"
|
|
||||||
assert_contains "T4 awaiting non-author APPROVE" "awaiting non-author APPROVE" "$T4_OUT"
|
|
||||||
|
|
||||||
# T5 — only author reviews → exit 1
|
|
||||||
echo
|
|
||||||
echo "== T5 only author reviews =="
|
|
||||||
T5_OUT=$(run_review_check "T5_reviews_only_author")
|
|
||||||
T5_RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
assert_eq "T5 exit code 1 (only author reviews, no candidates)" "1" "$T5_RC"
|
|
||||||
|
|
||||||
# T6 — dismissed APPROVED review → treated as no approval
|
|
||||||
echo
|
|
||||||
echo "== T6 dismissed APPROVED review =="
|
|
||||||
T6_OUT=$(run_review_check "T6_reviews_dismissed")
|
|
||||||
T6_RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
assert_eq "T6 exit code 1 (dismissed = no approval)" "1" "$T6_RC"
|
|
||||||
|
|
||||||
# T7 — team member → exit 0
|
|
||||||
echo
|
|
||||||
echo "== T7 team membership 204 (member) =="
|
|
||||||
T7_OUT=$(run_review_check "T7_team_member")
|
|
||||||
T7_RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
assert_eq "T7 exit code 0 (member, APPROVED)" "0" "$T7_RC"
|
|
||||||
assert_contains "T7 APPROVED by core-devops (team member)" "APPROVED by core-devops" "$T7_OUT"
|
|
||||||
|
|
||||||
# T8 — not a team member → exit 1 (fail closed)
|
|
||||||
echo
|
|
||||||
echo "== T8 team membership 404 (not a member) =="
|
|
||||||
T8_OUT=$(run_review_check "T8_team_not_member")
|
|
||||||
T8_RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
assert_eq "T8 exit code 1 (not in team)" "1" "$T8_RC"
|
|
||||||
|
|
||||||
# T9 — 403 token-not-in-team → exit 1 (fail closed)
|
|
||||||
echo
|
|
||||||
echo "== T9 team membership 403 (token not in team) =="
|
|
||||||
T9_OUT=$(run_review_check "T9_team_403")
|
|
||||||
T9_RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
assert_eq "T9 exit code 1 (403 token-not-in-team, fail closed)" "1" "$T9_RC"
|
|
||||||
assert_contains "T9 403 error in output" "403" "$T9_OUT"
|
|
||||||
|
|
||||||
# T10 — token file creation and permissions
|
|
||||||
echo
|
|
||||||
echo "== T10 CURL_AUTH_FILE =="
|
|
||||||
# Verify the token-file logic directly: create a temp file with the
|
|
||||||
# same mktemp pattern, write the header with printf, chmod 600, then assert.
|
|
||||||
T10_TOKEN="secret-test-token-abc123"
|
|
||||||
T10_AUTHFILE=$(mktemp -p /tmp curl-auth.test.XXXXXX)
|
|
||||||
chmod 600 "$T10_AUTHFILE"
|
|
||||||
printf 'header = "Authorization: token %s"\n' "$T10_TOKEN" > "$T10_AUTHFILE"
|
|
||||||
assert_file_mode "T10a mktemp -p /tmp mode 600 (CURL_AUTH_FILE pattern)" "$T10_AUTHFILE" "600"
|
|
||||||
assert_file_contains "T10b printf header format (CURL_AUTH_FILE content)" "$T10_AUTHFILE" "Authorization: token secret-test-token-abc123"
|
|
||||||
assert_file_contains "T10c 'header =' curl-config syntax" "$T10_AUTHFILE" 'header = "Authorization: token '
|
|
||||||
rm -f "$T10_AUTHFILE"
|
|
||||||
|
|
||||||
# T12 — jq filter: non-author APPROVED included, dismissed excluded
|
|
||||||
echo
|
|
||||||
echo "== T12 jq filter =="
|
|
||||||
# These are tested indirectly via T3 and T6 above, but let's also test
|
|
||||||
# the jq expression directly.
|
|
||||||
JQ_FILTER='.[]
|
|
||||||
| select(.state == "APPROVED")
|
|
||||||
| select(.dismissed != true)
|
|
||||||
| select(.user.login != "alice")
|
|
||||||
| .user.login'
|
|
||||||
|
|
||||||
T12_INPUT='[{"state":"APPROVED","dismissed":false,"user":{"login":"core-devops"}},{"state":"CHANGES_REQUESTED","dismissed":false,"user":{"login":"bob"}},{"state":"APPROVED","dismissed":false,"user":{"login":"alice"}},{"state":"APPROVED","dismissed":true,"user":{"login":"carol"}}]'
|
|
||||||
|
|
||||||
JQ_CMD=$(command -v jq 2>/dev/null || echo /tmp/jq)
|
|
||||||
T12_CANDIDATES=$(echo "$T12_INPUT" | "$JQ_CMD" -r "$JQ_FILTER" 2>/dev/null | sort -u)
|
|
||||||
assert_contains "T12 jq: core-devops (non-author APPROVED) in candidates" "core-devops" "$T12_CANDIDATES"
|
|
||||||
assert_eq "T12 jq: alice (author) NOT in candidates" "" "$(echo "$T12_CANDIDATES" | grep '^alice$' || true)"
|
|
||||||
assert_eq "T12 jq: carol (dismissed) NOT in candidates" "" "$(echo "$T12_CANDIDATES" | grep '^carol$' || true)"
|
|
||||||
|
|
||||||
echo
|
|
||||||
echo "------"
|
|
||||||
echo "PASS=$PASS FAIL=$FAIL"
|
|
||||||
if [ "$FAIL" -gt 0 ]; then
|
|
||||||
echo "Failed:$FAILED_TESTS"
|
|
||||||
fi
|
|
||||||
[ "$FAIL" -eq 0 ]
|
|
||||||
@ -1,524 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# Unit tests for sop-checklist-gate.py
|
|
||||||
#
|
|
||||||
# Run: python3 .gitea/scripts/tests/test_sop_checklist_gate.py
|
|
||||||
# or: pytest .gitea/scripts/tests/test_sop_checklist_gate.py
|
|
||||||
#
|
|
||||||
# RFC#351 Step 2 of 6 — implementation MVP. Tests cover:
|
|
||||||
# - slug normalization (the 4 example variants in the script header)
|
|
||||||
# - parse_directives (ack, revoke, with/without note, mid-comment, etc.)
|
|
||||||
# - section_marker_present (empty answer rejected, filled answer ok)
|
|
||||||
# - compute_ack_state (self-ack rejected, team probe applied, revoke
|
|
||||||
# invalidates own prior ack, peer's ack survives unrevoked)
|
|
||||||
# - render_status (state + description format)
|
|
||||||
# - get_tier_mode (label-driven, default fallback)
|
|
||||||
# - load_config (default config parses cleanly with both PyYAML and
|
|
||||||
# the bundled minimal parser)
|
|
||||||
#
|
|
||||||
# All tests run WITHOUT touching the Gitea API — the team-probe
|
|
||||||
# callable is dependency-injected.
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import tempfile
|
|
||||||
import unittest
|
|
||||||
|
|
||||||
# Resolve sibling script regardless of where pytest is invoked from.
|
|
||||||
HERE = os.path.dirname(os.path.abspath(__file__))
|
|
||||||
PARENT = os.path.dirname(HERE) # .gitea/scripts
|
|
||||||
sys.path.insert(0, PARENT)
|
|
||||||
|
|
||||||
import importlib.util # noqa: E402
|
|
||||||
|
|
||||||
_spec = importlib.util.spec_from_file_location(
|
|
||||||
"sop_checklist_gate", os.path.join(PARENT, "sop-checklist-gate.py")
|
|
||||||
)
|
|
||||||
sop = importlib.util.module_from_spec(_spec)
|
|
||||||
_spec.loader.exec_module(sop) # type: ignore[union-attr]
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Test fixtures
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
CONFIG_PATH = os.path.join(PARENT, "..", "sop-checklist-config.yaml")
|
|
||||||
|
|
||||||
|
|
||||||
def _items() -> list[dict]:
|
|
||||||
cfg = sop.load_config(CONFIG_PATH)
|
|
||||||
return cfg["items"]
|
|
||||||
|
|
||||||
|
|
||||||
def _items_by_slug() -> dict[str, dict]:
|
|
||||||
return {it["slug"]: it for it in _items()}
|
|
||||||
|
|
||||||
|
|
||||||
def _numeric_aliases() -> dict[int, str]:
|
|
||||||
return {
|
|
||||||
int(it["numeric_alias"]): it["slug"]
|
|
||||||
for it in _items()
|
|
||||||
if it.get("numeric_alias")
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _comment(user: str, body: str) -> dict:
|
|
||||||
return {"user": {"login": user}, "body": body}
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# normalize_slug
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestNormalizeSlug(unittest.TestCase):
|
|
||||||
def test_kebab_already(self):
|
|
||||||
self.assertEqual(sop.normalize_slug("comprehensive-testing"), "comprehensive-testing")
|
|
||||||
|
|
||||||
def test_underscore_to_dash(self):
|
|
||||||
self.assertEqual(sop.normalize_slug("comprehensive_testing"), "comprehensive-testing")
|
|
||||||
|
|
||||||
def test_space_to_dash(self):
|
|
||||||
self.assertEqual(sop.normalize_slug("comprehensive testing"), "comprehensive-testing")
|
|
||||||
|
|
||||||
def test_uppercase_to_lower(self):
|
|
||||||
self.assertEqual(sop.normalize_slug("Comprehensive-Testing"), "comprehensive-testing")
|
|
||||||
|
|
||||||
def test_mixed_separators(self):
|
|
||||||
self.assertEqual(sop.normalize_slug("Comprehensive_Testing"), "comprehensive-testing")
|
|
||||||
self.assertEqual(sop.normalize_slug("FIVE_axis review"), "five-axis-review")
|
|
||||||
|
|
||||||
def test_collapse_repeated_dashes(self):
|
|
||||||
self.assertEqual(sop.normalize_slug("comprehensive--testing"), "comprehensive-testing")
|
|
||||||
self.assertEqual(sop.normalize_slug("comprehensive testing"), "comprehensive-testing")
|
|
||||||
|
|
||||||
def test_strip_trailing_punctuation(self):
|
|
||||||
self.assertEqual(sop.normalize_slug("comprehensive-testing."), "comprehensive-testing")
|
|
||||||
self.assertEqual(sop.normalize_slug("comprehensive-testing!"), "comprehensive-testing")
|
|
||||||
|
|
||||||
def test_numeric_shorthand_known(self):
|
|
||||||
self.assertEqual(
|
|
||||||
sop.normalize_slug("1", _numeric_aliases()),
|
|
||||||
"comprehensive-testing",
|
|
||||||
)
|
|
||||||
self.assertEqual(
|
|
||||||
sop.normalize_slug("3", _numeric_aliases()),
|
|
||||||
"staging-smoke",
|
|
||||||
)
|
|
||||||
self.assertEqual(
|
|
||||||
sop.normalize_slug("7", _numeric_aliases()),
|
|
||||||
"memory-consulted",
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_numeric_shorthand_unknown_returns_empty(self):
|
|
||||||
# "8" is out of range → empty so caller can flag as unparseable.
|
|
||||||
self.assertEqual(sop.normalize_slug("8", _numeric_aliases()), "")
|
|
||||||
|
|
||||||
def test_numeric_without_alias_table_keeps_digits(self):
|
|
||||||
# No alias table → return the digits as-is.
|
|
||||||
self.assertEqual(sop.normalize_slug("1"), "1")
|
|
||||||
|
|
||||||
def test_empty_input(self):
|
|
||||||
self.assertEqual(sop.normalize_slug(""), "")
|
|
||||||
self.assertEqual(sop.normalize_slug(" "), "")
|
|
||||||
self.assertEqual(sop.normalize_slug(None), "")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# parse_directives
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestParseDirectives(unittest.TestCase):
|
|
||||||
def setUp(self):
|
|
||||||
self.aliases = _numeric_aliases()
|
|
||||||
|
|
||||||
def test_simple_ack(self):
|
|
||||||
d = sop.parse_directives("/sop-ack comprehensive-testing", self.aliases)
|
|
||||||
self.assertEqual(d, [("sop-ack", "comprehensive-testing", "")])
|
|
||||||
|
|
||||||
def test_simple_revoke(self):
|
|
||||||
d = sop.parse_directives("/sop-revoke staging-smoke", self.aliases)
|
|
||||||
self.assertEqual(d, [("sop-revoke", "staging-smoke", "")])
|
|
||||||
|
|
||||||
def test_ack_with_note(self):
|
|
||||||
d = sop.parse_directives(
|
|
||||||
"/sop-ack comprehensive-testing LGTM the test covers all edge cases",
|
|
||||||
self.aliases,
|
|
||||||
)
|
|
||||||
self.assertEqual(len(d), 1)
|
|
||||||
self.assertEqual(d[0][0], "sop-ack")
|
|
||||||
self.assertEqual(d[0][1], "comprehensive-testing")
|
|
||||||
self.assertIn("LGTM", d[0][2])
|
|
||||||
|
|
||||||
def test_numeric_shorthand(self):
|
|
||||||
d = sop.parse_directives("/sop-ack 1", self.aliases)
|
|
||||||
self.assertEqual(d, [("sop-ack", "comprehensive-testing", "")])
|
|
||||||
|
|
||||||
def test_revoke_with_reason(self):
|
|
||||||
d = sop.parse_directives(
|
|
||||||
"/sop-revoke comprehensive-testing realized the e2e was mocking the DB",
|
|
||||||
self.aliases,
|
|
||||||
)
|
|
||||||
self.assertEqual(d[0][0], "sop-revoke")
|
|
||||||
self.assertEqual(d[0][1], "comprehensive-testing")
|
|
||||||
self.assertIn("mocking", d[0][2])
|
|
||||||
|
|
||||||
def test_directive_in_middle_of_comment(self):
|
|
||||||
body = (
|
|
||||||
"Reviewed the PR, looks good overall.\n"
|
|
||||||
"/sop-ack comprehensive-testing\n"
|
|
||||||
"Will follow up on the doc nit separately."
|
|
||||||
)
|
|
||||||
d = sop.parse_directives(body, self.aliases)
|
|
||||||
self.assertEqual(len(d), 1)
|
|
||||||
self.assertEqual(d[0][1], "comprehensive-testing")
|
|
||||||
|
|
||||||
def test_multiple_directives_in_one_comment(self):
|
|
||||||
body = (
|
|
||||||
"/sop-ack comprehensive-testing\n"
|
|
||||||
"/sop-ack local-postgres-e2e\n"
|
|
||||||
)
|
|
||||||
d = sop.parse_directives(body, self.aliases)
|
|
||||||
self.assertEqual(len(d), 2)
|
|
||||||
slugs = {x[1] for x in d}
|
|
||||||
self.assertEqual(slugs, {"comprehensive-testing", "local-postgres-e2e"})
|
|
||||||
|
|
||||||
def test_must_be_at_line_start(self):
|
|
||||||
# A directive embedded mid-line is not honored (prevents review
|
|
||||||
# comments like "to /sop-ack you need..." from acting as acks).
|
|
||||||
body = "If you want to /sop-ack comprehensive-testing reply in this thread"
|
|
||||||
d = sop.parse_directives(body, self.aliases)
|
|
||||||
self.assertEqual(d, [])
|
|
||||||
|
|
||||||
def test_leading_whitespace_allowed(self):
|
|
||||||
body = " /sop-ack comprehensive-testing"
|
|
||||||
d = sop.parse_directives(body, self.aliases)
|
|
||||||
self.assertEqual(len(d), 1)
|
|
||||||
|
|
||||||
def test_empty_body(self):
|
|
||||||
self.assertEqual(sop.parse_directives("", self.aliases), [])
|
|
||||||
self.assertEqual(sop.parse_directives(None, self.aliases), [])
|
|
||||||
|
|
||||||
def test_normalization_applied(self):
|
|
||||||
# /sop-ack Comprehensive_Testing → canonical comprehensive-testing
|
|
||||||
d = sop.parse_directives("/sop-ack Comprehensive_Testing", self.aliases)
|
|
||||||
self.assertEqual(d[0][1], "comprehensive-testing")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# section_marker_present
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestSectionMarkerPresent(unittest.TestCase):
|
|
||||||
def test_marker_with_inline_answer(self):
|
|
||||||
body = "- [ ] **Comprehensive testing performed**: Added 12 new tests covering null/empty/giant inputs."
|
|
||||||
self.assertTrue(sop.section_marker_present(body, "Comprehensive testing performed"))
|
|
||||||
|
|
||||||
def test_marker_with_empty_answer(self):
|
|
||||||
body = "- [ ] **Comprehensive testing performed**:"
|
|
||||||
self.assertFalse(sop.section_marker_present(body, "Comprehensive testing performed"))
|
|
||||||
|
|
||||||
def test_marker_with_only_whitespace_answer(self):
|
|
||||||
body = "- [ ] **Comprehensive testing performed**: \n"
|
|
||||||
self.assertFalse(sop.section_marker_present(body, "Comprehensive testing performed"))
|
|
||||||
|
|
||||||
def test_marker_with_next_line_answer(self):
|
|
||||||
body = (
|
|
||||||
"- [ ] **Comprehensive testing performed**:\n"
|
|
||||||
" Yes — see attached log + 12 new unit tests in foo_test.py.\n"
|
|
||||||
)
|
|
||||||
self.assertTrue(sop.section_marker_present(body, "Comprehensive testing performed"))
|
|
||||||
|
|
||||||
def test_marker_missing(self):
|
|
||||||
body = "- [ ] **Local-postgres E2E run**: N/A — pure-frontend\n"
|
|
||||||
self.assertFalse(sop.section_marker_present(body, "Comprehensive testing performed"))
|
|
||||||
|
|
||||||
def test_case_insensitive_marker_match(self):
|
|
||||||
body = "- [ ] **comprehensive TESTING performed**: yes"
|
|
||||||
self.assertTrue(sop.section_marker_present(body, "Comprehensive testing performed"))
|
|
||||||
|
|
||||||
def test_empty_body(self):
|
|
||||||
self.assertFalse(sop.section_marker_present("", "X"))
|
|
||||||
self.assertFalse(sop.section_marker_present(None, "X"))
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# compute_ack_state
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestComputeAckState(unittest.TestCase):
|
|
||||||
def setUp(self):
|
|
||||||
self.items = _items_by_slug()
|
|
||||||
self.aliases = _numeric_aliases()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _approve_all(slug, users):
|
|
||||||
return list(users)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _approve_none(slug, users):
|
|
||||||
return []
|
|
||||||
|
|
||||||
def _approve_only(self, allowed_users):
|
|
||||||
return lambda slug, users: [u for u in users if u in allowed_users]
|
|
||||||
|
|
||||||
def test_peer_ack_passes(self):
|
|
||||||
comments = [_comment("bob", "/sop-ack comprehensive-testing")]
|
|
||||||
state = sop.compute_ack_state(
|
|
||||||
comments, "alice", self.items, self.aliases, self._approve_all
|
|
||||||
)
|
|
||||||
self.assertEqual(state["comprehensive-testing"]["ackers"], ["bob"])
|
|
||||||
|
|
||||||
def test_self_ack_rejected(self):
|
|
||||||
comments = [_comment("alice", "/sop-ack comprehensive-testing")]
|
|
||||||
state = sop.compute_ack_state(
|
|
||||||
comments, "alice", self.items, self.aliases, self._approve_all
|
|
||||||
)
|
|
||||||
self.assertEqual(state["comprehensive-testing"]["ackers"], [])
|
|
||||||
self.assertEqual(state["comprehensive-testing"]["rejected"]["self_ack"], ["alice"])
|
|
||||||
|
|
||||||
def test_not_in_team_rejected(self):
|
|
||||||
comments = [_comment("eve", "/sop-ack comprehensive-testing")]
|
|
||||||
state = sop.compute_ack_state(
|
|
||||||
comments, "alice", self.items, self.aliases, self._approve_none
|
|
||||||
)
|
|
||||||
self.assertEqual(state["comprehensive-testing"]["ackers"], [])
|
|
||||||
self.assertEqual(state["comprehensive-testing"]["rejected"]["not_in_team"], ["eve"])
|
|
||||||
|
|
||||||
def test_revoke_invalidates_own_prior_ack(self):
|
|
||||||
# Bob acks then later revokes — Bob no longer counts.
|
|
||||||
comments = [
|
|
||||||
_comment("bob", "/sop-ack comprehensive-testing"),
|
|
||||||
_comment("bob", "/sop-revoke comprehensive-testing realized e2e was mocked"),
|
|
||||||
]
|
|
||||||
state = sop.compute_ack_state(
|
|
||||||
comments, "alice", self.items, self.aliases, self._approve_all
|
|
||||||
)
|
|
||||||
self.assertEqual(state["comprehensive-testing"]["ackers"], [])
|
|
||||||
|
|
||||||
def test_revoke_does_not_affect_others_acks(self):
|
|
||||||
# Bob revokes his own ack; Carol's still counts.
|
|
||||||
comments = [
|
|
||||||
_comment("bob", "/sop-ack comprehensive-testing"),
|
|
||||||
_comment("carol", "/sop-ack comprehensive-testing"),
|
|
||||||
_comment("bob", "/sop-revoke comprehensive-testing"),
|
|
||||||
]
|
|
||||||
state = sop.compute_ack_state(
|
|
||||||
comments, "alice", self.items, self.aliases, self._approve_all
|
|
||||||
)
|
|
||||||
self.assertEqual(state["comprehensive-testing"]["ackers"], ["carol"])
|
|
||||||
|
|
||||||
def test_ack_after_revoke_restored(self):
|
|
||||||
# Bob revokes then re-acks (e.g. after re-reviewing).
|
|
||||||
comments = [
|
|
||||||
_comment("bob", "/sop-ack comprehensive-testing"),
|
|
||||||
_comment("bob", "/sop-revoke comprehensive-testing"),
|
|
||||||
_comment("bob", "/sop-ack comprehensive-testing"),
|
|
||||||
]
|
|
||||||
state = sop.compute_ack_state(
|
|
||||||
comments, "alice", self.items, self.aliases, self._approve_all
|
|
||||||
)
|
|
||||||
self.assertEqual(state["comprehensive-testing"]["ackers"], ["bob"])
|
|
||||||
|
|
||||||
def test_numeric_shorthand_ack(self):
|
|
||||||
# /sop-ack 1 → comprehensive-testing
|
|
||||||
comments = [_comment("bob", "/sop-ack 1")]
|
|
||||||
state = sop.compute_ack_state(
|
|
||||||
comments, "alice", self.items, self.aliases, self._approve_all
|
|
||||||
)
|
|
||||||
self.assertEqual(state["comprehensive-testing"]["ackers"], ["bob"])
|
|
||||||
|
|
||||||
def test_ack_for_unknown_slug_ignored(self):
|
|
||||||
# Some other slug not in config — silently drop (doesn't crash).
|
|
||||||
comments = [_comment("bob", "/sop-ack does-not-exist")]
|
|
||||||
state = sop.compute_ack_state(
|
|
||||||
comments, "alice", self.items, self.aliases, self._approve_all
|
|
||||||
)
|
|
||||||
for slug in self.items:
|
|
||||||
self.assertEqual(state[slug]["ackers"], [])
|
|
||||||
|
|
||||||
def test_multi_item_multi_user(self):
|
|
||||||
comments = [
|
|
||||||
_comment("bob", "/sop-ack comprehensive-testing\n/sop-ack staging-smoke"),
|
|
||||||
_comment("carol", "/sop-ack five-axis-review"),
|
|
||||||
]
|
|
||||||
state = sop.compute_ack_state(
|
|
||||||
comments, "alice", self.items, self.aliases, self._approve_all
|
|
||||||
)
|
|
||||||
self.assertEqual(state["comprehensive-testing"]["ackers"], ["bob"])
|
|
||||||
self.assertEqual(state["staging-smoke"]["ackers"], ["bob"])
|
|
||||||
self.assertEqual(state["five-axis-review"]["ackers"], ["carol"])
|
|
||||||
self.assertEqual(state["root-cause"]["ackers"], [])
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# render_status
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestRenderStatus(unittest.TestCase):
|
|
||||||
def setUp(self):
|
|
||||||
self.items = _items()
|
|
||||||
self.items_by_slug = _items_by_slug()
|
|
||||||
|
|
||||||
def _state_with(self, acked: list[str]) -> dict:
|
|
||||||
return {
|
|
||||||
it["slug"]: {
|
|
||||||
"ackers": ["peer"] if it["slug"] in acked else [],
|
|
||||||
"rejected": {"self_ack": [], "not_in_team": []},
|
|
||||||
}
|
|
||||||
for it in self.items
|
|
||||||
}
|
|
||||||
|
|
||||||
def test_all_acked_returns_success(self):
|
|
||||||
all_slugs = [it["slug"] for it in self.items]
|
|
||||||
state, desc = sop.render_status(
|
|
||||||
self.items, self._state_with(all_slugs), {s: True for s in all_slugs}
|
|
||||||
)
|
|
||||||
self.assertEqual(state, "success")
|
|
||||||
self.assertIn("7/7", desc)
|
|
||||||
|
|
||||||
def test_partial_acked_returns_failure(self):
|
|
||||||
state, desc = sop.render_status(
|
|
||||||
self.items,
|
|
||||||
self._state_with(["comprehensive-testing", "staging-smoke"]),
|
|
||||||
{it["slug"]: True for it in self.items},
|
|
||||||
)
|
|
||||||
self.assertEqual(state, "failure")
|
|
||||||
self.assertIn("2/7", desc)
|
|
||||||
self.assertIn("missing", desc)
|
|
||||||
|
|
||||||
def test_description_truncates_long_missing_list(self):
|
|
||||||
# Only ack one — 6 missing should be summarized as "+N".
|
|
||||||
state, desc = sop.render_status(
|
|
||||||
self.items,
|
|
||||||
self._state_with(["comprehensive-testing"]),
|
|
||||||
{it["slug"]: True for it in self.items},
|
|
||||||
)
|
|
||||||
# Length budget: under 140 chars.
|
|
||||||
self.assertLessEqual(len(desc), 140)
|
|
||||||
self.assertIn("+", desc) # +N elision marker
|
|
||||||
|
|
||||||
def test_body_unfilled_surfaced(self):
|
|
||||||
all_slugs = [it["slug"] for it in self.items]
|
|
||||||
state, desc = sop.render_status(
|
|
||||||
self.items,
|
|
||||||
self._state_with(all_slugs),
|
|
||||||
{it["slug"]: False for it in self.items},
|
|
||||||
)
|
|
||||||
self.assertIn("body-unfilled", desc)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# get_tier_mode
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestGetTierMode(unittest.TestCase):
|
|
||||||
def setUp(self):
|
|
||||||
self.cfg = sop.load_config(CONFIG_PATH)
|
|
||||||
|
|
||||||
def test_tier_high_is_hard(self):
|
|
||||||
pr = {"labels": [{"name": "tier:high"}, {"name": "area:ci"}]}
|
|
||||||
self.assertEqual(sop.get_tier_mode(pr, self.cfg), "hard")
|
|
||||||
|
|
||||||
def test_tier_medium_is_hard(self):
|
|
||||||
pr = {"labels": [{"name": "tier:medium"}]}
|
|
||||||
self.assertEqual(sop.get_tier_mode(pr, self.cfg), "hard")
|
|
||||||
|
|
||||||
def test_tier_low_is_soft(self):
|
|
||||||
pr = {"labels": [{"name": "tier:low"}]}
|
|
||||||
self.assertEqual(sop.get_tier_mode(pr, self.cfg), "soft")
|
|
||||||
|
|
||||||
def test_no_tier_label_defaults_to_hard(self):
|
|
||||||
# Per feedback_fix_root_not_symptom — never silently lower the bar.
|
|
||||||
pr = {"labels": [{"name": "area:ci"}]}
|
|
||||||
self.assertEqual(sop.get_tier_mode(pr, self.cfg), "hard")
|
|
||||||
|
|
||||||
def test_no_labels_defaults_to_hard(self):
|
|
||||||
self.assertEqual(sop.get_tier_mode({"labels": []}, self.cfg), "hard")
|
|
||||||
self.assertEqual(sop.get_tier_mode({}, self.cfg), "hard")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# load_config
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestLoadConfig(unittest.TestCase):
|
|
||||||
def test_default_config_parses(self):
|
|
||||||
cfg = sop.load_config(CONFIG_PATH)
|
|
||||||
self.assertIn("items", cfg)
|
|
||||||
self.assertEqual(len(cfg["items"]), 7)
|
|
||||||
slugs = {it["slug"] for it in cfg["items"]}
|
|
||||||
self.assertEqual(
|
|
||||||
slugs,
|
|
||||||
{
|
|
||||||
"comprehensive-testing",
|
|
||||||
"local-postgres-e2e",
|
|
||||||
"staging-smoke",
|
|
||||||
"root-cause",
|
|
||||||
"five-axis-review",
|
|
||||||
"no-backwards-compat",
|
|
||||||
"memory-consulted",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_default_config_tier_mode_shape(self):
|
|
||||||
cfg = sop.load_config(CONFIG_PATH)
|
|
||||||
self.assertEqual(cfg["tier_failure_mode"]["tier:high"], "hard")
|
|
||||||
self.assertEqual(cfg["tier_failure_mode"]["tier:medium"], "hard")
|
|
||||||
self.assertEqual(cfg["tier_failure_mode"]["tier:low"], "soft")
|
|
||||||
self.assertEqual(cfg["default_mode"], "hard")
|
|
||||||
|
|
||||||
def test_each_item_has_required_fields(self):
|
|
||||||
cfg = sop.load_config(CONFIG_PATH)
|
|
||||||
for it in cfg["items"]:
|
|
||||||
self.assertIn("slug", it)
|
|
||||||
self.assertIn("numeric_alias", it)
|
|
||||||
self.assertIn("pr_section_marker", it)
|
|
||||||
self.assertIn("required_teams", it)
|
|
||||||
self.assertIsInstance(it["required_teams"], list)
|
|
||||||
self.assertGreater(len(it["required_teams"]), 0)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Edge case: full integration without team probe (dependency-injected)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestEndToEndAckFlow(unittest.TestCase):
|
|
||||||
"""All-7-items happy path with synthetic comments. Verifies the
|
|
||||||
full pipeline minus the Gitea API."""
|
|
||||||
|
|
||||||
def test_all_seven_acked_by_proper_teams(self):
|
|
||||||
items = _items_by_slug()
|
|
||||||
aliases = _numeric_aliases()
|
|
||||||
comments = [
|
|
||||||
_comment("qa-bot", "/sop-ack comprehensive-testing"),
|
|
||||||
_comment("eng-bot", "/sop-ack local-postgres-e2e"),
|
|
||||||
_comment("eng-bot", "/sop-ack staging-smoke"),
|
|
||||||
_comment("mgr-bot", "/sop-ack root-cause"),
|
|
||||||
_comment("eng-bot", "/sop-ack five-axis-review"),
|
|
||||||
_comment("mgr-bot", "/sop-ack no-backwards-compat"),
|
|
||||||
_comment("eng-bot", "/sop-ack memory-consulted"),
|
|
||||||
]
|
|
||||||
|
|
||||||
def probe(slug, users):
|
|
||||||
# Pretend every user is in every team.
|
|
||||||
return list(users)
|
|
||||||
|
|
||||||
state = sop.compute_ack_state(comments, "alice-author", items, aliases, probe)
|
|
||||||
body = {it["slug"]: True for it in items.values()}
|
|
||||||
items_list = list(items.values())
|
|
||||||
result_state, desc = sop.render_status(items_list, state, body)
|
|
||||||
self.assertEqual(result_state, "success")
|
|
||||||
self.assertIn("7/7", desc)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
unittest.main(verbosity=2)
|
|
||||||
@ -1,101 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Regression test for #229 — sop-tier-check tier:low OR-clause splitter.
|
|
||||||
#
|
|
||||||
# Bug (PR #225 → still broken after PR #231):
|
|
||||||
# Line ~289 of sop-tier-check.sh used:
|
|
||||||
# _clause=$(echo "$_raw_clause" | tr -d '()' | tr ',' '\n' | tr -d '[:space:]' | grep -v '^$')
|
|
||||||
# `tr -d '[:space:]'` strips the newlines that `tr ',' '\n'` just
|
|
||||||
# inserted, collapsing "engineers,managers,ceo" into a single token
|
|
||||||
# "engineersmanagersceo". The for-loop then iterates ONCE on a name
|
|
||||||
# that matches no team, so every tier:low PR fails:
|
|
||||||
# ::error::clause [engineers/managers/ceo]: FAIL — no approving
|
|
||||||
# reviewer belongs to any of these teamsengineersmanagersceo
|
|
||||||
# (note also: missing separators in the error string is bug #2 —
|
|
||||||
# `_clause_names` used "${var:+, }$x" which OVERWRITES per iteration).
|
|
||||||
#
|
|
||||||
# Fix shape (this PR):
|
|
||||||
# _no_parens=${_raw_clause//[()]/}
|
|
||||||
# _clause=${_no_parens//,/ } # comma -> space, bash word-split iterates
|
|
||||||
# _clause_names="${_clause_names}${_clause_names:+, }${_t}" # APPEND, not overwrite
|
|
||||||
#
|
|
||||||
# This test extracts the splitter logic and asserts it produces the right
|
|
||||||
# token list for each of the three tier expressions live in the script.
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
PASS=0
|
|
||||||
FAIL=0
|
|
||||||
|
|
||||||
assert_eq() {
|
|
||||||
local label="$1"
|
|
||||||
local expected="$2"
|
|
||||||
local got="$3"
|
|
||||||
if [ "$expected" = "$got" ]; then
|
|
||||||
echo " PASS $label"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
else
|
|
||||||
echo " FAIL $label"
|
|
||||||
echo " expected: <$expected>"
|
|
||||||
echo " got: <$got>"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# ----- Splitter under test (mirrors the fixed sop-tier-check.sh block) -----
|
|
||||||
split_clause() {
|
|
||||||
local raw="$1"
|
|
||||||
local no_parens=${raw//[()]/}
|
|
||||||
local clause=${no_parens//,/ }
|
|
||||||
local out=""
|
|
||||||
for _t in $clause; do
|
|
||||||
out="${out}${out:+|}$_t"
|
|
||||||
done
|
|
||||||
echo "$out"
|
|
||||||
}
|
|
||||||
|
|
||||||
echo "test: tier:low OR-clause splits to 3 tokens"
|
|
||||||
assert_eq "tier:low" "engineers|managers|ceo" "$(split_clause "engineers,managers,ceo")"
|
|
||||||
|
|
||||||
echo "test: tier:medium AND-expression — bash word-split on \$EXPR yields 5 tokens"
|
|
||||||
EXPR="managers AND engineers AND qa???,security???"
|
|
||||||
out=""
|
|
||||||
for _raw in $EXPR; do
|
|
||||||
out="${out}${out:+ ; }$(split_clause "$_raw")"
|
|
||||||
done
|
|
||||||
assert_eq "tier:medium" "managers ; AND ; engineers ; AND ; qa???|security???" "$out"
|
|
||||||
|
|
||||||
echo "test: tier:high single-team OR-clause"
|
|
||||||
assert_eq "tier:high" "ceo" "$(split_clause "ceo")"
|
|
||||||
|
|
||||||
echo "test: paren-wrapped OR-set unwraps + splits"
|
|
||||||
assert_eq "paren OR" "managers|ceo" "$(split_clause "(managers,ceo)")"
|
|
||||||
|
|
||||||
# ----- _clause_names accumulator (was overwriting per iteration) -----
|
|
||||||
acc=""
|
|
||||||
for t in engineers managers ceo; do
|
|
||||||
acc="${acc}${acc:+, }${t}"
|
|
||||||
done
|
|
||||||
assert_eq "_clause_names append" "engineers, managers, ceo" "$acc"
|
|
||||||
|
|
||||||
# ----- _failed_clauses / _passed_clauses accumulator across raw clauses -----
|
|
||||||
acc=""
|
|
||||||
for c in clauseA clauseB clauseC; do
|
|
||||||
acc="${acc}${acc:+, }${c}"
|
|
||||||
done
|
|
||||||
assert_eq "_failed_clauses append" "clauseA, clauseB, clauseC" "$acc"
|
|
||||||
|
|
||||||
# ----- End-to-end OR-gate: simulate APPROVER_TEAMS[core-lead]=' managers ' -----
|
|
||||||
# The script's case pattern is *${_t}* with a space-padded value.
|
|
||||||
APPROVER_TEAMS_VAL=" managers "
|
|
||||||
matched=""
|
|
||||||
for _t in $(split_clause "engineers,managers,ceo" | tr '|' ' '); do
|
|
||||||
case "$APPROVER_TEAMS_VAL" in
|
|
||||||
*${_t}*) matched="$_t"; break ;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
assert_eq "OR-gate matches managers" "managers" "$matched"
|
|
||||||
|
|
||||||
echo
|
|
||||||
echo "------"
|
|
||||||
echo "PASS=$PASS FAIL=$FAIL"
|
|
||||||
[ "$FAIL" -eq 0 ]
|
|
||||||
@ -1,297 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Tests for sop-tier-refire.{yml,sh} — internal#292.
|
|
||||||
#
|
|
||||||
# Behavior matrix:
|
|
||||||
#
|
|
||||||
# T1: PR open + APPROVED via tier:low → script invokes sop-tier-check
|
|
||||||
# and POSTs status=success.
|
|
||||||
# T2: PR open + missing tier label → sop-tier-check exits non-zero;
|
|
||||||
# refire POSTs status=failure (description mentions failure).
|
|
||||||
# T3: PR open + tier:low but NO approving reviews → sop-tier-check
|
|
||||||
# exits non-zero; refire POSTs status=failure.
|
|
||||||
# T4: PR CLOSED → refire exits 0 with no status POST (no-op on closed).
|
|
||||||
# T5: Rate-limit — recent status update within 30s → refire skips,
|
|
||||||
# no new POST.
|
|
||||||
# T6 (yaml-lint): workflow `if:` expression contains author_association
|
|
||||||
# gate + slash-command-trigger gate + PR-not-issue gate.
|
|
||||||
# T7 (yaml-lint): workflow file is parseable YAML.
|
|
||||||
#
|
|
||||||
# Tests T1-T5 run the real script against a local-fixture HTTP server
|
|
||||||
# (python http.server with a stub handler — `tests/_refire_fixture.py`)
|
|
||||||
# so the script's Gitea API calls hit the fixture, not the real Gitea.
|
|
||||||
#
|
|
||||||
# Tests T6/T7 are pure YAML checks against the workflow file.
|
|
||||||
#
|
|
||||||
# Hostile-self-review (per feedback_assert_exact_not_substring):
|
|
||||||
# this test MUST FAIL if the workflow or script is absent. Verified by
|
|
||||||
# running the test before the files exist (covered in the PR body).
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
THIS_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
||||||
SCRIPT_DIR="$(cd "$THIS_DIR/.." && pwd)"
|
|
||||||
WORKFLOW_DIR="$(cd "$THIS_DIR/../../workflows" && pwd)"
|
|
||||||
WORKFLOW="$WORKFLOW_DIR/sop-tier-refire.yml"
|
|
||||||
SCRIPT="$SCRIPT_DIR/sop-tier-refire.sh"
|
|
||||||
|
|
||||||
PASS=0
|
|
||||||
FAIL=0
|
|
||||||
FAILED_TESTS=""
|
|
||||||
|
|
||||||
assert_eq() {
|
|
||||||
local label="$1"
|
|
||||||
local expected="$2"
|
|
||||||
local got="$3"
|
|
||||||
if [ "$expected" = "$got" ]; then
|
|
||||||
echo " PASS $label"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
else
|
|
||||||
echo " FAIL $label"
|
|
||||||
echo " expected: <$expected>"
|
|
||||||
echo " got: <$got>"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} ${label}"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
assert_contains() {
|
|
||||||
local label="$1"
|
|
||||||
local needle="$2"
|
|
||||||
local haystack="$3"
|
|
||||||
if printf '%s' "$haystack" | grep -qF "$needle"; then
|
|
||||||
echo " PASS $label"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
else
|
|
||||||
echo " FAIL $label"
|
|
||||||
echo " needle: <$needle>"
|
|
||||||
echo " haystack: <$(printf '%s' "$haystack" | head -c 400)>"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} ${label}"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
assert_file_exists() {
|
|
||||||
local label="$1"
|
|
||||||
local path="$2"
|
|
||||||
if [ -f "$path" ]; then
|
|
||||||
echo " PASS $label"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
else
|
|
||||||
echo " FAIL $label (not found: $path)"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} ${label}"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Existence (foundation — every other test depends on these)
|
|
||||||
echo
|
|
||||||
echo "== existence =="
|
|
||||||
assert_file_exists "workflow file exists" "$WORKFLOW"
|
|
||||||
assert_file_exists "script file exists" "$SCRIPT"
|
|
||||||
if [ "$FAIL" -gt 0 ]; then
|
|
||||||
echo
|
|
||||||
echo "------"
|
|
||||||
echo "PASS=$PASS FAIL=$FAIL (existence)"
|
|
||||||
echo "Cannot proceed without these files."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# T6 / T7 — workflow YAML structure
|
|
||||||
echo
|
|
||||||
echo "== T6/T7 workflow yaml =="
|
|
||||||
|
|
||||||
# YAML parseability
|
|
||||||
PARSE_OUT=$(python3 -c 'import sys,yaml;yaml.safe_load(open(sys.argv[1]).read());print("ok")' "$WORKFLOW" 2>&1 || true)
|
|
||||||
assert_eq "T7 workflow parses as YAML" "ok" "$PARSE_OUT"
|
|
||||||
|
|
||||||
# Three required gates in the `if:` expression
|
|
||||||
WORKFLOW_CONTENT=$(cat "$WORKFLOW")
|
|
||||||
assert_contains "T6a workflow if: contains author_association gate" \
|
|
||||||
"github.event.comment.author_association" "$WORKFLOW_CONTENT"
|
|
||||||
assert_contains "T6b workflow if: gates on MEMBER/OWNER/COLLABORATOR" \
|
|
||||||
'["MEMBER","OWNER","COLLABORATOR"]' "$WORKFLOW_CONTENT"
|
|
||||||
assert_contains "T6c workflow if: contains slash-command trigger" \
|
|
||||||
"/refire-tier-check" "$WORKFLOW_CONTENT"
|
|
||||||
assert_contains "T6d workflow if: gates on PR-not-issue" \
|
|
||||||
"github.event.issue.pull_request" "$WORKFLOW_CONTENT"
|
|
||||||
assert_contains "T6e workflow listens on issue_comment" \
|
|
||||||
"issue_comment" "$WORKFLOW_CONTENT"
|
|
||||||
assert_contains "T6f workflow requests statuses:write permission" \
|
|
||||||
"statuses: write" "$WORKFLOW_CONTENT"
|
|
||||||
# Does NOT check out PR HEAD (security)
|
|
||||||
if grep -q 'ref: \${{ github.event.pull_request.head' "$WORKFLOW"; then
|
|
||||||
echo " FAIL T6g workflow MUST NOT check out PR head (security)"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} T6g"
|
|
||||||
else
|
|
||||||
echo " PASS T6g workflow does not check out PR head"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
fi
|
|
||||||
|
|
||||||
# T1-T5 — script behavior against a local Gitea-fixture
|
|
||||||
echo
|
|
||||||
echo "== T1-T5 script behavior (vs local fixture) =="
|
|
||||||
|
|
||||||
# Spin up the fixture HTTP server.
|
|
||||||
FIXTURE_DIR=$(mktemp -d)
|
|
||||||
trap 'rm -rf "$FIXTURE_DIR"; [ -n "${FIX_PID:-}" ] && kill "$FIX_PID" 2>/dev/null || true' EXIT
|
|
||||||
FIXTURE_PY="$THIS_DIR/_refire_fixture.py"
|
|
||||||
if [ ! -f "$FIXTURE_PY" ]; then
|
|
||||||
echo "::error::fixture server $FIXTURE_PY missing"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
FIX_LOG="$FIXTURE_DIR/fixture.log"
|
|
||||||
FIX_STATE_DIR="$FIXTURE_DIR/state"
|
|
||||||
mkdir -p "$FIX_STATE_DIR"
|
|
||||||
|
|
||||||
# Find an unused port.
|
|
||||||
FIX_PORT=$(python3 -c 'import socket;s=socket.socket();s.bind(("127.0.0.1",0));print(s.getsockname()[1]);s.close()')
|
|
||||||
|
|
||||||
FIXTURE_STATE_DIR="$FIX_STATE_DIR" python3 "$FIXTURE_PY" "$FIX_PORT" \
|
|
||||||
>"$FIX_LOG" 2>&1 &
|
|
||||||
FIX_PID=$!
|
|
||||||
|
|
||||||
# Wait for fixture readiness.
|
|
||||||
for _ in $(seq 1 50); do
|
|
||||||
if curl -fsS "http://127.0.0.1:${FIX_PORT}/_ping" >/dev/null 2>&1; then
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
sleep 0.1
|
|
||||||
done
|
|
||||||
if ! curl -fsS "http://127.0.0.1:${FIX_PORT}/_ping" >/dev/null 2>&1; then
|
|
||||||
echo "::error::fixture server failed to start. Log:"
|
|
||||||
cat "$FIX_LOG"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Helper: set fixture state for a scenario, then run the script.
|
|
||||||
# tier_result is one of: pass | fail_no_label | fail_no_approvals.
|
|
||||||
# The refire script's tier-check invocation is mocked because the real
|
|
||||||
# sop-tier-check.sh uses bash 4+ associative arrays — incompatible with
|
|
||||||
# the macOS bash 3.2 dev shell. Linux Gitea runners use bash 4/5 so
|
|
||||||
# production runs the real script. The mock exercises the success +
|
|
||||||
# failure branches of refire's status-POST glue.
|
|
||||||
run_scenario() {
|
|
||||||
local scenario="$1"
|
|
||||||
local tier_result="${2:-pass}"
|
|
||||||
echo "$scenario" >"$FIX_STATE_DIR/scenario"
|
|
||||||
: >"$FIX_STATE_DIR/posted_statuses.jsonl" # clear status log
|
|
||||||
|
|
||||||
local out
|
|
||||||
set +e
|
|
||||||
out=$(
|
|
||||||
PATH="$FIXTURE_DIR/bin:$PATH" \
|
|
||||||
GITEA_TOKEN="fixture-token" \
|
|
||||||
GITEA_HOST="fixture.local" \
|
|
||||||
REPO="molecule-ai/molecule-core" \
|
|
||||||
PR_NUMBER="999" \
|
|
||||||
COMMENT_AUTHOR="test-runner" \
|
|
||||||
SOP_REFIRE_DISABLE_RATE_LIMIT="1" \
|
|
||||||
SOP_REFIRE_TIER_CHECK_SCRIPT="$THIS_DIR/_mock_tier_check.sh" \
|
|
||||||
MOCK_TIER_RESULT="$tier_result" \
|
|
||||||
FIXTURE_PORT="$FIX_PORT" \
|
|
||||||
bash "$SCRIPT" 2>&1
|
|
||||||
)
|
|
||||||
local rc=$?
|
|
||||||
set -e
|
|
||||||
echo "$out" >"$FIX_STATE_DIR/last_run.log"
|
|
||||||
echo "$rc" >"$FIX_STATE_DIR/last_rc"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Install a curl shim that rewrites https://fixture.local → http://127.0.0.1:$PORT
|
|
||||||
# Use bash prefix-strip (${var#prefix}) — it sidesteps the `/` delimiter
|
|
||||||
# confusion of ${var/pattern/replacement}.
|
|
||||||
mkdir -p "$FIXTURE_DIR/bin"
|
|
||||||
cat >"$FIXTURE_DIR/bin/curl" <<SHIM
|
|
||||||
#!/usr/bin/env bash
|
|
||||||
# Test shim: rewrite https://fixture.local/* -> http://127.0.0.1:${FIX_PORT}/*
|
|
||||||
# The fixture doesn't authenticate; -H Authorization passes through harmlessly.
|
|
||||||
new_args=()
|
|
||||||
for a in "\$@"; do
|
|
||||||
if [[ "\$a" == https://fixture.local/* ]]; then
|
|
||||||
rest="\${a#https://fixture.local}"
|
|
||||||
a="http://127.0.0.1:${FIX_PORT}\${rest}"
|
|
||||||
fi
|
|
||||||
new_args+=("\$a")
|
|
||||||
done
|
|
||||||
exec /usr/bin/curl "\${new_args[@]}"
|
|
||||||
SHIM
|
|
||||||
chmod +x "$FIXTURE_DIR/bin/curl"
|
|
||||||
|
|
||||||
# T1: tier:low + 1 APPROVED + author is in engineers team → success
|
|
||||||
run_scenario "T1_success" "pass"
|
|
||||||
RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
|
|
||||||
assert_eq "T1 exit code 0 (success)" "0" "$RC"
|
|
||||||
assert_contains "T1 POSTed state=success" '"state": "success"' "$POSTED"
|
|
||||||
assert_contains "T1 POST context is sop-tier-check / tier-check" \
|
|
||||||
'"context": "sop-tier-check / tier-check (pull_request)"' "$POSTED"
|
|
||||||
assert_contains "T1 description names commenter" "test-runner" "$POSTED"
|
|
||||||
|
|
||||||
# T2: missing tier label → tier-check fails → failure status POSTed
|
|
||||||
run_scenario "T2_no_tier_label" "fail_no_label"
|
|
||||||
RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
|
|
||||||
# tier-check.sh exits 1; refire script forwards that exit, so RC != 0
|
|
||||||
if [ "$RC" -ne 0 ]; then
|
|
||||||
echo " PASS T2 exit code non-zero (got $RC)"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
else
|
|
||||||
echo " FAIL T2 exit code should be non-zero, got 0"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} T2_rc"
|
|
||||||
fi
|
|
||||||
assert_contains "T2 POSTed state=failure" '"state": "failure"' "$POSTED"
|
|
||||||
|
|
||||||
# T3: tier:low present but ZERO approving reviews → failure
|
|
||||||
run_scenario "T3_no_approvals" "fail_no_approvals"
|
|
||||||
RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
|
|
||||||
if [ "$RC" -ne 0 ]; then
|
|
||||||
echo " PASS T3 exit code non-zero (got $RC)"
|
|
||||||
PASS=$((PASS + 1))
|
|
||||||
else
|
|
||||||
echo " FAIL T3 exit code should be non-zero, got 0"
|
|
||||||
FAIL=$((FAIL + 1))
|
|
||||||
FAILED_TESTS="${FAILED_TESTS} T3_rc"
|
|
||||||
fi
|
|
||||||
assert_contains "T3 POSTed state=failure" '"state": "failure"' "$POSTED"
|
|
||||||
|
|
||||||
# T4: closed PR — refire is a no-op (no POST, exit 0)
|
|
||||||
run_scenario "T4_closed" "pass"
|
|
||||||
RC=$(cat "$FIX_STATE_DIR/last_rc")
|
|
||||||
POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
|
|
||||||
assert_eq "T4 closed PR exits 0" "0" "$RC"
|
|
||||||
assert_eq "T4 closed PR posts no status" "" "$POSTED"
|
|
||||||
|
|
||||||
# T5: rate-limit — disable the env override and let scenario set a
|
|
||||||
# recent statuses entry. Re-enable rate-limit for this scenario by NOT
|
|
||||||
# passing SOP_REFIRE_DISABLE_RATE_LIMIT.
|
|
||||||
echo "T5_rate_limited" >"$FIX_STATE_DIR/scenario"
|
|
||||||
: >"$FIX_STATE_DIR/posted_statuses.jsonl"
|
|
||||||
set +e
|
|
||||||
T5_OUT=$(
|
|
||||||
PATH="$FIXTURE_DIR/bin:$PATH" \
|
|
||||||
GITEA_TOKEN="fixture-token" \
|
|
||||||
GITEA_HOST="fixture.local" \
|
|
||||||
REPO="molecule-ai/molecule-core" \
|
|
||||||
PR_NUMBER="999" \
|
|
||||||
COMMENT_AUTHOR="test-runner" \
|
|
||||||
FIXTURE_PORT="$FIX_PORT" \
|
|
||||||
bash "$SCRIPT" 2>&1
|
|
||||||
)
|
|
||||||
T5_RC=$?
|
|
||||||
set -e
|
|
||||||
POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
|
|
||||||
assert_eq "T5 rate-limited exits 0" "0" "$T5_RC"
|
|
||||||
assert_contains "T5 rate-limited log says skipped" "rate-limited" "$T5_OUT"
|
|
||||||
assert_eq "T5 rate-limited posts no status" "" "$POSTED"
|
|
||||||
|
|
||||||
echo
|
|
||||||
echo "------"
|
|
||||||
echo "PASS=$PASS FAIL=$FAIL"
|
|
||||||
if [ "$FAIL" -gt 0 ]; then
|
|
||||||
echo "Failed:$FAILED_TESTS"
|
|
||||||
fi
|
|
||||||
[ "$FAIL" -eq 0 ]
|
|
||||||
@ -1,109 +0,0 @@
|
|||||||
# SOP-Checklist gate — per-item required reviewer teams.
|
|
||||||
#
|
|
||||||
# RFC#351 v1 starter set. Each item lists:
|
|
||||||
# slug — canonical kebab-case form used in /sop-ack <slug>
|
|
||||||
# pr_section_marker — substring matched in the PR body to detect that
|
|
||||||
# the author filled in this item (case-insensitive)
|
|
||||||
# required_teams — list of Gitea team names; an ack from ANY one of
|
|
||||||
# these teams (logical OR) satisfies the item.
|
|
||||||
# Membership is probed at gate-time via
|
|
||||||
# GET /api/v1/teams/{id}/members/{login}.
|
|
||||||
# Team-id resolution happens at script start via
|
|
||||||
# GET /api/v1/orgs/{org}/teams (cheap, one call).
|
|
||||||
# numeric_alias — 1..7; lets reviewers type `/sop-ack 3` as a
|
|
||||||
# shortcut for `/sop-ack staging-smoke`.
|
|
||||||
#
|
|
||||||
# WHY THESE TEAM MAPPINGS:
|
|
||||||
# The RFC table referenced persona-role names like `core-qa`,
|
|
||||||
# `core-be`, `core-devops` — these are individual Gitea user logins,
|
|
||||||
# not teams. The Gitea team-membership API is /teams/{id}/members/{u},
|
|
||||||
# so we need actual teams. Orchestrator preflight 2026-05-12 verified
|
|
||||||
# only these teams exist on molecule-ai: ceo(5), engineers(2),
|
|
||||||
# managers(6), qa(20), security(21), Owners(1), and bot teams. We
|
|
||||||
# map the RFC roles to the closest existing team and surface the
|
|
||||||
# mapping explicitly so it's reviewable.
|
|
||||||
#
|
|
||||||
# HOW TO EDIT:
|
|
||||||
# - Tightening: replace `engineers` with a smaller team after creating
|
|
||||||
# it (e.g. a new `senior-engineers` team if needed).
|
|
||||||
# - Loosening: add another team to required_teams (OR semantics).
|
|
||||||
# - Add an item: append to items list and document the slug below.
|
|
||||||
#
|
|
||||||
# AUTHOR SELF-ACK IS FORBIDDEN regardless of which team contains them
|
|
||||||
# — the gate script enforces commenter != PR author before checking
|
|
||||||
# team membership.
|
|
||||||
|
|
||||||
version: 1
|
|
||||||
|
|
||||||
# Tier-aware failure mode (RFC#351 open question 2):
|
|
||||||
# For tier:high — hard-fail (status `failure`, blocks merge via BP).
|
|
||||||
# For tier:medium — hard-fail (same as high; medium is non-trivial).
|
|
||||||
# For tier:low — soft-fail (status `pending` with `acked: N/M` in the
|
|
||||||
# description). BP can choose to require the context
|
|
||||||
# or not for low-tier PRs.
|
|
||||||
# If no tier label is present, default to medium (hard-fail) — every PR
|
|
||||||
# should have a tier label per sop-tier-check, and absence indicates
|
|
||||||
# a missing-tier defect we should surface, not silently lower the bar.
|
|
||||||
tier_failure_mode:
|
|
||||||
"tier:high": hard
|
|
||||||
"tier:medium": hard
|
|
||||||
"tier:low": soft
|
|
||||||
default_mode: hard # used when no tier:* label is present
|
|
||||||
|
|
||||||
items:
|
|
||||||
- slug: comprehensive-testing
|
|
||||||
numeric_alias: 1
|
|
||||||
pr_section_marker: "Comprehensive testing performed"
|
|
||||||
required_teams: [qa, engineers]
|
|
||||||
description: >-
|
|
||||||
What was tested, how, edge cases covered. Ack from any qa-team
|
|
||||||
member (or engineers fallback while qa is small).
|
|
||||||
|
|
||||||
- slug: local-postgres-e2e
|
|
||||||
numeric_alias: 2
|
|
||||||
pr_section_marker: "Local-postgres E2E run"
|
|
||||||
required_teams: [engineers]
|
|
||||||
description: >-
|
|
||||||
Link to local CI artifact, or "N/A: pure-frontend change". Ack
|
|
||||||
from any engineer who can verify the local DB test actually ran.
|
|
||||||
|
|
||||||
- slug: staging-smoke
|
|
||||||
numeric_alias: 3
|
|
||||||
pr_section_marker: "Staging-smoke verified or pending"
|
|
||||||
required_teams: [engineers]
|
|
||||||
description: >-
|
|
||||||
Link to canary run, or "scheduled post-merge". Ack from any
|
|
||||||
engineer (core-devops/infra-sre are members of engineers team).
|
|
||||||
|
|
||||||
- slug: root-cause
|
|
||||||
numeric_alias: 4
|
|
||||||
pr_section_marker: "Root-cause not symptom"
|
|
||||||
required_teams: [managers, ceo]
|
|
||||||
description: >-
|
|
||||||
One-sentence root-cause statement. Ack from managers tier
|
|
||||||
(team-leads) or ceo. Senior judgment required to attest
|
|
||||||
root-cause-versus-symptom.
|
|
||||||
|
|
||||||
- slug: five-axis-review
|
|
||||||
numeric_alias: 5
|
|
||||||
pr_section_marker: "Five-Axis review walked"
|
|
||||||
required_teams: [engineers]
|
|
||||||
description: >-
|
|
||||||
Correctness / readability / architecture / security / performance.
|
|
||||||
Ack from any non-author engineer.
|
|
||||||
|
|
||||||
- slug: no-backwards-compat
|
|
||||||
numeric_alias: 6
|
|
||||||
pr_section_marker: "No backwards-compat shim / dead code added"
|
|
||||||
required_teams: [managers, ceo]
|
|
||||||
description: >-
|
|
||||||
Yes/no + justification if no. Senior ack required because
|
|
||||||
backward-compat shims are how dead-code accretes.
|
|
||||||
|
|
||||||
- slug: memory-consulted
|
|
||||||
numeric_alias: 7
|
|
||||||
pr_section_marker: "Memory/saved-feedback consulted"
|
|
||||||
required_teams: [engineers]
|
|
||||||
description: >-
|
|
||||||
List of feedback memories applicable to this change. Ack from
|
|
||||||
any engineer who has the same memory access.
|
|
||||||
@ -1,89 +0,0 @@
|
|||||||
# audit-force-merge — emit `incident.force_merge` to the runner log when
|
|
||||||
# a PR is merged with required-status checks NOT all green. Vector picks
|
|
||||||
# the JSON line off docker_logs and ships to Loki on
|
|
||||||
# molecule-canonical-obs (per `reference_obs_stack_phase1`); query as:
|
|
||||||
#
|
|
||||||
# {host="operator"} |= "event_type" |= "incident.force_merge" | json
|
|
||||||
#
|
|
||||||
# Companion to `audit-force-merge.sh` (script-extract pattern, same as
|
|
||||||
# sop-tier-check). The audit observes BOTH UI-merged and REST-merged PRs
|
|
||||||
# uniformly per `feedback_gh_cli_merge_lies_use_rest`.
|
|
||||||
#
|
|
||||||
# Closes the §SOP-6 audit gap for the molecule-core repo. RFC:
|
|
||||||
# internal#219 §6. Mirrors the same-named workflow in
|
|
||||||
# molecule-controlplane; design rationale lives in the RFC, not here,
|
|
||||||
# to keep the workflow file scannable.
|
|
||||||
|
|
||||||
name: audit-force-merge
|
|
||||||
|
|
||||||
# pull_request_target loads from the base branch — same security model
|
|
||||||
# as sop-tier-check. Without this, a PR author could rewrite the
|
|
||||||
# workflow on their own PR and skip the audit emission for their own
|
|
||||||
# force-merge. The base-branch checkout below ALSO uses
|
|
||||||
# `base.sha`, not `base.ref`, so a fast-moving base can't slip a
|
|
||||||
# different audit script in under us.
|
|
||||||
on:
|
|
||||||
pull_request_target:
|
|
||||||
types: [closed]
|
|
||||||
|
|
||||||
# `pull-requests: read` + `contents: read` covers everything the script
|
|
||||||
# needs (fetch PR + commit statuses). `issues:` deliberately omitted —
|
|
||||||
# audit fires-and-forgets to stdout, never opens issues.
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
pull-requests: read
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
audit:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Skip when PR is closed without merge — saves a runner.
|
|
||||||
if: github.event.pull_request.merged == true
|
|
||||||
steps:
|
|
||||||
- name: Check out base branch (for the script)
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
# base.sha pinning, NOT base.ref — see header rationale.
|
|
||||||
ref: ${{ github.event.pull_request.base.sha }}
|
|
||||||
- name: Detect force-merge + emit audit event
|
|
||||||
env:
|
|
||||||
# Same org-level secret the sop-tier-check workflow uses;
|
|
||||||
# falls back to the auto-injected GITHUB_TOKEN if the
|
|
||||||
# org-level SOP_TIER_CHECK_TOKEN isn't set on a transitional
|
|
||||||
# repo.
|
|
||||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
|
||||||
GITEA_HOST: git.moleculesai.app
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
|
||||||
# Required-status-check contexts to evaluate at merge time.
|
|
||||||
# Newline-separated. MUST mirror branch protection's
|
|
||||||
# status_check_contexts for protected branches
|
|
||||||
# (currently `main`; `staging` protection forthcoming per
|
|
||||||
# RFC internal#219 Phase 4).
|
|
||||||
#
|
|
||||||
# Initialized 2026-05-11 from the current molecule-core `main`
|
|
||||||
# branch protection:
|
|
||||||
#
|
|
||||||
# GET /api/v1/repos/molecule-ai/molecule-core/
|
|
||||||
# branch_protections/main
|
|
||||||
# → status_check_contexts = [
|
|
||||||
# "Secret scan / Scan diff for credential-shaped strings (pull_request)",
|
|
||||||
# "sop-tier-check / tier-check (pull_request)"
|
|
||||||
# ]
|
|
||||||
#
|
|
||||||
# Declared here rather than fetched from /branch_protections
|
|
||||||
# because that endpoint requires admin write — sop-tier-bot
|
|
||||||
# is read-only by design (least-privilege per
|
|
||||||
# `feedback_least_privilege_via_workflow_env` / internal#257).
|
|
||||||
# Drift between this env and the real protection list is
|
|
||||||
# auto-detected by `ci-required-drift.yml` (RFC §4 + §6),
|
|
||||||
# which opens a `[ci-drift]` issue within one hour.
|
|
||||||
#
|
|
||||||
# When the protection set changes (e.g. Phase 4 adds the
|
|
||||||
# `ci / all-required (pull_request)` sentinel), update BOTH
|
|
||||||
# branch protection AND this env in the SAME PR; drift-detect
|
|
||||||
# will otherwise file an issue for you.
|
|
||||||
REQUIRED_CHECKS: |
|
|
||||||
Secret scan / Scan diff for credential-shaped strings (pull_request)
|
|
||||||
sop-tier-check / tier-check (pull_request)
|
|
||||||
CI / all-required (pull_request)
|
|
||||||
run: bash .gitea/scripts/audit-force-merge.sh
|
|
||||||
@ -1,148 +0,0 @@
|
|||||||
name: Block internal-flavored paths
|
|
||||||
|
|
||||||
# Ported from .github/workflows/block-internal-paths.yml on 2026-05-11 per
|
|
||||||
# RFC internal#219 §1 sweep.
|
|
||||||
#
|
|
||||||
# Differences from the GitHub version:
|
|
||||||
# - Dropped `merge_group: { types: [checks_requested] }` (Gitea has no
|
|
||||||
# merge queue; no `gh-readonly-queue/...` refs).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL set per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on the job (RFC §1 contract — surface
|
|
||||||
# defects without blocking; follow-up PR flips after triage).
|
|
||||||
#
|
|
||||||
# Hard CI gate. Internal content (positioning, competitive briefs, sales
|
|
||||||
# playbooks, PMM/press drip, draft campaigns) lives in molecule-ai/internal —
|
|
||||||
# this public monorepo must never re-acquire those paths. CEO directive
|
|
||||||
# 2026-04-23 after a fleet-wide audit found 79 internal files leaked here.
|
|
||||||
#
|
|
||||||
# Failure mode without this gate: agents (PMM, Research, DevRel, Sales) drop
|
|
||||||
# briefs into the easiest path their cwd resolves to (root /research,
|
|
||||||
# /marketing, /docs/marketing) and gitignore alone won't catch a `git add -f`
|
|
||||||
# or a stale gitignore line. This workflow is the mechanical backstop.
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
check:
|
|
||||||
name: Block forbidden paths
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
|
|
||||||
# the PR. Follow-up PR flips this off after surfaced defects are
|
|
||||||
# triaged.
|
|
||||||
continue-on-error: true
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
fetch-depth: 2 # need previous commit to diff against on push events
|
|
||||||
|
|
||||||
# For pull_request events the diff base is github.event.pull_request.base.sha,
|
|
||||||
# which may be many commits behind HEAD and therefore absent from the
|
|
||||||
# shallow clone above. Fetch it explicitly (depth=1 keeps it fast).
|
|
||||||
- name: Fetch PR base SHA (pull_request events only)
|
|
||||||
if: github.event_name == 'pull_request'
|
|
||||||
run: git fetch --depth=1 origin ${{ github.event.pull_request.base.sha }}
|
|
||||||
|
|
||||||
- name: Refuse if forbidden paths appear
|
|
||||||
env:
|
|
||||||
# Plumb event-specific SHAs through env so the script doesn't
|
|
||||||
# need conditional `${{ ... }}` interpolation per event type.
|
|
||||||
# github.event.before/after only exist on push events;
|
|
||||||
# pull_request has pull_request.base.sha / pull_request.head.sha.
|
|
||||||
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
|
||||||
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
|
|
||||||
PUSH_BEFORE: ${{ github.event.before }}
|
|
||||||
PUSH_AFTER: ${{ github.event.after }}
|
|
||||||
run: |
|
|
||||||
# Paths that must NEVER live in the public monorepo. Add to this
|
|
||||||
# list narrowly — broader patterns belong in .gitignore so day-to-day
|
|
||||||
# docs work isn't accidentally blocked.
|
|
||||||
FORBIDDEN_PATTERNS=(
|
|
||||||
"^research/"
|
|
||||||
"^marketing/"
|
|
||||||
"^docs/marketing/"
|
|
||||||
"^comment-[0-9]+\.json$"
|
|
||||||
"^test-pmm.*\.(txt|md)$"
|
|
||||||
"^tick-reflections.*\.(txt|md)$"
|
|
||||||
".*-temp\.(md|txt)$"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Determine the diff base. Each event type stores its SHAs in
|
|
||||||
# a different place — see the env block above.
|
|
||||||
case "${{ github.event_name }}" in
|
|
||||||
pull_request)
|
|
||||||
BASE="$PR_BASE_SHA"
|
|
||||||
HEAD="$PR_HEAD_SHA"
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
BASE="$PUSH_BEFORE"
|
|
||||||
HEAD="$PUSH_AFTER"
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
# On push events with shallow clones, BASE may be present in
|
|
||||||
# the event payload but absent from the local object DB
|
|
||||||
# (fetch-depth=2 doesn't always reach the previous commit
|
|
||||||
# across true merges). Try fetching it on demand. If the
|
|
||||||
# fetch fails — e.g. the SHA was force-overwritten — we fall
|
|
||||||
# through to the empty-BASE branch below, which scans the
|
|
||||||
# entire tree as if every file were new. Correct, just slow.
|
|
||||||
if [ -n "$BASE" ] && ! echo "$BASE" | grep -qE '^0+$'; then
|
|
||||||
if ! git cat-file -e "$BASE" 2>/dev/null; then
|
|
||||||
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Files added or modified in this change.
|
|
||||||
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$' || ! git cat-file -e "$BASE" 2>/dev/null; then
|
|
||||||
# New branch / no previous SHA / BASE unreachable — check
|
|
||||||
# the entire tree as if every file were new. Slower but
|
|
||||||
# correct on first push or post-fetch-failure recovery.
|
|
||||||
CHANGED=$(git ls-tree -r --name-only HEAD)
|
|
||||||
else
|
|
||||||
CHANGED=$(git diff --name-only --diff-filter=AM "$BASE" "$HEAD")
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -z "$CHANGED" ]; then
|
|
||||||
echo "No changed files to inspect."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
OFFENDING=""
|
|
||||||
for path in $CHANGED; do
|
|
||||||
for pattern in "${FORBIDDEN_PATTERNS[@]}"; do
|
|
||||||
if echo "$path" | grep -qE "$pattern"; then
|
|
||||||
OFFENDING="${OFFENDING}${path} (matched: ${pattern})\n"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
done
|
|
||||||
|
|
||||||
if [ -n "$OFFENDING" ]; then
|
|
||||||
echo "::error::Forbidden internal-flavored paths detected:"
|
|
||||||
printf "$OFFENDING"
|
|
||||||
echo ""
|
|
||||||
echo "These paths belong in molecule-ai/internal, not this public repo."
|
|
||||||
echo "See docs/internal-content-policy.md for canonical locations."
|
|
||||||
echo ""
|
|
||||||
echo "If your file is genuinely public-facing (e.g. a blog post"
|
|
||||||
echo "ready to ship), use one of these alternatives instead:"
|
|
||||||
echo " - Public-bound blog posts: docs/blog/<slug>.md"
|
|
||||||
echo " - Public-bound tutorials: docs/tutorials/<slug>.md"
|
|
||||||
echo " - Public devrel content: docs/devrel/<slug>.md"
|
|
||||||
echo ""
|
|
||||||
echo "If you legitimately need to add a new top-level path that"
|
|
||||||
echo "happens to match a forbidden pattern, edit"
|
|
||||||
echo ".gitea/workflows/block-internal-paths.yml and update the"
|
|
||||||
echo "FORBIDDEN_PATTERNS list with reviewer signoff."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "OK No forbidden paths in this change."
|
|
||||||
@ -1,58 +0,0 @@
|
|||||||
name: cascade-list-drift-gate
|
|
||||||
|
|
||||||
# Ported from .github/workflows/cascade-list-drift-gate.yml on 2026-05-11
|
|
||||||
# per RFC internal#219 §1 sweep.
|
|
||||||
#
|
|
||||||
# Differences from the GitHub version:
|
|
||||||
# - on.paths reference .gitea/workflows/publish-runtime.yml (the active
|
|
||||||
# Gitea workflow file) instead of .github/workflows/publish-runtime.yml
|
|
||||||
# (which Category A of this sweep deletes).
|
|
||||||
# - Explicit `WORKFLOW=` arg passed to the drift script so it audits the
|
|
||||||
# .gitea/ workflow (the script's default is still .github/... which
|
|
||||||
# will not exist post-Cat-A).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL set per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on the job (RFC §1 contract — surface
|
|
||||||
# defects without blocking; follow-up PR flips after triage).
|
|
||||||
#
|
|
||||||
# Structural gate: TEMPLATES list in publish-runtime.yml must match
|
|
||||||
# manifest.json's workspace_templates exactly. Closes the recurrence
|
|
||||||
# path of PR #2556 (the data fix) and is the first concrete deliverable
|
|
||||||
# of RFC #388 PR-3.
|
|
||||||
#
|
|
||||||
# Triggers narrowly to keep CI quiet: only on PRs that actually change
|
|
||||||
# one of the two files. The path-filtered split + always-emit-result
|
|
||||||
# pattern (memory: "Required check names need a job that always runs")
|
|
||||||
# is unnecessary here because the workflow IS the check name and PR
|
|
||||||
# branch protection should require it directly. Future-proof: if this
|
|
||||||
# becomes a required check, add a no-op aggregator with always() so the
|
|
||||||
# name still emits when paths don't match.
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
branches: [staging, main]
|
|
||||||
paths:
|
|
||||||
- manifest.json
|
|
||||||
- .gitea/workflows/publish-runtime.yml
|
|
||||||
- scripts/check-cascade-list-vs-manifest.sh
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
check:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
|
|
||||||
# the PR. Follow-up PR flips this off after surfaced defects are
|
|
||||||
# triaged.
|
|
||||||
continue-on-error: true
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
|
||||||
- name: Check cascade list matches manifest
|
|
||||||
# Pass the .gitea/ workflow path explicitly — the script's
|
|
||||||
# default still points at .github/... which Category A of this
|
|
||||||
# sweep removes.
|
|
||||||
run: bash scripts/check-cascade-list-vs-manifest.sh manifest.json .gitea/workflows/publish-runtime.yml
|
|
||||||
@ -1,74 +0,0 @@
|
|||||||
name: Check migration collisions
|
|
||||||
|
|
||||||
# Ported from .github/workflows/check-migration-collisions.yml on 2026-05-11
|
|
||||||
# per RFC internal#219 §1 sweep.
|
|
||||||
#
|
|
||||||
# Differences from the GitHub version:
|
|
||||||
# - on.paths includes .gitea/workflows/check-migration-collisions.yml
|
|
||||||
# (this file) instead of the .github/ one.
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned to https://git.moleculesai.app
|
|
||||||
# so scripts/ops/check_migration_collisions.py can derive the Gitea API
|
|
||||||
# base (the script already supports this; see _gitea_api_url()).
|
|
||||||
# - `continue-on-error: true` on the job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
# Hard gate (#2341): fails a PR that adds a migration prefix already
|
|
||||||
# claimed by the base branch or another open PR. Caught manually 2026-04-30
|
|
||||||
# during PR #2276 rebase: 044_runtime_image_pins collided with
|
|
||||||
# 044_platform_inbound_secret from RFC #2312. This workflow makes that
|
|
||||||
# check automatic.
|
|
||||||
#
|
|
||||||
# Trigger model: pull_request only — there's no value running this on
|
|
||||||
# pushes to staging or main (those are post-merge; the gate must fire
|
|
||||||
# pre-merge to be useful). Path filter scopes to PRs that actually touch
|
|
||||||
# migrations.
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
paths:
|
|
||||||
- 'workspace-server/migrations/**'
|
|
||||||
- 'scripts/ops/check_migration_collisions.py'
|
|
||||||
- '.gitea/workflows/check-migration-collisions.yml'
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
# API needs read access to other PRs to detect cross-PR collisions
|
|
||||||
pull-requests: read
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
check:
|
|
||||||
name: Migration version collision check
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
|
|
||||||
# the PR. Follow-up PR flips this off after surfaced defects are
|
|
||||||
# triaged.
|
|
||||||
continue-on-error: true
|
|
||||||
timeout-minutes: 5
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
# Need history to diff against base ref
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Detect collisions
|
|
||||||
env:
|
|
||||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
|
||||||
BASE_REF: origin/${{ github.event.pull_request.base.ref }}
|
|
||||||
HEAD_REF: ${{ github.event.pull_request.head.sha }}
|
|
||||||
GITHUB_REPOSITORY: ${{ github.repository }}
|
|
||||||
# Auto-injected; Gitea aliases this for in-repo API access.
|
|
||||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
run: |
|
|
||||||
# Ensure the named base ref exists locally. checkout@v4 with
|
|
||||||
# fetch-depth=0 pulls full history, but the explicit fetch is
|
|
||||||
# cheap insurance against form-of-ref differences across runs.
|
|
||||||
#
|
|
||||||
# IMPORTANT: do NOT pass --depth=1 here. The script below uses
|
|
||||||
# `git diff origin/<base>...<head>` (three-dot, merge-base form),
|
|
||||||
# which fails with "fatal: no merge base" if the base ref is
|
|
||||||
# shallow.
|
|
||||||
git fetch origin "${{ github.event.pull_request.base.ref }}" || true
|
|
||||||
python3 scripts/ops/check_migration_collisions.py
|
|
||||||
@ -1,112 +0,0 @@
|
|||||||
# ci-required-drift — hourly sentinel for drift between the canonical
|
|
||||||
# "what counts as required" sources of truth in this repo:
|
|
||||||
#
|
|
||||||
# 1. `.gitea/workflows/ci.yml` jobs (CI source)
|
|
||||||
# 2. `branch_protections/{main,staging}.status_check_contexts`
|
|
||||||
# (protection)
|
|
||||||
# 3. `.gitea/workflows/audit-force-merge.yml` REQUIRED_CHECKS env
|
|
||||||
# (audit env)
|
|
||||||
#
|
|
||||||
# RFC: internal#219 §4 (jobs ↔ protection) + §6 (audit env ↔ protection).
|
|
||||||
# Ported verbatim-then-adapted from molecule-controlplane PR#112
|
|
||||||
# (SHA 0adf2098) per RFC internal#219 Phase 2b+c — replicate repo-by-repo.
|
|
||||||
#
|
|
||||||
# When any pair diverges, a `[ci-drift]` issue is opened or updated
|
|
||||||
# (idempotent by title) and labelled `tier:high`. This is the
|
|
||||||
# auto-detection that closes the regression class identified in
|
|
||||||
# RFC §1 finding 3 (protection only listed 2 of 6 real jobs for
|
|
||||||
# ~weeks, undetected) and §6 (audit env drifts silently from
|
|
||||||
# protection).
|
|
||||||
#
|
|
||||||
# Diff logic lives in `.gitea/scripts/ci-required-drift.py`. The
|
|
||||||
# Python file does YAML AST parsing + `needs:` graph walking per
|
|
||||||
# `feedback_behavior_based_ast_gates` — NOT grep-by-name. That way
|
|
||||||
# job renames or matrix-expansion-induced churn produce honest signal.
|
|
||||||
#
|
|
||||||
# NOTE on protection endpoint scope: `GET /repos/.../branch_protections/{branch}`
|
|
||||||
# requires repo-admin role in Gitea 1.22.6. If DRIFT_BOT_TOKEN lacks it,
|
|
||||||
# the script skips that branch with a clear ::error:: diagnostic and exits 0
|
|
||||||
# (the issue IS the alarm, not a red workflow). See provisioning trail in
|
|
||||||
# the run step's GITEA_TOKEN env comment.
|
|
||||||
|
|
||||||
name: ci-required-drift
|
|
||||||
|
|
||||||
# IMPORTANT — Gitea 1.22.6 parser quirk per
|
|
||||||
# `feedback_gitea_workflow_dispatch_inputs_unsupported`: do NOT add an
|
|
||||||
# `inputs:` block here, even though stock GitHub Actions allows it.
|
|
||||||
# Gitea 1.22.6 flattens `workflow_dispatch.inputs.X` into a sibling of
|
|
||||||
# the `on:` event keys and rejects the entire workflow as
|
|
||||||
# "unknown on type". The whole file then registers for ZERO events
|
|
||||||
# (no schedule, no dispatch). When Gitea ≥ 1.23 lands fleet-wide,
|
|
||||||
# this constraint can be revisited.
|
|
||||||
on:
|
|
||||||
schedule:
|
|
||||||
# Hourly at :17 — offset from :00 to spread load away from the
|
|
||||||
# peak when N cron workflows fire on the hour-boundary, per
|
|
||||||
# RFC §4 cadence ("off-zero").
|
|
||||||
- cron: '17 * * * *'
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
# Read protection + read CI YAML + write issue. No write on contents.
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
issues: write
|
|
||||||
|
|
||||||
# Serialise — two simultaneous drift runs would duel on the issue
|
|
||||||
# create/update path. The audit is idempotent, but parallel POSTs
|
|
||||||
# can produce duplicate comments before the title-search dedup wins.
|
|
||||||
concurrency:
|
|
||||||
group: ci-required-drift
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
drift:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
timeout-minutes: 5
|
|
||||||
steps:
|
|
||||||
- name: Check out repo (we read the YAML files locally)
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
- name: Set up Python (PyYAML for AST parsing)
|
|
||||||
# Avoid a system-pip install on the runner; setup-python pins
|
|
||||||
# a hermetic interpreter + cache. PyYAML is small enough that
|
|
||||||
# the install is sub-2s — no need to cache wheels.
|
|
||||||
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
||||||
with:
|
|
||||||
python-version: '3.12'
|
|
||||||
- name: Install PyYAML
|
|
||||||
run: python -m pip install --quiet 'PyYAML==6.0.2'
|
|
||||||
- name: Run drift detector
|
|
||||||
env:
|
|
||||||
# DRIFT_BOT_TOKEN is owned by mc-drift-bot, a least-privilege
|
|
||||||
# Gitea persona whose ONLY job is reading branch_protections
|
|
||||||
# and posting the [ci-drift] tracking issue. The endpoint
|
|
||||||
# `GET /repos/.../branch_protections/{branch}` requires
|
|
||||||
# repo-ADMIN role (Gitea 1.22.6) — SOP_TIER_CHECK_TOKEN and the
|
|
||||||
# auto-injected GITHUB_TOKEN do NOT have it (read-only / write
|
|
||||||
# without admin), so the previous fallback chain 403'd.
|
|
||||||
# Mirrors the controlplane fix landed in CP PR#134.
|
|
||||||
# Provisioning trail: internal#329 (audit) + parent pattern
|
|
||||||
# internal#327 (publish-runtime-bot). Per
|
|
||||||
# `feedback_per_agent_gitea_identity_default`.
|
|
||||||
GITEA_TOKEN: ${{ secrets.DRIFT_BOT_TOKEN }}
|
|
||||||
GITEA_HOST: git.moleculesai.app
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
# Branches whose protection we compare against. molecule-core
|
|
||||||
# currently has main protected; staging protection is
|
|
||||||
# forthcoming. Keep this list in sync if a new long-lived
|
|
||||||
# branch gets protected (e.g. release/* if introduced later).
|
|
||||||
BRANCHES: 'main staging'
|
|
||||||
# The sentinel job's name inside ci.yml. If the aggregator
|
|
||||||
# is ever renamed, update this too (the drift detector
|
|
||||||
# currently treats `all-required` as the source of "what
|
|
||||||
# the sentinel claims to require").
|
|
||||||
SENTINEL_JOB: 'all-required'
|
|
||||||
# Path to the audit workflow whose REQUIRED_CHECKS env we
|
|
||||||
# cross-check against protection (RFC §6).
|
|
||||||
AUDIT_WORKFLOW_PATH: '.gitea/workflows/audit-force-merge.yml'
|
|
||||||
# Path to the CI workflow with the sentinel + the jobs.
|
|
||||||
CI_WORKFLOW_PATH: '.gitea/workflows/ci.yml'
|
|
||||||
# Issue label applied on file/update. `tier:high` exists in
|
|
||||||
# the molecule-core label set (verified 2026-05-11, label id 9).
|
|
||||||
DRIFT_LABEL: 'tier:high'
|
|
||||||
run: python3 .gitea/scripts/ci-required-drift.py
|
|
||||||
@ -1,580 +0,0 @@
|
|||||||
# Ported from .github/workflows/ci.yml on 2026-05-11 per RFC internal#219 §1.
|
|
||||||
# continue-on-error: true on every job; follow-up PR will flip required after
|
|
||||||
# surfaced bugs are fixed (per RFC §1 — "surface broken workflows without
|
|
||||||
# blocking"). The four-surface migration audit
|
|
||||||
# (feedback_gitea_actions_migration_audit_pattern) was performed against this
|
|
||||||
# port:
|
|
||||||
#
|
|
||||||
# 1. YAML — dropped `merge_group` trigger (no Gitea merge queue); no
|
|
||||||
# `workflow_dispatch.inputs` to drop (Gitea 1.22.6 rejects those —
|
|
||||||
# feedback_gitea_workflow_dispatch_inputs_unsupported); no `environment:`
|
|
||||||
# blocks; kept `runs-on: ubuntu-latest` (Gitea runner pool advertises
|
|
||||||
# this label per agent_labels in action_runner table). Workflow-level
|
|
||||||
# env.GITHUB_SERVER_URL set as belt-and-suspenders against runner
|
|
||||||
# defaults (feedback_act_runner_github_server_url).
|
|
||||||
#
|
|
||||||
# 2. Cache — `actions/upload-artifact@v3.2.2` was already pinned to v3 for
|
|
||||||
# Gitea act_runner v0.6 compatibility (a comment in the original called
|
|
||||||
# this out). v4+ is incompatible with Gitea 1.22.x. No `actions/cache`
|
|
||||||
# usage to audit. `actions/setup-python@v6` `cache: pip` is left in
|
|
||||||
# place — works against Gitea's built-in cache server when runner.cache
|
|
||||||
# is configured (currently is, /opt/molecule/runners/config.yaml).
|
|
||||||
#
|
|
||||||
# 3. Token — workflow uses no custom dispatch tokens. The auto-injected
|
|
||||||
# `GITHUB_TOKEN` (which Gitea aliases to a runner-scoped token) is
|
|
||||||
# sufficient for `actions/checkout` against this same repo.
|
|
||||||
#
|
|
||||||
# 4. Docs — no docs/scripts reference github.com URLs that need swapping.
|
|
||||||
# The canvas-deploy-reminder step writes a `ghcr.io/...` image
|
|
||||||
# reference into the step summary text — that's documentation prose
|
|
||||||
# pointing at the ECR-mirrored canvas image and stays unchanged for
|
|
||||||
# this port (a separate cleanup if ghcr→ECR sweep is in scope).
|
|
||||||
#
|
|
||||||
# Cross-links:
|
|
||||||
# - RFC: internal#219 (CI/CD hard-gate hardening)
|
|
||||||
# - Reference port style: molecule-controlplane/.gitea/workflows/ci.yml
|
|
||||||
# - Bugs that may surface immediately and are tracked separately:
|
|
||||||
# internal#214 (Go-side vanity-import / go.sum drift, if any)
|
|
||||||
# - Phase 4 (this PR's follow-up): flip `continue-on-error: false` once
|
|
||||||
# surfaced defects are fixed, then add `all-required` aggregator
|
|
||||||
# sentinel (RFC §2) and PATCH branch protection (Phase 4 scope).
|
|
||||||
|
|
||||||
name: CI
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
pull_request:
|
|
||||||
branches: [main, staging]
|
|
||||||
# `merge_group` (GitHub merge-queue trigger) dropped — Gitea has no merge
|
|
||||||
# queue. The .github/ original retains it; this Gitea-side copy drops it.
|
|
||||||
|
|
||||||
# Cancel in-progress CI runs when a new commit arrives on the same ref.
|
|
||||||
# Stale runs queue up otherwise. PR refs and main/staging refs each get
|
|
||||||
# their own group because github.ref differs.
|
|
||||||
concurrency:
|
|
||||||
group: ci-${{ github.ref }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
env:
|
|
||||||
# Belt-and-suspenders against the runner-default trap
|
|
||||||
# (feedback_act_runner_github_server_url). Runners are configured with
|
|
||||||
# this env via /opt/molecule/runners/config.yaml runner.envs, but pinning
|
|
||||||
# at the workflow level protects against a runner regenerated without
|
|
||||||
# the config file (feedback_act_runner_needs_config_file_env).
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
# Detect which paths changed so downstream jobs can skip when only
|
|
||||||
# docs/markdown files were modified.
|
|
||||||
changes:
|
|
||||||
name: Detect changes
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 4 (RFC #219 §1): all required jobs >=98% green on main.
|
|
||||||
# Flip confirmed 2026-05-12 via combined-status check of latest main
|
|
||||||
# commit (all CI jobs green). `all-required` sentinel hard-fails
|
|
||||||
# when this job fails; no Phase 3 suppression needed.
|
|
||||||
# revert: add `continue-on-error: true` back if regressions appear.
|
|
||||||
continue-on-error: false
|
|
||||||
outputs:
|
|
||||||
platform: ${{ steps.check.outputs.platform }}
|
|
||||||
canvas: ${{ steps.check.outputs.canvas }}
|
|
||||||
python: ${{ steps.check.outputs.python }}
|
|
||||||
scripts: ${{ steps.check.outputs.scripts }}
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
- id: check
|
|
||||||
run: |
|
|
||||||
# For PR events: diff against the base branch (not HEAD~1 of the branch,
|
|
||||||
# which may be unrelated after force-pushes). When a push updates a PR,
|
|
||||||
# both pull_request and push events fire — prefer the PR base so that
|
|
||||||
# the diff is always computed against the actual merge base, not the
|
|
||||||
# previous SHA on the branch which may be on a different history line.
|
|
||||||
BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
|
|
||||||
# GITHUB_BASE_REF is set for PR events (the base branch name).
|
|
||||||
# For pull_request events we use the stored base.sha; for push events
|
|
||||||
# (or when base.sha is unavailable) fall back to github.event.before.
|
|
||||||
if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
|
|
||||||
BASE="${{ github.event.pull_request.base.sha }}"
|
|
||||||
fi
|
|
||||||
# Fallback: if BASE is empty or all zeros (new branch), run everything
|
|
||||||
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
|
|
||||||
echo "platform=true" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "canvas=true" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "python=true" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "scripts=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
# Both .github/workflows/ci.yml AND .gitea/workflows/ci.yml count
|
|
||||||
# as "this workflow changed" — either edit should force-run every
|
|
||||||
# downstream job. The Gitea port follows the same shape as the
|
|
||||||
# GitHub original so behavior matches when triggered on either
|
|
||||||
# platform.
|
|
||||||
DIFF=$(git diff --name-only "$BASE" HEAD 2>/dev/null || echo ".gitea/workflows/ci.yml")
|
|
||||||
echo "platform=$(echo "$DIFF" | grep -qE '^workspace-server/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "canvas=$(echo "$DIFF" | grep -qE '^canvas/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "python=$(echo "$DIFF" | grep -qE '^workspace/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "scripts=$(echo "$DIFF" | grep -qE '^tests/e2e/|^scripts/|^infra/scripts/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
# Platform (Go) — Go build/vet/test/lint + coverage gates. The always-run
|
|
||||||
# + per-step gating shape preserves the GitHub-side required-check name
|
|
||||||
# contract (so when this Gitea port becomes a required check in Phase 4,
|
|
||||||
# the name match works on PRs that don't touch workspace-server/).
|
|
||||||
platform-build:
|
|
||||||
name: Platform (Go)
|
|
||||||
needs: changes
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# mc#664 (interim): re-mask platform-build pending fix-forward. Phase 4
|
|
||||||
# (#656) flipped this to continue-on-error: false based on a Phase-3-masked
|
|
||||||
# "green on main 2026-05-12" — the prior continue-on-error: true had
|
|
||||||
# been hiding failing tests in workspace-server/internal/handlers/.
|
|
||||||
# Two distinct failure classes surfaced on 0e5152c3:
|
|
||||||
# (1) 4x delegation_test.go (lines 1110/1176/1228/1271): helpers
|
|
||||||
# expectExecuteDelegationBase/Success/Failed are missing sqlmock
|
|
||||||
# expectations for queries production has issued since ~2026-04-21
|
|
||||||
# (last_outbound_at UPDATE, lookupDeliveryMode/Runtime SELECTs,
|
|
||||||
# a2a_receive INSERT activity_logs, recordLedgerStatus writes).
|
|
||||||
# Halt cond #3 applies (regression > 7 days → broader sweep).
|
|
||||||
# (2) 1x mcp_test.go:433 (TestMCPHandler_CommitMemory_GlobalScope_Blocked):
|
|
||||||
# commit 7d1a189f (2026-05-10) hardened mcp.go to scrub err.Error()
|
|
||||||
# from JSON-RPC responses (OFFSEC-001), but the test asserts the
|
|
||||||
# error message contains "GLOBAL". Production-vs-test contract
|
|
||||||
# collision — needs design call, not mock update.
|
|
||||||
# Time-boxed Option A (90 min) did not fit the cross-cutting scope.
|
|
||||||
# This is a sequenced revert→fix→reflip per
|
|
||||||
# feedback_strict_root_only_after_class_a emergency clause — NOT
|
|
||||||
# a permanent re-mask. Re-flip blocked on mc#664 fix-forward landing.
|
|
||||||
# Other 4 #656 flips (changes, canvas-build, shellcheck, python-lint)
|
|
||||||
# retain continue-on-error: false; only platform-build regresses.
|
|
||||||
continue-on-error: true # mc#664 fix-forward in flight; re-flip when tests pass
|
|
||||||
defaults:
|
|
||||||
run:
|
|
||||||
working-directory: workspace-server
|
|
||||||
steps:
|
|
||||||
- if: needs.changes.outputs.platform != 'true'
|
|
||||||
working-directory: .
|
|
||||||
run: echo "No platform/** changes — skipping real build steps; this job always runs to satisfy the required-check name on branch protection."
|
|
||||||
- if: needs.changes.outputs.platform == 'true'
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
- if: needs.changes.outputs.platform == 'true'
|
|
||||||
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
|
||||||
with:
|
|
||||||
go-version: 'stable'
|
|
||||||
- if: needs.changes.outputs.platform == 'true'
|
|
||||||
run: go mod download
|
|
||||||
- if: needs.changes.outputs.platform == 'true'
|
|
||||||
run: go build ./cmd/server
|
|
||||||
# CLI (molecli) moved to standalone repo: git.moleculesai.app/molecule-ai/molecule-cli
|
|
||||||
- if: needs.changes.outputs.platform == 'true'
|
|
||||||
run: go vet ./... || true
|
|
||||||
- if: needs.changes.outputs.platform == 'true'
|
|
||||||
name: Run golangci-lint
|
|
||||||
run: golangci-lint run --timeout 3m ./... || true
|
|
||||||
- if: needs.changes.outputs.platform == 'true'
|
|
||||||
name: Diagnostic — per-package verbose 60s
|
|
||||||
run: |
|
|
||||||
set +e
|
|
||||||
go test -race -v -timeout 60s ./internal/handlers/... 2>&1 | tee /tmp/test-handlers.log
|
|
||||||
handlers_exit=$?
|
|
||||||
go test -race -v -timeout 60s ./internal/pendinguploads/... 2>&1 | tee /tmp/test-pu.log
|
|
||||||
pu_exit=$?
|
|
||||||
echo "::group::handlers exit=$handlers_exit (last 100 lines)"
|
|
||||||
tail -100 /tmp/test-handlers.log
|
|
||||||
echo "::endgroup::"
|
|
||||||
echo "::group::pendinguploads exit=$pu_exit (last 100 lines)"
|
|
||||||
tail -100 /tmp/test-pu.log
|
|
||||||
echo "::endgroup::"
|
|
||||||
continue-on-error: true
|
|
||||||
- if: needs.changes.outputs.platform == 'true'
|
|
||||||
name: Run tests with race detection and coverage
|
|
||||||
run: go test -race -coverprofile=coverage.out ./...
|
|
||||||
|
|
||||||
- if: needs.changes.outputs.platform == 'true'
|
|
||||||
name: Per-file coverage report
|
|
||||||
# Advisory — lists every source file with its coverage so reviewers
|
|
||||||
# can see at-a-glance where gaps are. Sorted ascending so the worst
|
|
||||||
# offenders float to the top. Does NOT fail the build; the hard
|
|
||||||
# gate is the threshold check below. (#1823)
|
|
||||||
run: |
|
|
||||||
echo "=== Per-file coverage (worst first) ==="
|
|
||||||
go tool cover -func=coverage.out \
|
|
||||||
| grep -v '^total:' \
|
|
||||||
| awk '{file=$1; sub(/:[0-9][0-9.]*:.*/, "", file); pct=$NF; gsub(/%/,"",pct); s[file]+=pct; c[file]++}
|
|
||||||
END {for (f in s) printf "%6.1f%% %s\n", s[f]/c[f], f}' \
|
|
||||||
| sort -n
|
|
||||||
|
|
||||||
- if: needs.changes.outputs.platform == 'true'
|
|
||||||
name: Check coverage thresholds
|
|
||||||
# Enforces two gates from #1823 Layer 1:
|
|
||||||
# 1. Total floor (25% — ratchet plan in COVERAGE_FLOOR.md).
|
|
||||||
# 2. Per-file floor — non-test .go files in security-critical
|
|
||||||
# paths with coverage <10% fail the build, UNLESS the file
|
|
||||||
# path is listed in .coverage-allowlist.txt (acknowledged
|
|
||||||
# historical debt with a tracking issue + expiry).
|
|
||||||
run: |
|
|
||||||
set -e
|
|
||||||
TOTAL_FLOOR=25
|
|
||||||
# Security-critical paths where a 0%-coverage file is a real risk.
|
|
||||||
CRITICAL_PATHS=(
|
|
||||||
"internal/handlers/tokens"
|
|
||||||
"internal/handlers/workspace_provision"
|
|
||||||
"internal/handlers/a2a_proxy"
|
|
||||||
"internal/handlers/registry"
|
|
||||||
"internal/handlers/secrets"
|
|
||||||
"internal/middleware/wsauth"
|
|
||||||
"internal/crypto"
|
|
||||||
)
|
|
||||||
|
|
||||||
TOTAL=$(go tool cover -func=coverage.out | grep '^total:' | awk '{print $3}' | sed 's/%//')
|
|
||||||
echo "Total coverage: ${TOTAL}%"
|
|
||||||
if awk "BEGIN{exit !($TOTAL < $TOTAL_FLOOR)}"; then
|
|
||||||
echo "::error::Total coverage ${TOTAL}% is below the ${TOTAL_FLOOR}% floor. See COVERAGE_FLOOR.md for ratchet plan."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Aggregate per-file coverage → /tmp/perfile.txt: "<fullpath> <pct>"
|
|
||||||
go tool cover -func=coverage.out \
|
|
||||||
| grep -v '^total:' \
|
|
||||||
| awk '{file=$1; sub(/:[0-9][0-9.]*:.*/, "", file); pct=$NF; gsub(/%/,"",pct); s[file]+=pct; c[file]++}
|
|
||||||
END {for (f in s) printf "%s %.1f\n", f, s[f]/c[f]}' \
|
|
||||||
> /tmp/perfile.txt
|
|
||||||
|
|
||||||
# Build allowlist — paths relative to workspace-server, one per line.
|
|
||||||
# Lines starting with # are comments.
|
|
||||||
ALLOWLIST=""
|
|
||||||
if [ -f ../.coverage-allowlist.txt ]; then
|
|
||||||
ALLOWLIST=$(grep -vE '^(#|[[:space:]]*$)' ../.coverage-allowlist.txt || true)
|
|
||||||
fi
|
|
||||||
|
|
||||||
FAILED=0
|
|
||||||
WARNED=0
|
|
||||||
for path in "${CRITICAL_PATHS[@]}"; do
|
|
||||||
while read -r file pct; do
|
|
||||||
[[ "$file" == *_test.go ]] && continue
|
|
||||||
[[ "$file" == *"$path"* ]] || continue
|
|
||||||
awk "BEGIN{exit !($pct < 10)}" || continue
|
|
||||||
|
|
||||||
# Strip the package-import prefix so we can match .coverage-allowlist.txt
|
|
||||||
# entries written as paths relative to workspace-server/.
|
|
||||||
# Handle both module paths: platform/workspace-server/... and platform/...
|
|
||||||
rel=$(echo "$file" | sed 's|^github.com/molecule-ai/molecule-monorepo/platform/workspace-server/||; s|^github.com/molecule-ai/molecule-monorepo/platform/||')
|
|
||||||
|
|
||||||
if echo "$ALLOWLIST" | grep -qxF "$rel"; then
|
|
||||||
echo "::warning file=workspace-server/$rel::Critical file at ${pct}% coverage (allowlisted, #1823) — fix before expiry."
|
|
||||||
WARNED=$((WARNED+1))
|
|
||||||
else
|
|
||||||
echo "::error file=workspace-server/$rel::Critical file at ${pct}% coverage — must be >=10% (target 80%). See #1823. To acknowledge as known debt, add this path to .coverage-allowlist.txt."
|
|
||||||
FAILED=$((FAILED+1))
|
|
||||||
fi
|
|
||||||
done < /tmp/perfile.txt
|
|
||||||
done
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Critical-path check: $FAILED new failures, $WARNED allowlisted warnings."
|
|
||||||
|
|
||||||
if [ "$FAILED" -gt 0 ]; then
|
|
||||||
echo ""
|
|
||||||
echo "$FAILED security-critical file(s) have <10% test coverage and are"
|
|
||||||
echo "NOT in the allowlist. These paths handle auth, tokens, secrets, or"
|
|
||||||
echo "workspace provisioning — a 0% file here is the exact gap that let"
|
|
||||||
echo "CWE-22, CWE-78, KI-005 slip through in past incidents. Either:"
|
|
||||||
echo " (a) add tests to raise coverage above 10%, or"
|
|
||||||
echo " (b) add the path to .coverage-allowlist.txt with an expiry date"
|
|
||||||
echo " and a tracking issue reference."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Canvas (Next.js) — required check, always runs. Same always-run +
|
|
||||||
# per-step gating shape as platform-build. The two-job-sharing-name
|
|
||||||
# pattern attempted in PR #2321 doesn't satisfy branch protection
|
|
||||||
# (SKIPPED siblings count as not-passed regardless of SUCCESS
|
|
||||||
# siblings — verified empirically on PR #2314).
|
|
||||||
canvas-build:
|
|
||||||
name: Canvas (Next.js)
|
|
||||||
needs: changes
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 4 (RFC #219 §1): confirmed green on main 2026-05-12.
|
|
||||||
continue-on-error: false
|
|
||||||
defaults:
|
|
||||||
run:
|
|
||||||
working-directory: canvas
|
|
||||||
steps:
|
|
||||||
- if: needs.changes.outputs.canvas != 'true'
|
|
||||||
working-directory: .
|
|
||||||
run: echo "No canvas/** changes — skipping real build steps; this job always runs to satisfy the required-check name on branch protection."
|
|
||||||
- if: needs.changes.outputs.canvas == 'true'
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
- if: needs.changes.outputs.canvas == 'true'
|
|
||||||
uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
|
|
||||||
with:
|
|
||||||
node-version: '22'
|
|
||||||
- if: needs.changes.outputs.canvas == 'true'
|
|
||||||
run: rm -f package-lock.json && npm install
|
|
||||||
- if: needs.changes.outputs.canvas == 'true'
|
|
||||||
run: npm run build
|
|
||||||
- if: needs.changes.outputs.canvas == 'true'
|
|
||||||
name: Run tests with coverage
|
|
||||||
# Coverage instrumentation is configured in canvas/vitest.config.ts
|
|
||||||
# (provider: v8, reporters: text + html + json-summary). Step 2 of
|
|
||||||
# #1815 — wires coverage into CI so we get a baseline visible on
|
|
||||||
# every PR. No threshold gate yet; thresholds dial in (Step 3, also
|
|
||||||
# tracked in #1815) after the team sees what current coverage is.
|
|
||||||
run: npx vitest run --coverage
|
|
||||||
- name: Upload coverage summary as artifact
|
|
||||||
if: needs.changes.outputs.canvas == 'true' && always()
|
|
||||||
# Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
|
|
||||||
# the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
|
|
||||||
# implement, surfacing as `GHESNotSupportedError: @actions/artifact
|
|
||||||
# v2.0.0+, upload-artifact@v4+ and download-artifact@v4+ are not
|
|
||||||
# currently supported on GHES`. Drop this pin when Gitea ships
|
|
||||||
# the v4 protocol (tracked: post-Gitea-1.23 followup).
|
|
||||||
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
|
|
||||||
with:
|
|
||||||
name: canvas-coverage-${{ github.run_id }}
|
|
||||||
path: canvas/coverage/
|
|
||||||
retention-days: 7
|
|
||||||
if-no-files-found: warn
|
|
||||||
|
|
||||||
# Shellcheck (E2E scripts) — required check, always runs.
|
|
||||||
shellcheck:
|
|
||||||
name: Shellcheck (E2E scripts)
|
|
||||||
needs: changes
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 4 (RFC #219 §1): confirmed green on main 2026-05-12.
|
|
||||||
continue-on-error: false
|
|
||||||
steps:
|
|
||||||
- if: needs.changes.outputs.scripts != 'true'
|
|
||||||
run: echo "No tests/e2e/ or infra/scripts/ changes — skipping real shellcheck; this job always runs to satisfy the required-check name on branch protection."
|
|
||||||
- if: needs.changes.outputs.scripts == 'true'
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
- if: needs.changes.outputs.scripts == 'true'
|
|
||||||
name: Run shellcheck on tests/e2e/*.sh and infra/scripts/*.sh
|
|
||||||
# shellcheck is pre-installed on ubuntu-latest runners (via apt).
|
|
||||||
# infra/scripts/ is included because setup.sh + nuke.sh gate the
|
|
||||||
# README quickstart — a shellcheck regression there silently breaks
|
|
||||||
# new-user onboarding. scripts/ is intentionally excluded until its
|
|
||||||
# pre-existing SC3040/SC3043 warnings are cleaned up.
|
|
||||||
run: |
|
|
||||||
find tests/e2e infra/scripts -type f -name '*.sh' -print0 \
|
|
||||||
| xargs -0 shellcheck --severity=warning
|
|
||||||
|
|
||||||
- if: needs.changes.outputs.scripts == 'true'
|
|
||||||
name: Lint cleanup-trap hygiene (RFC #2873)
|
|
||||||
run: bash tests/e2e/lint_cleanup_traps.sh
|
|
||||||
|
|
||||||
- if: needs.changes.outputs.scripts == 'true'
|
|
||||||
name: Run E2E bash unit tests (no live infra)
|
|
||||||
run: |
|
|
||||||
bash tests/e2e/test_model_slug.sh
|
|
||||||
|
|
||||||
canvas-deploy-reminder:
|
|
||||||
name: Canvas Deploy Reminder
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
continue-on-error: true
|
|
||||||
needs: [changes, canvas-build]
|
|
||||||
# Only fires on direct pushes to main (i.e. after staging→main promotion).
|
|
||||||
if: needs.changes.outputs.canvas == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main'
|
|
||||||
steps:
|
|
||||||
- name: Write deploy reminder to step summary
|
|
||||||
env:
|
|
||||||
COMMIT_SHA: ${{ github.sha }}
|
|
||||||
# github.server_url resolves via the workflow-level env override
|
|
||||||
# to the Gitea instance, so the RUN_URL points at the Gitea run
|
|
||||||
# page (not github.com). See feedback_act_runner_github_server_url.
|
|
||||||
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
|
||||||
run: |
|
|
||||||
# Write body to a temp file — avoids backtick escaping in shell.
|
|
||||||
cat > /tmp/deploy-reminder.md << 'BODY'
|
|
||||||
## Canvas build passed — deploy required
|
|
||||||
|
|
||||||
The `publish-canvas-image` workflow is now building a fresh Docker image
|
|
||||||
(`ghcr.io/molecule-ai/canvas:latest`) in the background.
|
|
||||||
|
|
||||||
Once it completes (~3–5 min), apply on the host machine with:
|
|
||||||
```bash
|
|
||||||
cd <runner-workspace>
|
|
||||||
git pull origin main
|
|
||||||
docker compose pull canvas && docker compose up -d canvas
|
|
||||||
```
|
|
||||||
|
|
||||||
If you need to rebuild from local source instead (e.g. testing unreleased
|
|
||||||
changes or a new `NEXT_PUBLIC_*` URL), use:
|
|
||||||
```bash
|
|
||||||
docker compose build canvas && docker compose up -d canvas
|
|
||||||
```
|
|
||||||
BODY
|
|
||||||
printf '\n> Posted automatically by CI · commit `%s` · [build log](%s)\n' \
|
|
||||||
"$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-reminder.md
|
|
||||||
|
|
||||||
# Gitea has no commit-comments API; write to GITHUB_STEP_SUMMARY,
|
|
||||||
# which both GitHub Actions and Gitea Actions render as the
|
|
||||||
# workflow run's summary page. (#75 / PR-D)
|
|
||||||
cat /tmp/deploy-reminder.md >> "$GITHUB_STEP_SUMMARY"
|
|
||||||
|
|
||||||
# Python Lint & Test — required check, always runs.
|
|
||||||
python-lint:
|
|
||||||
name: Python Lint & Test
|
|
||||||
needs: changes
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 4 (RFC #219 §1): confirmed green on main 2026-05-12.
|
|
||||||
continue-on-error: false
|
|
||||||
env:
|
|
||||||
WORKSPACE_ID: test
|
|
||||||
defaults:
|
|
||||||
run:
|
|
||||||
working-directory: workspace
|
|
||||||
steps:
|
|
||||||
- if: needs.changes.outputs.python != 'true'
|
|
||||||
working-directory: .
|
|
||||||
run: echo "No workspace/** changes — skipping real lint+test; this job always runs to satisfy the required-check name on branch protection."
|
|
||||||
- if: needs.changes.outputs.python == 'true'
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
- if: needs.changes.outputs.python == 'true'
|
|
||||||
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
|
||||||
with:
|
|
||||||
python-version: '3.11'
|
|
||||||
cache: pip
|
|
||||||
cache-dependency-path: workspace/requirements.txt
|
|
||||||
- if: needs.changes.outputs.python == 'true'
|
|
||||||
run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov sqlalchemy>=2.0.0
|
|
||||||
# Coverage flags + fail-under floor moved into workspace/pytest.ini
|
|
||||||
# (issue #1817) so local `pytest` and CI use identical config.
|
|
||||||
- if: needs.changes.outputs.python == 'true'
|
|
||||||
run: python -m pytest --tb=short
|
|
||||||
|
|
||||||
- if: needs.changes.outputs.python == 'true'
|
|
||||||
name: Per-file critical-path coverage (MCP / inbox / auth)
|
|
||||||
# MCP-critical Python files have a per-file floor on top of the
|
|
||||||
# 86% total floor in pytest.ini. See issue #2790 for full rationale.
|
|
||||||
run: |
|
|
||||||
set -e
|
|
||||||
PER_FILE_FLOOR=75
|
|
||||||
CRITICAL_FILES=(
|
|
||||||
"a2a_mcp_server.py"
|
|
||||||
"mcp_cli.py"
|
|
||||||
"a2a_tools.py"
|
|
||||||
"a2a_tools_inbox.py"
|
|
||||||
"inbox.py"
|
|
||||||
"platform_auth.py"
|
|
||||||
)
|
|
||||||
|
|
||||||
# pytest already wrote .coverage; emit a JSON view scoped to
|
|
||||||
# the critical files so jq/python can read the per-file pct
|
|
||||||
# without parsing tabular text.
|
|
||||||
INCLUDES=$(printf '*%s,' "${CRITICAL_FILES[@]}")
|
|
||||||
INCLUDES="${INCLUDES%,}"
|
|
||||||
python -m coverage json -o /tmp/critical-cov.json --include="$INCLUDES"
|
|
||||||
|
|
||||||
FAILED=0
|
|
||||||
for f in "${CRITICAL_FILES[@]}"; do
|
|
||||||
pct=$(jq -r --arg f "$f" '.files | to_entries | map(select(.key == $f)) | .[0].value.summary.percent_covered // "MISSING"' /tmp/critical-cov.json)
|
|
||||||
if [ "$pct" = "MISSING" ]; then
|
|
||||||
echo "::error file=workspace/$f::No coverage data — file may have moved or test exclusion mis-set."
|
|
||||||
FAILED=$((FAILED+1))
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
echo "$f: ${pct}%"
|
|
||||||
if awk "BEGIN{exit !($pct < $PER_FILE_FLOOR)}"; then
|
|
||||||
echo "::error file=workspace/$f::${pct}% < ${PER_FILE_FLOOR}% per-file floor (MCP critical path). See COVERAGE_FLOOR.md."
|
|
||||||
FAILED=$((FAILED+1))
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
if [ "$FAILED" -gt 0 ]; then
|
|
||||||
echo ""
|
|
||||||
echo "$FAILED MCP critical-path file(s) below the ${PER_FILE_FLOOR}% per-file floor."
|
|
||||||
echo "These paths handle multi-tenant routing, auth tokens, and inbox dispatch."
|
|
||||||
echo "A coverage drop here is the same risk shape as Go-side tokens/secrets files"
|
|
||||||
echo "dropping below 10% (see COVERAGE_FLOOR.md). Either:"
|
|
||||||
echo " (a) add tests to raise coverage back above ${PER_FILE_FLOOR}%, or"
|
|
||||||
echo " (b) if this is unavoidable historical debt, file an issue and propose"
|
|
||||||
echo " adjusting the floor with rationale in COVERAGE_FLOOR.md."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
all-required:
|
|
||||||
# Aggregator sentinel — RFC internal#219 §2 (Phase 4 — closes internal#286).
|
|
||||||
#
|
|
||||||
# Single stable required-status name that branch protection points at;
|
|
||||||
# CI churns underneath in `needs:` without any protection edits. Mirrors
|
|
||||||
# the molecule-controlplane Phase 2a impl shipped in CP PR#112 and
|
|
||||||
# referenced by `internal#286` ("Phase 4 is a single small PR... mirrors
|
|
||||||
# CP's existing one").
|
|
||||||
#
|
|
||||||
# Closes the failure mode where status_check_contexts on molecule-core/main
|
|
||||||
# only listed `Secret scan` + `sop-tier-check` (the 2 meta-gates), so real
|
|
||||||
# `Platform (Go)` / `Canvas (Next.js)` / `Python Lint & Test` / `Shellcheck`
|
|
||||||
# red silently merged through. See internal#286 for the three concrete
|
|
||||||
# tonight-of-2026-05-11 incidents that prompted the emergency bump.
|
|
||||||
#
|
|
||||||
# Three properties of this job each close a failure mode:
|
|
||||||
#
|
|
||||||
# 1. `if: always()` — runs even when an upstream fails. Without it the
|
|
||||||
# sentinel is `skipped` and protection treats that as missing → merge
|
|
||||||
# ungated.
|
|
||||||
#
|
|
||||||
# 2. Assertion is `result == "success"` per dep, NOT `!= "failure"`.
|
|
||||||
# A `skipped` upstream (job gated by `if:` evaluating false, matrix
|
|
||||||
# entry that couldn't run) must NOT silently pass through.
|
|
||||||
# `skipped`-as-green is exactly the failure mode this gate closes.
|
|
||||||
#
|
|
||||||
# 3. `needs:` is the canonical list of "what counts as required."
|
|
||||||
# status_check_contexts will reference only `ci/all-required` (Step 5
|
|
||||||
# follow-up — branch-protection PATCH is Owners-tier per
|
|
||||||
# `feedback_never_admin_merge_bypass`, separate PR); a new job is
|
|
||||||
# added simply by listing it in `needs:` here.
|
|
||||||
# `.gitea/workflows/ci-required-drift.yml` files a [ci-drift] issue
|
|
||||||
# hourly if this list diverges from status_check_contexts or from
|
|
||||||
# audit-force-merge.yml's REQUIRED_CHECKS env (RFC §4 + §6).
|
|
||||||
#
|
|
||||||
# Excluded from `needs:`: `canvas-deploy-reminder` — gated by
|
|
||||||
# `if: ... github.event_name == 'push' && github.ref == 'refs/heads/main'`,
|
|
||||||
# so on PR events it's legitimately `skipped`. The drift detector
|
|
||||||
# explicitly excludes `github.event_name`-gated jobs from F1 (see
|
|
||||||
# `.gitea/scripts/ci-required-drift.py::ci_job_names`).
|
|
||||||
#
|
|
||||||
# Phase 3 (RFC #219 §1) safety: continue-on-error here so the sentinel
|
|
||||||
# does not hard-fail and block PRs while the underlying build jobs are
|
|
||||||
# still in Phase 3 (continue-on-error: true suppresses their status to null).
|
|
||||||
# When Phase 3 ends (defects fixed, continue-on-error flipped off on build
|
|
||||||
# jobs), remove continue-on-error here so the sentinel again hard-fails.
|
|
||||||
continue-on-error: true
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
timeout-minutes: 1
|
|
||||||
needs:
|
|
||||||
- changes
|
|
||||||
- platform-build
|
|
||||||
- canvas-build
|
|
||||||
- shellcheck
|
|
||||||
- python-lint
|
|
||||||
if: always()
|
|
||||||
steps:
|
|
||||||
- name: Assert every required dependency succeeded
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
# `needs.*.result` is one of: success | failure | cancelled | skipped | null.
|
|
||||||
# We assert success per dep (not != failure) — see RFC §2 reasoning above.
|
|
||||||
# Null results are skipped: they come from Phase 3 (continue-on-error: true
|
|
||||||
# suppresses status) or from jobs still in-flight. The sentinel succeeds
|
|
||||||
# rather than blocking PRs on Phase 3 noise.
|
|
||||||
results='${{ toJSON(needs) }}'
|
|
||||||
echo "$results"
|
|
||||||
echo "$results" | python3 -c '
|
|
||||||
import json, sys
|
|
||||||
ns = json.load(sys.stdin)
|
|
||||||
# Exclude null (Phase 3 suppressed / in-flight) from the bad list.
|
|
||||||
bad = [(k, v.get("result")) for k, v in ns.items()
|
|
||||||
if v.get("result") not in ("success", None)]
|
|
||||||
if bad:
|
|
||||||
print(f"FAIL: jobs not green:", file=sys.stderr)
|
|
||||||
for k, r in bad:
|
|
||||||
print(f" - {k}: {r}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
pending = [(k, v.get("result")) for k, v in ns.items() if v.get("result") is None]
|
|
||||||
if pending:
|
|
||||||
print(f"WARN: {len(pending)} job(s) still in-flight (result=null): " +
|
|
||||||
", ".join(k for k, _ in pending), file=sys.stderr)
|
|
||||||
print(f"OK: all {len(ns)} required jobs succeeded (or Phase-3 suppressed)")
|
|
||||||
'
|
|
||||||
@ -1,255 +0,0 @@
|
|||||||
name: Continuous synthetic E2E (staging)
|
|
||||||
|
|
||||||
# Ported from .github/workflows/continuous-synth-e2e.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
|
|
||||||
# Hard gate (#2342): cron-driven full-lifecycle E2E that catches
|
|
||||||
# regressions visible only at runtime — schema drift, deployment-pipeline
|
|
||||||
# gaps, vendor outages, env-var rotations, DNS / CF / Railway side-effects.
|
|
||||||
#
|
|
||||||
# Why this gate exists:
|
|
||||||
# PR-time CI catches code-level regressions but not deployment-time or
|
|
||||||
# integration-time ones. Today's empirical data:
|
|
||||||
# • #2345 (A2A v0.2 silent drop) — passed all unit tests, broke at
|
|
||||||
# JSON-RPC parse layer between sender and receiver. Visible only
|
|
||||||
# to a sender exercising the full path.
|
|
||||||
# • RFC #2312 chat upload — landed on staging-branch but never
|
|
||||||
# reached staging tenants because publish-workspace-server-image
|
|
||||||
# was main-only. Caught by manual dogfooding hours after deploy.
|
|
||||||
# Both would have surfaced within 15-20 min of regression if a
|
|
||||||
# continuous synth-E2E was running.
|
|
||||||
#
|
|
||||||
# Cadence: every 20 min (3x/hour). The script is conservatively
|
|
||||||
# bounded at 10 min wall-clock; even on degraded staging it should
|
|
||||||
# finish before the next firing. cron-overlap is guarded by the
|
|
||||||
# concurrency group below.
|
|
||||||
#
|
|
||||||
# Cost: ~3 runs/hour × 5-10 min × $0.008/min GHA = ~$0.50-$1/day.
|
|
||||||
# Plus a fresh tenant provisioned + torn down each run (Railway +
|
|
||||||
# AWS pennies). Negligible.
|
|
||||||
#
|
|
||||||
# Failure handling: when the run fails, the workflow exits non-zero
|
|
||||||
# and GitHub's standard email/notification path fires. Operators
|
|
||||||
# can subscribe to this workflow's failure channel for paging-grade
|
|
||||||
# alerting.
|
|
||||||
|
|
||||||
on:
|
|
||||||
schedule:
|
|
||||||
# Every 10 minutes, on :02 :12 :22 :32 :42 :52. Three constraints:
|
|
||||||
# 1. Stay off the top-of-hour. GitHub Actions scheduler drops
|
|
||||||
# :00 firings under high load (own docs:
|
|
||||||
# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule).
|
|
||||||
# Prior history: cron was '0,20,40' (2026-05-02) — only :00
|
|
||||||
# ever survived. Bumped to '10,30,50' (2026-05-03) on the
|
|
||||||
# theory that further-from-:00 wins. Empirically 2026-05-04
|
|
||||||
# that ALSO dropped to ~60 min effective cadence (only ~1
|
|
||||||
# schedule fire per hour — see molecule-core#2726). Detection
|
|
||||||
# latency was claimed 20 min, actual 60 min.
|
|
||||||
# 2. Avoid colliding with the existing :15 sweep-cf-orphans
|
|
||||||
# and :45 sweep-cf-tunnels — both hit the CF API and we
|
|
||||||
# don't want to fight for rate-limit tokens.
|
|
||||||
# 3. Avoid the :30 heavy slot (staging-smoke /30, sweep-aws-
|
|
||||||
# secrets, sweep-stale-e2e-orgs every :15) — multiple
|
|
||||||
# overlapping cron registrations on the same minute is part
|
|
||||||
# of what GH drops under load.
|
|
||||||
# Solution: bump fires-per-hour 3 → 6 AND keep all slots in clean
|
|
||||||
# lanes (1-3 min away from any other cron). Even with empirically-
|
|
||||||
# observed ~67% GH drop ratio, 6 attempts/hour yields ~2 effective
|
|
||||||
# fires = ~30 min cadence; closer to the 20-min target than the
|
|
||||||
# current shape and provides a real degradation alarm if drops
|
|
||||||
# get worse.
|
|
||||||
- cron: '2,12,22,32,42,52 * * * *'
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
# No issue-write here — failures surface as red runs in the workflow
|
|
||||||
# history. If you want auto-issue-on-fail, add a follow-up step that
|
|
||||||
# uses gh issue create gated on `if: failure()`. Keeping the surface
|
|
||||||
# minimal until that's actually wanted.
|
|
||||||
|
|
||||||
# Serialize so two firings can never overlap. Cron firing every 20 min
|
|
||||||
# but scripts conservatively bounded at 10 min — overlap shouldn't
|
|
||||||
# happen in steady state, but if a run hangs we don't want N more
|
|
||||||
# stacking up.
|
|
||||||
concurrency:
|
|
||||||
group: continuous-synth-e2e
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
synth:
|
|
||||||
name: Synthetic E2E against staging
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
# Bumped from 12 → 20 (2026-05-04). Tenant user-data install phase
|
|
||||||
# (apt-get update + install docker.io/jq/awscli/caddy + snap install
|
|
||||||
# ssm-agent) runs from raw Ubuntu on every boot — none of it is
|
|
||||||
# pre-baked into the tenant AMI. Empirical fetch_secrets/ok timing
|
|
||||||
# across today's canaries: 51s → 82s → 143s → 625s. apt-mirror tail
|
|
||||||
# latency drives the boot-to-fetch_secrets phase from ~1min to >10min.
|
|
||||||
# A 12min budget leaves only ~2min for the workspace (which needs
|
|
||||||
# ~3.5min for claude-code cold boot) on slow-apt days, blowing the
|
|
||||||
# budget. 20min absorbs the worst tenant tail so the workspace probe
|
|
||||||
# gets the full ~7min it needs even on a slow apt day. Real fix:
|
|
||||||
# pre-bake caddy + ssm-agent into the tenant AMI (controlplane#TBD).
|
|
||||||
timeout-minutes: 20
|
|
||||||
env:
|
|
||||||
# claude-code default: cold-start ~5 min (comparable to langgraph),
|
|
||||||
# but uses MiniMax-M2.7-highspeed via the template's third-party-
|
|
||||||
# Anthropic-compat path (workspace-configs-templates/claude-code-
|
|
||||||
# default/config.yaml:64-69). MiniMax is ~5-10x cheaper than
|
|
||||||
# gpt-4.1-mini per token AND avoids the recurring OpenAI quota-
|
|
||||||
# exhaustion class that took the canary down 2026-05-03 (#265).
|
|
||||||
# Operators can pick langgraph / hermes via workflow_dispatch
|
|
||||||
# when they specifically need to exercise the OpenAI or SDK-
|
|
||||||
# native paths.
|
|
||||||
E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }}
|
|
||||||
# Pin the canary to a specific MiniMax model rather than relying
|
|
||||||
# on the per-runtime default ("sonnet" → routes to direct
|
|
||||||
# Anthropic, defeats the cost saving). Operators can override
|
|
||||||
# via workflow_dispatch by setting a different E2E_MODEL_SLUG
|
|
||||||
# input if they need to exercise a specific model. M2.7-highspeed
|
|
||||||
# is "Token Plan only" but cheap-per-token and fast.
|
|
||||||
E2E_MODEL_SLUG: ${{ github.event.inputs.model_slug || 'MiniMax-M2.7-highspeed' }}
|
|
||||||
# Bound to 10 min so a stuck provision fails the run instead of
|
|
||||||
# holding up the next cron firing. 15-min default in the script
|
|
||||||
# is for the on-PR full lifecycle where we have more headroom.
|
|
||||||
E2E_PROVISION_TIMEOUT_SECS: '600'
|
|
||||||
# Slug suffix — namespaced "synth-" so these runs are
|
|
||||||
# distinguishable from PR-driven runs in CP admin.
|
|
||||||
E2E_RUN_ID: synth-${{ github.run_id }}
|
|
||||||
# Forced false for cron; respected for manual dispatch
|
|
||||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '' }}
|
|
||||||
MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
|
|
||||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
# MiniMax key is the canary's PRIMARY auth path. claude-code
|
|
||||||
# template's `minimax` provider routes ANTHROPIC_BASE_URL to
|
|
||||||
# api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot.
|
|
||||||
# tests/e2e/test_staging_full_saas.sh branches SECRETS_JSON on
|
|
||||||
# which key is present — MiniMax wins when set.
|
|
||||||
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
|
|
||||||
# Direct-Anthropic alternative for operators who don't want to
|
|
||||||
# set up a MiniMax account (priority below MiniMax — first
|
|
||||||
# non-empty wins in test_staging_full_saas.sh's secrets-injection
|
|
||||||
# block). See #2578 PR comment for the rationale.
|
|
||||||
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
|
|
||||||
# OpenAI fallback — kept wired so operators can dispatch with
|
|
||||||
# E2E_RUNTIME=langgraph or =hermes and still have a working
|
|
||||||
# canary path. The script picks the right blob shape based on
|
|
||||||
# which key is non-empty.
|
|
||||||
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Verify required secrets present
|
|
||||||
run: |
|
|
||||||
# Hard-fail on missing secret REGARDLESS of trigger. Previously
|
|
||||||
# this step soft-skipped on workflow_dispatch via `exit 0`, but
|
|
||||||
# `exit 0` only ends the STEP — subsequent steps still ran with
|
|
||||||
# the empty secret, the synth script fell through to the wrong
|
|
||||||
# SECRETS_JSON branch, and the canary failed 5 min later with a
|
|
||||||
# confusing "Agent error (Exception)" instead of the clean
|
|
||||||
# "secret missing" message at the top. Caught 2026-05-04 by
|
|
||||||
# dispatched run 25296530706: claude-code + missing MINIMAX
|
|
||||||
# silently used OpenAI keys but kept model=MiniMax-M2.7, then
|
|
||||||
# the workspace 401'd against MiniMax once it tried to call.
|
|
||||||
# Fix: exit 1 in both cron and dispatch paths. Operators who
|
|
||||||
# want to verify a YAML change without setting up the secret
|
|
||||||
# can read the verify-secrets step's stderr — the failure is
|
|
||||||
# itself the verification signal.
|
|
||||||
if [ -z "${MOLECULE_ADMIN_TOKEN:-}" ]; then
|
|
||||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret missing — synth E2E cannot run"
|
|
||||||
echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# LLM-key requirement is per-runtime: claude-code accepts
|
|
||||||
# EITHER MiniMax OR direct-Anthropic (whichever is set first),
|
|
||||||
# langgraph + hermes use OpenAI (MOLECULE_STAGING_OPENAI_API_KEY).
|
|
||||||
case "${E2E_RUNTIME}" in
|
|
||||||
claude-code)
|
|
||||||
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
|
||||||
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
|
|
||||||
required_secret_value="${E2E_MINIMAX_API_KEY}"
|
|
||||||
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
|
|
||||||
required_secret_name="MOLECULE_STAGING_ANTHROPIC_API_KEY"
|
|
||||||
required_secret_value="${E2E_ANTHROPIC_API_KEY}"
|
|
||||||
else
|
|
||||||
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY or MOLECULE_STAGING_ANTHROPIC_API_KEY"
|
|
||||||
required_secret_value=""
|
|
||||||
fi
|
|
||||||
;;
|
|
||||||
langgraph|hermes)
|
|
||||||
required_secret_name="MOLECULE_STAGING_OPENAI_API_KEY"
|
|
||||||
required_secret_value="${E2E_OPENAI_API_KEY:-}"
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
|
|
||||||
required_secret_name=""
|
|
||||||
required_secret_value="present"
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
|
|
||||||
echo "::error::${required_secret_name} secret missing — runtime=${E2E_RUNTIME} cannot authenticate against its LLM provider"
|
|
||||||
echo "::error::Set it at Settings → Secrets and Variables → Actions, OR dispatch with a different runtime"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Install required tools
|
|
||||||
run: |
|
|
||||||
# The script depends on jq + curl (already on ubuntu-latest)
|
|
||||||
# and python3 (likewise). Verify they're all present so we
|
|
||||||
# fail fast on a runner image regression rather than mid-script.
|
|
||||||
for cmd in jq curl python3; do
|
|
||||||
command -v "$cmd" >/dev/null 2>&1 || {
|
|
||||||
echo "::error::required tool '$cmd' not on PATH — runner image regression?"
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
done
|
|
||||||
|
|
||||||
- name: Run synthetic E2E
|
|
||||||
# The script handles its own teardown via EXIT trap; even on
|
|
||||||
# failure (timeout, assertion), the org is deprovisioned and
|
|
||||||
# leaks are reported. Exit code propagates from the script.
|
|
||||||
run: |
|
|
||||||
bash tests/e2e/test_staging_full_saas.sh
|
|
||||||
|
|
||||||
- name: Failure summary
|
|
||||||
# Runs only on failure. Adds a job summary so the workflow run
|
|
||||||
# page shows a quick "what happened" instead of forcing readers
|
|
||||||
# to scroll through script output.
|
|
||||||
if: failure()
|
|
||||||
run: |
|
|
||||||
{
|
|
||||||
echo "## Continuous synth E2E failed"
|
|
||||||
echo ""
|
|
||||||
echo "**Run ID:** ${{ github.run_id }}"
|
|
||||||
echo "**Trigger:** ${{ github.event_name }}"
|
|
||||||
echo "**Runtime:** ${E2E_RUNTIME}"
|
|
||||||
echo "**Slug:** synth-${{ github.run_id }}"
|
|
||||||
echo ""
|
|
||||||
echo "### What this means"
|
|
||||||
echo ""
|
|
||||||
echo "Staging just regressed on a path that previously worked. Likely classes:"
|
|
||||||
echo "- Schema mismatch between sender and receiver (#2345 class)"
|
|
||||||
echo "- Deployment-pipeline gap (RFC #2312 / staging-tenant-image-stale class)"
|
|
||||||
echo "- Vendor outage (Cloudflare, Railway, AWS, GHCR)"
|
|
||||||
echo "- Staging-CP env var rotation"
|
|
||||||
echo ""
|
|
||||||
echo "### Next steps"
|
|
||||||
echo ""
|
|
||||||
echo "1. Check the script output above for the assertion that failed"
|
|
||||||
echo "2. If it's a vendor outage, no action needed — next firing in ~20 min"
|
|
||||||
echo "3. If it's a code regression, find the causing PR via \`git log\` against last green run and revert/fix"
|
|
||||||
echo "4. Keep an eye on the next 1-2 firings — flake vs persistent fail differs in priority"
|
|
||||||
} >> "$GITHUB_STEP_SUMMARY"
|
|
||||||
@ -1,333 +0,0 @@
|
|||||||
name: E2E API Smoke Test
|
|
||||||
|
|
||||||
# Ported from .github/workflows/e2e-api.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
# Extracted from ci.yml so workflow-level concurrency can protect this job
|
|
||||||
# from run-level cancellation (issue #458).
|
|
||||||
#
|
|
||||||
# Trigger model (revised 2026-04-29):
|
|
||||||
#
|
|
||||||
# Always FIRES on push/pull_request to staging+main. Real work is gated
|
|
||||||
# per-step on `needs.detect-changes.outputs.api` — when paths under
|
|
||||||
# `workspace-server/`, `tests/e2e/`, or this workflow file haven't
|
|
||||||
# changed, the no-op step alone runs and emits SUCCESS for the
|
|
||||||
# `E2E API Smoke Test` check, satisfying branch protection without
|
|
||||||
# spending CI cycles. See the in-job comment on the `e2e-api` job for
|
|
||||||
# why this is one job (not two-jobs-sharing-name) and the 2026-04-29
|
|
||||||
# PR #2264 incident that drove the consolidation.
|
|
||||||
#
|
|
||||||
# Parallel-safety (Class B Hongming-owned CICD red sweep, 2026-05-08)
|
|
||||||
# -------------------------------------------------------------------
|
|
||||||
# Same substrate hazard as PR #98 (handlers-postgres-integration). Our
|
|
||||||
# Gitea act_runner runs with `container.network: host` (operator host
|
|
||||||
# `/opt/molecule/runners/config.yaml`), which means:
|
|
||||||
#
|
|
||||||
# * Two concurrent runs both try to bind their `-p 15432:5432` /
|
|
||||||
# `-p 16379:6379` host ports — the second postgres/redis FATALs
|
|
||||||
# with `Address in use` and `docker run` returns exit 125 with
|
|
||||||
# `Conflict. The container name "/molecule-ci-postgres" is already
|
|
||||||
# in use by container ...`. Verified in run a7/2727 on 2026-05-07.
|
|
||||||
# * The fixed container names `molecule-ci-postgres` / `-redis` (the
|
|
||||||
# pre-fix shape) collide on name AS WELL AS port. The cleanup-with-
|
|
||||||
# `docker rm -f` at the start of the second job KILLS the first
|
|
||||||
# job's still-running postgres/redis.
|
|
||||||
#
|
|
||||||
# Fix shape (mirrors PR #98's bridge-net pattern, adapted because
|
|
||||||
# platform-server is a Go binary on the host, not a containerised
|
|
||||||
# step):
|
|
||||||
#
|
|
||||||
# 1. Unique container names per run:
|
|
||||||
# pg-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
|
|
||||||
# redis-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
|
|
||||||
# `${RUN_ID}-${RUN_ATTEMPT}` is unique even across reruns of the
|
|
||||||
# same run_id.
|
|
||||||
# 2. Ephemeral host port per run (`-p 0:5432`), then read the actual
|
|
||||||
# bound port via `docker port` and export DATABASE_URL/REDIS_URL
|
|
||||||
# pointing at it. No fixed host-port → no port collision.
|
|
||||||
# 3. `127.0.0.1` (NOT `localhost`) in URLs — IPv6 first-resolve was
|
|
||||||
# the original flake fixed in #92 and the script's still IPv6-
|
|
||||||
# enabled.
|
|
||||||
# 4. `if: always()` cleanup so containers don't leak when test steps
|
|
||||||
# fail.
|
|
||||||
#
|
|
||||||
# Issue #94 items #2 + #3 (also fixed here):
|
|
||||||
# * Pre-pull `alpine:latest` so the platform-server's provisioner
|
|
||||||
# (`internal/handlers/container_files.go`) can stand up its
|
|
||||||
# ephemeral token-write helper without a daemon.io round-trip.
|
|
||||||
# * Create `molecule-core-net` bridge network if missing so the
|
|
||||||
# provisioner's container.HostConfig {NetworkMode: ...} attach
|
|
||||||
# succeeds.
|
|
||||||
# Item #1 (timeouts) — evidence on recent runs (77/3191, ae/4270, 0e/
|
|
||||||
# 2318) shows Postgres ready in 3s, Redis in 1s, Platform in 1s when
|
|
||||||
# they DO come up. Timeouts are not the bottleneck; not bumped.
|
|
||||||
#
|
|
||||||
# Item explicitly NOT fixed here: failing test `Status back online`
|
|
||||||
# fails because the platform's langgraph workspace template image
|
|
||||||
# (ghcr.io/molecule-ai/workspace-template-langgraph:latest) returns
|
|
||||||
# 403 Forbidden post-2026-05-06 GitHub org suspension. That is a
|
|
||||||
# template-registry resolution issue (ADR-002 / local-build mode) and
|
|
||||||
# belongs in a separate change that touches workspace-server, not
|
|
||||||
# this workflow file.
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
pull_request:
|
|
||||||
branches: [main, staging]
|
|
||||||
concurrency:
|
|
||||||
# Per-SHA grouping (changed 2026-04-28 from per-ref). Per-ref had the
|
|
||||||
# same auto-promote-staging brittleness as e2e-staging-canvas — back-
|
|
||||||
# to-back staging pushes share refs/heads/staging, so the older push's
|
|
||||||
# queued run gets cancelled when a newer push lands. Auto-promote-
|
|
||||||
# staging then sees `completed/cancelled` for the older SHA and stays
|
|
||||||
# put; the newer SHA's gates may eventually save the day, but if the
|
|
||||||
# newer push gets cancelled too, we deadlock.
|
|
||||||
#
|
|
||||||
# See e2e-staging-canvas.yml's identical concurrency block for the full
|
|
||||||
# rationale and the 2026-04-28 incident reference.
|
|
||||||
group: e2e-api-${{ github.event.pull_request.head.sha || github.sha }}
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
detect-changes:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
outputs:
|
|
||||||
api: ${{ steps.decide.outputs.api }}
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
- id: decide
|
|
||||||
# Inline replacement for dorny/paths-filter — same pattern PR#372's
|
|
||||||
# ci.yml port used. Diffs against the PR base or push BEFORE SHA,
|
|
||||||
# then matches against the api-relevant path set.
|
|
||||||
run: |
|
|
||||||
BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
|
|
||||||
if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
|
|
||||||
BASE="${{ github.event.pull_request.base.sha }}"
|
|
||||||
fi
|
|
||||||
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
|
|
||||||
echo "api=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
if ! git cat-file -e "$BASE" 2>/dev/null; then
|
|
||||||
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
if ! git cat-file -e "$BASE" 2>/dev/null; then
|
|
||||||
echo "api=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
CHANGED=$(git diff --name-only "$BASE" HEAD)
|
|
||||||
if echo "$CHANGED" | grep -qE '^(workspace-server/|tests/e2e/|\.gitea/workflows/e2e-api\.yml$)'; then
|
|
||||||
echo "api=true" >> "$GITHUB_OUTPUT"
|
|
||||||
else
|
|
||||||
echo "api=false" >> "$GITHUB_OUTPUT"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# ONE job (no job-level `if:`) that always runs and reports under the
|
|
||||||
# required-check name `E2E API Smoke Test`. Real work is gated per-step
|
|
||||||
# on `needs.detect-changes.outputs.api`. Reason: GitHub registers a
|
|
||||||
# check run for every job that matches `name:`, and a job-level
|
|
||||||
# `if: false` produces a SKIPPED check run. Branch protection treats
|
|
||||||
# all check runs with a matching context name on the latest commit as a
|
|
||||||
# SET — any SKIPPED in the set fails the required-check eval, even with
|
|
||||||
# SUCCESS siblings. Verified 2026-04-29 on PR #2264 (staging→main):
|
|
||||||
# 4 check runs (2 SKIPPED + 2 SUCCESS) at the head SHA blocked
|
|
||||||
# promotion despite all real work succeeding. Collapsing to a single
|
|
||||||
# always-running job with conditional steps emits exactly one SUCCESS
|
|
||||||
# check run regardless of paths filter — branch-protection-clean.
|
|
||||||
e2e-api:
|
|
||||||
needs: detect-changes
|
|
||||||
name: E2E API Smoke Test
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
timeout-minutes: 15
|
|
||||||
env:
|
|
||||||
# Unique per-run container names so concurrent runs on the host-
|
|
||||||
# network act_runner don't collide on name OR port.
|
|
||||||
# `${RUN_ID}-${RUN_ATTEMPT}` stays unique across reruns of the
|
|
||||||
# same run_id. PORT is set later (after docker port lookup) since
|
|
||||||
# we let Docker assign an ephemeral host port.
|
|
||||||
PG_CONTAINER: pg-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
|
|
||||||
REDIS_CONTAINER: redis-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
|
|
||||||
PORT: "8080"
|
|
||||||
steps:
|
|
||||||
- name: No-op pass (paths filter excluded this commit)
|
|
||||||
if: needs.detect-changes.outputs.api != 'true'
|
|
||||||
run: |
|
|
||||||
echo "No workspace-server / tests/e2e / workflow changes — E2E API gate satisfied without running tests."
|
|
||||||
echo "::notice::E2E API Smoke Test no-op pass (paths filter excluded this commit)."
|
|
||||||
- if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
- if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
|
||||||
with:
|
|
||||||
go-version: 'stable'
|
|
||||||
cache: true
|
|
||||||
cache-dependency-path: workspace-server/go.sum
|
|
||||||
- name: Pre-pull alpine + ensure provisioner network (Issue #94 items #2 + #3)
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: |
|
|
||||||
# Provisioner uses alpine:latest for ephemeral token-write
|
|
||||||
# containers (workspace-server/internal/handlers/container_files.go).
|
|
||||||
# Pre-pull so the first provision in test_api.sh doesn't race
|
|
||||||
# the daemon's pull cache. Idempotent — `docker pull` is a no-op
|
|
||||||
# when the image is already present.
|
|
||||||
docker pull alpine:latest >/dev/null
|
|
||||||
# Provisioner attaches workspace containers to
|
|
||||||
# molecule-core-net (workspace-server/internal/provisioner/
|
|
||||||
# provisioner.go::DefaultNetwork). The bridge already exists on
|
|
||||||
# the operator host's docker daemon — `network create` is
|
|
||||||
# idempotent via `|| true`.
|
|
||||||
docker network create molecule-core-net >/dev/null 2>&1 || true
|
|
||||||
echo "alpine:latest pre-pulled; molecule-core-net ensured."
|
|
||||||
- name: Start Postgres (docker)
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: |
|
|
||||||
# Defensive cleanup — only matches THIS run's container name,
|
|
||||||
# so it cannot kill a sibling run's postgres. (Pre-fix the
|
|
||||||
# name was static and this rm hit other runs' containers.)
|
|
||||||
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
|
||||||
# `-p 0:5432` requests an ephemeral host port; we read it back
|
|
||||||
# below and export DATABASE_URL.
|
|
||||||
docker run -d --name "$PG_CONTAINER" \
|
|
||||||
-e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule \
|
|
||||||
-p 0:5432 postgres:16 >/dev/null
|
|
||||||
# Resolve the host-side port assignment. `docker port` prints
|
|
||||||
# `0.0.0.0:NNNN` (and on host-net runners may also print an
|
|
||||||
# IPv6 line — take the first IPv4 line).
|
|
||||||
PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
|
|
||||||
if [ -z "$PG_PORT" ]; then
|
|
||||||
# Fallback: any first line. Some Docker versions print only
|
|
||||||
# one line.
|
|
||||||
PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | head -1 | awk -F: '{print $NF}')
|
|
||||||
fi
|
|
||||||
if [ -z "$PG_PORT" ]; then
|
|
||||||
echo "::error::Could not resolve host port for $PG_CONTAINER"
|
|
||||||
docker port "$PG_CONTAINER" 5432/tcp || true
|
|
||||||
docker logs "$PG_CONTAINER" || true
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
# 127.0.0.1 (NOT localhost) — IPv6 first-resolve flake (#92).
|
|
||||||
echo "PG_PORT=${PG_PORT}" >> "$GITHUB_ENV"
|
|
||||||
echo "DATABASE_URL=postgres://dev:dev@127.0.0.1:${PG_PORT}/molecule?sslmode=disable" >> "$GITHUB_ENV"
|
|
||||||
echo "Postgres host port: ${PG_PORT}"
|
|
||||||
for i in $(seq 1 30); do
|
|
||||||
if docker exec "$PG_CONTAINER" pg_isready -U dev >/dev/null 2>&1; then
|
|
||||||
echo "Postgres ready after ${i}s"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
echo "::error::Postgres did not become ready in 30s"
|
|
||||||
docker logs "$PG_CONTAINER" || true
|
|
||||||
exit 1
|
|
||||||
- name: Start Redis (docker)
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: |
|
|
||||||
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
|
|
||||||
docker run -d --name "$REDIS_CONTAINER" -p 0:6379 redis:7 >/dev/null
|
|
||||||
REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
|
|
||||||
if [ -z "$REDIS_PORT" ]; then
|
|
||||||
REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | head -1 | awk -F: '{print $NF}')
|
|
||||||
fi
|
|
||||||
if [ -z "$REDIS_PORT" ]; then
|
|
||||||
echo "::error::Could not resolve host port for $REDIS_CONTAINER"
|
|
||||||
docker port "$REDIS_CONTAINER" 6379/tcp || true
|
|
||||||
docker logs "$REDIS_CONTAINER" || true
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "REDIS_PORT=${REDIS_PORT}" >> "$GITHUB_ENV"
|
|
||||||
echo "REDIS_URL=redis://127.0.0.1:${REDIS_PORT}" >> "$GITHUB_ENV"
|
|
||||||
echo "Redis host port: ${REDIS_PORT}"
|
|
||||||
for i in $(seq 1 15); do
|
|
||||||
if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then
|
|
||||||
echo "Redis ready after ${i}s"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
echo "::error::Redis did not become ready in 15s"
|
|
||||||
docker logs "$REDIS_CONTAINER" || true
|
|
||||||
exit 1
|
|
||||||
- name: Build platform
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
working-directory: workspace-server
|
|
||||||
run: go build -o platform-server ./cmd/server
|
|
||||||
- name: Start platform (background)
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
working-directory: workspace-server
|
|
||||||
run: |
|
|
||||||
# DATABASE_URL + REDIS_URL exported by the start-postgres /
|
|
||||||
# start-redis steps point at this run's per-run host ports.
|
|
||||||
./platform-server > platform.log 2>&1 &
|
|
||||||
echo $! > platform.pid
|
|
||||||
- name: Wait for /health
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: |
|
|
||||||
for i in $(seq 1 30); do
|
|
||||||
if curl -sf http://127.0.0.1:8080/health > /dev/null; then
|
|
||||||
echo "Platform up after ${i}s"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
echo "::error::Platform did not become healthy in 30s"
|
|
||||||
cat workspace-server/platform.log || true
|
|
||||||
exit 1
|
|
||||||
- name: Assert migrations applied
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: |
|
|
||||||
tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc "SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'")
|
|
||||||
if [ "$tables" != "1" ]; then
|
|
||||||
echo "::error::Migrations did not apply"
|
|
||||||
cat workspace-server/platform.log || true
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Migrations OK"
|
|
||||||
- name: Run E2E API tests
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: bash tests/e2e/test_api.sh
|
|
||||||
- name: Run notify-with-attachments E2E
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: bash tests/e2e/test_notify_attachments_e2e.sh
|
|
||||||
- name: Run priority-runtimes E2E (claude-code + hermes — skips when keys absent)
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: bash tests/e2e/test_priority_runtimes_e2e.sh
|
|
||||||
- name: Run poll-mode + since_id cursor E2E (#2339)
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: bash tests/e2e/test_poll_mode_e2e.sh
|
|
||||||
- name: Run poll-mode chat upload E2E (RFC #2891)
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: bash tests/e2e/test_poll_mode_chat_upload_e2e.sh
|
|
||||||
- name: Dump platform log on failure
|
|
||||||
if: failure() && needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: cat workspace-server/platform.log || true
|
|
||||||
- name: Stop platform
|
|
||||||
if: always() && needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: |
|
|
||||||
if [ -f workspace-server/platform.pid ]; then
|
|
||||||
kill "$(cat workspace-server/platform.pid)" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
- name: Stop service containers
|
|
||||||
# always() so containers don't leak when test steps fail. The
|
|
||||||
# cleanup is best-effort: if the container is already gone
|
|
||||||
# (e.g. concurrent rerun race), don't fail the job.
|
|
||||||
if: always() && needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: |
|
|
||||||
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
|
||||||
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
|
|
||||||
@ -1,250 +0,0 @@
|
|||||||
name: E2E Staging Canvas (Playwright)
|
|
||||||
|
|
||||||
# Ported from .github/workflows/e2e-staging-canvas.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
|
|
||||||
# Playwright test suite that provisions a fresh staging org per run and
|
|
||||||
# verifies every workspace-panel tab renders without crashing. Complements
|
|
||||||
# e2e-staging-saas.yml (which tests the API shape) by exercising the
|
|
||||||
# actual browser + canvas bundle against live staging.
|
|
||||||
#
|
|
||||||
# Triggers: push to main/staging or PR touching canvas sources + this workflow,
|
|
||||||
# manual dispatch, and weekly cron to catch browser/runtime drift even
|
|
||||||
# when canvas is quiet.
|
|
||||||
# Added staging to push/pull_request branches so the auto-promote gate
|
|
||||||
# check (--event push --branch staging) can see a completed run for this
|
|
||||||
# workflow — mirrors what PR #1891 does for e2e-api.yml.
|
|
||||||
|
|
||||||
on:
|
|
||||||
# Trigger model (revised 2026-04-29):
|
|
||||||
#
|
|
||||||
# Always fires on push/pull_request; real work is gated per-step on
|
|
||||||
# `needs.detect-changes.outputs.canvas`. When canvas/ paths haven't
|
|
||||||
# changed, the no-op step alone runs and emits SUCCESS for the
|
|
||||||
# `Canvas tabs E2E` check, satisfying branch protection without
|
|
||||||
# spending CI cycles. See e2e-api.yml for the rationale on why this
|
|
||||||
# is a single job rather than two-jobs-sharing-name.
|
|
||||||
push:
|
|
||||||
branches: [main]
|
|
||||||
pull_request:
|
|
||||||
branches: [main]
|
|
||||||
schedule:
|
|
||||||
# Weekly on Sunday 08:00 UTC — catches Chrome / Playwright / Next.js
|
|
||||||
# release-note-shaped regressions that don't ride in with a PR.
|
|
||||||
- cron: '0 8 * * 0'
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
# Per-SHA grouping (changed 2026-04-28 from a single global group). The
|
|
||||||
# global group made auto-promote-staging brittle: when a staging push
|
|
||||||
# queued behind an in-flight run and a third entrant (a PR run, a
|
|
||||||
# follow-on push) entered the group, the staging push got cancelled —
|
|
||||||
# leaving auto-promote-staging looking at `completed/cancelled` for a
|
|
||||||
# required gate and refusing to advance main. Observed 2026-04-28
|
|
||||||
# 23:51-23:53 on staging tip 3f99fede.
|
|
||||||
#
|
|
||||||
# The original intent of the global group was to throttle parallel
|
|
||||||
# E2E provisions (each spins a fresh EC2). At our scale that throttle
|
|
||||||
# isn't worth the correctness cost — fresh-org-per-run isolates the
|
|
||||||
# state, and the cost of two parallel runs (~$0.001/min × 10min × 2)
|
|
||||||
# is rounding error vs. the cost of a stuck pipeline.
|
|
||||||
#
|
|
||||||
# Per-SHA still dedupes accidental double-triggers for the SAME SHA.
|
|
||||||
# It does NOT cancel obsolete-PR-version runs on force-push; that
|
|
||||||
# wasted CI is acceptable given the alternative is losing staging-tip
|
|
||||||
# data that auto-promote-staging needs.
|
|
||||||
group: e2e-staging-canvas-${{ github.event.pull_request.head.sha || github.sha }}
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
detect-changes:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
outputs:
|
|
||||||
canvas: ${{ steps.decide.outputs.canvas }}
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
- id: decide
|
|
||||||
# Inline replacement for dorny/paths-filter — see e2e-api.yml.
|
|
||||||
# Cron triggers always run real work (no diff context).
|
|
||||||
run: |
|
|
||||||
if [ "${{ github.event_name }}" = "schedule" ]; then
|
|
||||||
echo "canvas=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
|
|
||||||
if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
|
|
||||||
BASE="${{ github.event.pull_request.base.sha }}"
|
|
||||||
fi
|
|
||||||
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
|
|
||||||
echo "canvas=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
if ! git cat-file -e "$BASE" 2>/dev/null; then
|
|
||||||
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
if ! git cat-file -e "$BASE" 2>/dev/null; then
|
|
||||||
echo "canvas=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
CHANGED=$(git diff --name-only "$BASE" HEAD)
|
|
||||||
if echo "$CHANGED" | grep -qE '^(canvas/|\.gitea/workflows/e2e-staging-canvas\.yml$)'; then
|
|
||||||
echo "canvas=true" >> "$GITHUB_OUTPUT"
|
|
||||||
else
|
|
||||||
echo "canvas=false" >> "$GITHUB_OUTPUT"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# ONE job (no job-level `if:`) that always runs and reports under the
|
|
||||||
# required-check name `Canvas tabs E2E`. Real work is gated per-step on
|
|
||||||
# `needs.detect-changes.outputs.canvas`. See e2e-api.yml for the full
|
|
||||||
# rationale — same path-filter check-name parity issue blocked PR #2264
|
|
||||||
# (staging→main) on 2026-04-29 because branch protection treats matching-
|
|
||||||
# name check runs as a SET, and any SKIPPED member fails the eval.
|
|
||||||
playwright:
|
|
||||||
needs: detect-changes
|
|
||||||
name: Canvas tabs E2E
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
timeout-minutes: 40
|
|
||||||
|
|
||||||
env:
|
|
||||||
CANVAS_E2E_STAGING: '1'
|
|
||||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
|
||||||
# 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
|
|
||||||
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
|
|
||||||
# internal#322 — see this PR for the cross-workflow sweep.
|
|
||||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
|
|
||||||
defaults:
|
|
||||||
run:
|
|
||||||
working-directory: canvas
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: No-op pass (paths filter excluded this commit)
|
|
||||||
if: needs.detect-changes.outputs.canvas != 'true'
|
|
||||||
working-directory: .
|
|
||||||
run: |
|
|
||||||
echo "No canvas / workflow changes — E2E Staging Canvas gate satisfied without running tests."
|
|
||||||
echo "::notice::E2E Staging Canvas no-op pass (paths filter excluded this commit)."
|
|
||||||
|
|
||||||
- if: needs.detect-changes.outputs.canvas == 'true'
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Verify admin token present
|
|
||||||
if: needs.detect-changes.outputs.canvas == 'true'
|
|
||||||
run: |
|
|
||||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
|
||||||
echo "::error::Missing CP_STAGING_ADMIN_API_TOKEN"
|
|
||||||
exit 2
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Set up Node
|
|
||||||
if: needs.detect-changes.outputs.canvas == 'true'
|
|
||||||
uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
|
|
||||||
with:
|
|
||||||
node-version: '20'
|
|
||||||
cache: 'npm'
|
|
||||||
cache-dependency-path: canvas/package-lock.json
|
|
||||||
|
|
||||||
- name: Install canvas deps
|
|
||||||
if: needs.detect-changes.outputs.canvas == 'true'
|
|
||||||
run: npm ci
|
|
||||||
|
|
||||||
- name: Install Playwright browsers
|
|
||||||
if: needs.detect-changes.outputs.canvas == 'true'
|
|
||||||
run: npx playwright install --with-deps chromium
|
|
||||||
|
|
||||||
- name: Run staging canvas E2E
|
|
||||||
if: needs.detect-changes.outputs.canvas == 'true'
|
|
||||||
run: npx playwright test --config=playwright.staging.config.ts
|
|
||||||
|
|
||||||
- name: Upload Playwright report on failure
|
|
||||||
if: failure() && needs.detect-changes.outputs.canvas == 'true'
|
|
||||||
# Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
|
|
||||||
# the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
|
|
||||||
# implement (see ci.yml upload step for the canonical error
|
|
||||||
# cite). Drop this pin when Gitea ships the v4 protocol.
|
|
||||||
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
|
|
||||||
with:
|
|
||||||
name: playwright-report-staging
|
|
||||||
path: canvas/playwright-report-staging/
|
|
||||||
retention-days: 14
|
|
||||||
|
|
||||||
- name: Upload screenshots on failure
|
|
||||||
if: failure() && needs.detect-changes.outputs.canvas == 'true'
|
|
||||||
# Pinned to v3 for Gitea act_runner v0.6 compatibility (see above).
|
|
||||||
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
|
|
||||||
with:
|
|
||||||
name: playwright-screenshots
|
|
||||||
path: canvas/test-results/
|
|
||||||
retention-days: 14
|
|
||||||
|
|
||||||
# Safety-net teardown — fires only when Playwright's globalTeardown
|
|
||||||
# didn't (worker crash, runner cancel). Reads the slug from
|
|
||||||
# canvas/.playwright-staging-state.json (written by staging-setup
|
|
||||||
# as its first action, before any CP call) and deletes only that
|
|
||||||
# slug.
|
|
||||||
#
|
|
||||||
# Earlier versions of this step pattern-swept `e2e-canvas-<today>-*`
|
|
||||||
# orgs to compensate for setup-crash-before-state-file-write. That
|
|
||||||
# over-aggressive cleanup raced concurrent canvas-E2E runs and
|
|
||||||
# poisoned each other's tenants — observed 2026-04-30 when three
|
|
||||||
# real-test runs killed each other mid-test, surfacing as
|
|
||||||
# `getaddrinfo ENOTFOUND` once CP had cleaned up the just-deleted
|
|
||||||
# DNS record. Pattern-sweep removed; setup now writes the state
|
|
||||||
# file before any CP work, so the slug is always recoverable.
|
|
||||||
- name: Teardown safety net
|
|
||||||
if: always() && needs.detect-changes.outputs.canvas == 'true'
|
|
||||||
env:
|
|
||||||
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set +e
|
|
||||||
STATE_FILE=".playwright-staging-state.json"
|
|
||||||
if [ ! -f "$STATE_FILE" ]; then
|
|
||||||
echo "::notice::No state file at canvas/$STATE_FILE — Playwright globalTeardown handled it (or setup never ran)."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
slug=$(python3 -c "import json; print(json.load(open('$STATE_FILE')).get('slug',''))")
|
|
||||||
if [ -z "$slug" ]; then
|
|
||||||
echo "::warning::State file present but slug missing; nothing to clean up."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
echo "Deleting orphan tenant: $slug"
|
|
||||||
# Verify HTTP 2xx instead of `>/dev/null || true` swallowing
|
|
||||||
# failures. A 5xx or timeout previously looked identical to
|
|
||||||
# success, leaving the tenant alive for up to ~45 min until
|
|
||||||
# sweep-stale-e2e-orgs caught it. Surface failures as
|
|
||||||
# workflow warnings naming the slug. Don't `exit 1` — a single
|
|
||||||
# cleanup miss shouldn't fail-flag the canvas test when the
|
|
||||||
# actual smoke check passed; the sweeper is the safety net.
|
|
||||||
# See molecule-controlplane#420.
|
|
||||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
|
||||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
|
||||||
set +e
|
|
||||||
curl -sS -o /tmp/canvas-cleanup.out -w "%{http_code}" \
|
|
||||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d "{\"confirm\":\"$slug\"}" >/tmp/canvas-cleanup.code
|
|
||||||
set -e
|
|
||||||
code=$(cat /tmp/canvas-cleanup.code 2>/dev/null || echo "000")
|
|
||||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
|
||||||
echo "[teardown] deleted $slug (HTTP $code)"
|
|
||||||
else
|
|
||||||
echo "::warning::canvas teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/canvas-cleanup.out 2>/dev/null)"
|
|
||||||
fi
|
|
||||||
exit 0
|
|
||||||
@ -1,192 +0,0 @@
|
|||||||
name: E2E Staging External Runtime
|
|
||||||
|
|
||||||
# Ported from .github/workflows/e2e-staging-external.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
|
|
||||||
# Regression for the four/five workspaces.status=awaiting_agent transitions
|
|
||||||
# that silently failed in production for five days before migration 046
|
|
||||||
# extended the workspace_status enum (see
|
|
||||||
# workspace-server/migrations/046_workspace_status_awaiting_agent.up.sql).
|
|
||||||
#
|
|
||||||
# Why this is its own workflow (not folded into e2e-staging-saas.yml):
|
|
||||||
# - The full-saas harness defaults to runtime=hermes, never exercises
|
|
||||||
# external-runtime. Adding an `external` parameter to that script
|
|
||||||
# would force every push to staging through both lifecycles in
|
|
||||||
# series, doubling the EC2 cold-start budget.
|
|
||||||
# - The external lifecycle has unique timing (REMOTE_LIVENESS_STALE_AFTER
|
|
||||||
# window, 90s default + sweep interval), which we wait through
|
|
||||||
# deliberately. Folding it into hermes would make the long path
|
|
||||||
# even longer.
|
|
||||||
# - It can run in parallel with the hermes E2E since both create
|
|
||||||
# fresh tenant orgs with distinct slug prefixes (`e2e-ext-...` vs
|
|
||||||
# `e2e-...`).
|
|
||||||
#
|
|
||||||
# Triggers:
|
|
||||||
# - Push to staging when any source affecting external runtime,
|
|
||||||
# hibernation, or the migration set changes.
|
|
||||||
# - PR review for the same set.
|
|
||||||
# - Manual workflow_dispatch.
|
|
||||||
# - Daily cron at 07:30 UTC (catches drift on quiet days; staggered
|
|
||||||
# 30 min after e2e-staging-saas.yml's 07:00 UTC cron).
|
|
||||||
#
|
|
||||||
# Concurrency: serialized so two staging pushes don't fight for the
|
|
||||||
# same EC2 quota window. cancel-in-progress=false so a half-rolled
|
|
||||||
# tenant always finishes its teardown.
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main]
|
|
||||||
paths:
|
|
||||||
- 'workspace-server/internal/handlers/workspace.go'
|
|
||||||
- 'workspace-server/internal/handlers/registry.go'
|
|
||||||
- 'workspace-server/internal/handlers/workspace_restart.go'
|
|
||||||
- 'workspace-server/internal/registry/healthsweep.go'
|
|
||||||
- 'workspace-server/internal/registry/liveness.go'
|
|
||||||
- 'workspace-server/migrations/**'
|
|
||||||
- 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
|
|
||||||
- 'tests/e2e/test_staging_external_runtime.sh'
|
|
||||||
- '.gitea/workflows/e2e-staging-external.yml'
|
|
||||||
pull_request:
|
|
||||||
branches: [main]
|
|
||||||
paths:
|
|
||||||
- 'workspace-server/internal/handlers/workspace.go'
|
|
||||||
- 'workspace-server/internal/handlers/registry.go'
|
|
||||||
- 'workspace-server/internal/handlers/workspace_restart.go'
|
|
||||||
- 'workspace-server/internal/registry/healthsweep.go'
|
|
||||||
- 'workspace-server/internal/registry/liveness.go'
|
|
||||||
- 'workspace-server/migrations/**'
|
|
||||||
- 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
|
|
||||||
- 'tests/e2e/test_staging_external_runtime.sh'
|
|
||||||
- '.gitea/workflows/e2e-staging-external.yml'
|
|
||||||
schedule:
|
|
||||||
- cron: '30 7 * * *'
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: e2e-staging-external
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
e2e-staging-external:
|
|
||||||
name: E2E Staging External Runtime
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
timeout-minutes: 25
|
|
||||||
|
|
||||||
env:
|
|
||||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
|
||||||
# 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
|
|
||||||
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
|
|
||||||
# internal#322 — see this PR for the cross-workflow sweep.
|
|
||||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
|
|
||||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
|
|
||||||
E2E_STALE_WAIT_SECS: ${{ github.event.inputs.stale_wait_secs || '180' }}
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Verify admin token present
|
|
||||||
run: |
|
|
||||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
|
||||||
# Schedule + push triggers must hard-fail when the token is
|
|
||||||
# missing — silent skip would mask infra rot. Manual dispatch
|
|
||||||
# gets the same hard-fail; an operator running this on a fork
|
|
||||||
# without secrets configured needs to know up-front.
|
|
||||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
|
|
||||||
exit 2
|
|
||||||
fi
|
|
||||||
echo "Admin token present ✓"
|
|
||||||
|
|
||||||
- name: CP staging health preflight
|
|
||||||
run: |
|
|
||||||
code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
|
|
||||||
if [ "$code" != "200" ]; then
|
|
||||||
echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Staging CP healthy ✓"
|
|
||||||
|
|
||||||
- name: Run external-runtime E2E
|
|
||||||
id: e2e
|
|
||||||
run: bash tests/e2e/test_staging_external_runtime.sh
|
|
||||||
|
|
||||||
# Mirror the e2e-staging-saas.yml safety net: if the runner is
|
|
||||||
# cancelled (e.g. concurrent staging push), the test script's
|
|
||||||
# EXIT trap may not fire, so we sweep e2e-ext-* slugs scoped to
|
|
||||||
# *this* run id.
|
|
||||||
- name: Teardown safety net (runs on cancel/failure)
|
|
||||||
if: always()
|
|
||||||
env:
|
|
||||||
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set +e
|
|
||||||
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
|
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
|
||||||
| python3 -c "
|
|
||||||
import json, sys, os, datetime
|
|
||||||
run_id = os.environ.get('GITHUB_RUN_ID', '')
|
|
||||||
d = json.load(sys.stdin)
|
|
||||||
# Scope STRICTLY to this run id (e2e-ext-YYYYMMDD-<runid>-...)
|
|
||||||
# so concurrent runs and unrelated dev probes are not touched.
|
|
||||||
# Sweep today AND yesterday so a midnight-crossing run still
|
|
||||||
# cleans up its own slug.
|
|
||||||
today = datetime.date.today()
|
|
||||||
yesterday = today - datetime.timedelta(days=1)
|
|
||||||
dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
|
|
||||||
if not run_id:
|
|
||||||
# Without a run id we cannot scope safely; bail rather
|
|
||||||
# than risk deleting unrelated tenants.
|
|
||||||
sys.exit(0)
|
|
||||||
prefixes = tuple(f'e2e-ext-{d}-{run_id}-' for d in dates)
|
|
||||||
for o in d.get('orgs', []):
|
|
||||||
s = o.get('slug', '')
|
|
||||||
if s.startswith(prefixes) and o.get('status') != 'purged':
|
|
||||||
print(s)
|
|
||||||
" 2>/dev/null)
|
|
||||||
if [ -n "$orgs" ]; then
|
|
||||||
echo "Safety-net sweep: deleting leftover orgs:"
|
|
||||||
echo "$orgs"
|
|
||||||
# Per-slug verified DELETE — see molecule-controlplane#420.
|
|
||||||
# `>/dev/null 2>&1` previously hid every failure; surface
|
|
||||||
# non-2xx as workflow warnings so the run page names what
|
|
||||||
# leaked. Sweeper catches the rest within ~45 min.
|
|
||||||
leaks=()
|
|
||||||
for slug in $orgs; do
|
|
||||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
|
||||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
|
||||||
set +e
|
|
||||||
curl -sS -o /tmp/external-cleanup.out -w "%{http_code}" \
|
|
||||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d "{\"confirm\":\"$slug\"}" >/tmp/external-cleanup.code
|
|
||||||
set -e
|
|
||||||
code=$(cat /tmp/external-cleanup.code 2>/dev/null || echo "000")
|
|
||||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
|
||||||
echo "[teardown] deleted $slug (HTTP $code)"
|
|
||||||
else
|
|
||||||
echo "::warning::external teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/external-cleanup.out 2>/dev/null)"
|
|
||||||
leaks+=("$slug")
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
if [ ${#leaks[@]} -gt 0 ]; then
|
|
||||||
echo "::warning::external teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo "Safety-net sweep: no leftover orgs to clean."
|
|
||||||
fi
|
|
||||||
@ -1,287 +0,0 @@
|
|||||||
name: E2E Staging SaaS (full lifecycle)
|
|
||||||
|
|
||||||
# Ported from .github/workflows/e2e-staging-saas.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
|
|
||||||
# Dedicated workflow that provisions a fresh staging org per run, exercises
|
|
||||||
# the full workspace lifecycle (register → heartbeat → A2A → delegation →
|
|
||||||
# HMA memory → activity → peers), then tears down and asserts leak-free.
|
|
||||||
#
|
|
||||||
# Why a separate workflow (not folded into ci.yml):
|
|
||||||
# - The run takes ~25-35 min (EC2 boot + cloudflared DNS + provision sweeps +
|
|
||||||
# agent bootstrap), way too slow for every PR.
|
|
||||||
# - Needs its own concurrency group so two pushes don't fight over the
|
|
||||||
# same staging org slug prefix.
|
|
||||||
# - Has its own required secrets (session cookie, admin token) that most
|
|
||||||
# PRs don't need to read.
|
|
||||||
#
|
|
||||||
# Triggers:
|
|
||||||
# - Push to main (regression guard — fires on merges to main, not on PR updates)
|
|
||||||
# - pull_request: pr-validate always posts success; real E2E step runs only
|
|
||||||
# when provisioning-critical files change (detect-changes gates the step).
|
|
||||||
# - workflow_dispatch (manual re-run from UI)
|
|
||||||
# - Nightly cron (catches drift even when no pushes land)
|
|
||||||
#
|
|
||||||
# NOTE: A separate pr-validate job handles the pull_request path so this
|
|
||||||
# workflow posts CI status for workflow-only PRs. Without it, a PR that
|
|
||||||
# only touches the workflow file has no status check (workflow only fires
|
|
||||||
# on push, not PR branches), which blocks merge under branch protection.
|
|
||||||
# The E2E step itself only runs when provisioning-critical files change —
|
|
||||||
# pr-validate always posts success, avoiding the double-fire that motivated
|
|
||||||
# the pull_request-trigger removal in PRs #516/#530.
|
|
||||||
|
|
||||||
on:
|
|
||||||
# Trunk-based (Phase 3 of internal#81): main is the only branch.
|
|
||||||
push:
|
|
||||||
branches: [main]
|
|
||||||
paths:
|
|
||||||
- 'workspace-server/internal/handlers/registry.go'
|
|
||||||
- 'workspace-server/internal/handlers/workspace_provision.go'
|
|
||||||
- 'workspace-server/internal/handlers/a2a_proxy.go'
|
|
||||||
- 'workspace-server/internal/middleware/**'
|
|
||||||
- 'workspace-server/internal/provisioner/**'
|
|
||||||
- 'tests/e2e/test_staging_full_saas.sh'
|
|
||||||
- '.gitea/workflows/e2e-staging-saas.yml'
|
|
||||||
pull_request:
|
|
||||||
branches: [main]
|
|
||||||
paths:
|
|
||||||
- 'workspace-server/internal/handlers/registry.go'
|
|
||||||
- 'workspace-server/internal/handlers/workspace_provision.go'
|
|
||||||
- 'workspace-server/internal/handlers/a2a_proxy.go'
|
|
||||||
- 'workspace-server/internal/middleware/**'
|
|
||||||
- 'workspace-server/internal/provisioner/**'
|
|
||||||
- 'tests/e2e/test_staging_full_saas.sh'
|
|
||||||
- '.gitea/workflows/e2e-staging-saas.yml'
|
|
||||||
workflow_dispatch:
|
|
||||||
schedule:
|
|
||||||
# 07:00 UTC every day — catches AMI drift, WorkOS cert rotation,
|
|
||||||
# Cloudflare API regressions, etc. even on quiet days.
|
|
||||||
- cron: '0 7 * * *'
|
|
||||||
|
|
||||||
# Serialize: staging has a finite per-hour org creation quota. Two pushes
|
|
||||||
# landing in quick succession should queue, not race. `cancel-in-progress:
|
|
||||||
# false` mirrors e2e-api.yml — GitHub would otherwise cancel the running
|
|
||||||
# teardown step and leave orphan EC2s.
|
|
||||||
concurrency:
|
|
||||||
group: e2e-staging-saas
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
# PR-validation path: always posts success so branch protection can merge
|
|
||||||
# workflow-only PRs. The actual E2E step only runs when provisioning-
|
|
||||||
# critical files change (git-paths filter + if: guard below).
|
|
||||||
# All steps use continue-on-error: true so runner issues do not block merge.
|
|
||||||
pr-validate:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
fetch-depth: 1
|
|
||||||
continue-on-error: true
|
|
||||||
|
|
||||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
|
||||||
with:
|
|
||||||
python-version: "3.11"
|
|
||||||
continue-on-error: true
|
|
||||||
|
|
||||||
- name: YAML validation (best-effort)
|
|
||||||
run: |
|
|
||||||
echo "e2e-staging-saas.yml — PR validation: workflow YAML is valid."
|
|
||||||
echo "E2E step runs only when provisioning-critical files change."
|
|
||||||
continue-on-error: true
|
|
||||||
|
|
||||||
# Actual E2E: runs on trunk pushes (main + staging). NOT the PR-fire-only
|
|
||||||
# path — pr-validate above posts success for workflow-only PRs.
|
|
||||||
e2e-staging-saas:
|
|
||||||
name: E2E Staging SaaS
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Only runs on trunk pushes. PR paths get pr-validate instead.
|
|
||||||
if: github.event.pull_request.base.ref == ''
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
timeout-minutes: 45
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
env:
|
|
||||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
|
||||||
# Single admin-bearer secret drives provision + tenant-token
|
|
||||||
# retrieval + teardown. Configure in
|
|
||||||
# Settings → Secrets and variables → Actions → Repository secrets.
|
|
||||||
# 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
|
|
||||||
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
|
|
||||||
# internal#322 — see this PR for the cross-workflow sweep.
|
|
||||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
# MiniMax is the PRIMARY LLM auth path post-2026-05-04. Switched
|
|
||||||
# from hermes+OpenAI default after #2578 (the staging OpenAI key
|
|
||||||
# account went over quota and stayed dead for 36+ hours, taking
|
|
||||||
# the full-lifecycle E2E red on every provisioning-critical push).
|
|
||||||
# claude-code template's `minimax` provider routes
|
|
||||||
# ANTHROPIC_BASE_URL to api.minimax.io/anthropic and reads
|
|
||||||
# MINIMAX_API_KEY at boot — separate billing account so an
|
|
||||||
# OpenAI quota collapse no longer wedges the gate. Mirrors the
|
|
||||||
# staging-smoke.yml + continuous-synth-e2e.yml migrations.
|
|
||||||
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
|
|
||||||
# Direct-Anthropic alternative for operators who don't want to
|
|
||||||
# set up a MiniMax account (priority below MiniMax — first
|
|
||||||
# non-empty wins in test_staging_full_saas.sh's secrets-injection
|
|
||||||
# block). See #2578 PR comment for the rationale.
|
|
||||||
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
|
|
||||||
# OpenAI fallback — kept wired so an operator-dispatched run with
|
|
||||||
# E2E_RUNTIME=hermes or =langgraph via workflow_dispatch can still
|
|
||||||
# exercise the OpenAI path.
|
|
||||||
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
|
|
||||||
E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }}
|
|
||||||
# Pin the model when running on the default claude-code path —
|
|
||||||
# the per-runtime default ("sonnet") routes to direct Anthropic
|
|
||||||
# and defeats the cost saving. Operators can override via the
|
|
||||||
# workflow_dispatch flow (no input wired here yet — runtime
|
|
||||||
# override is enough for ad-hoc).
|
|
||||||
E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'langgraph' && 'openai:gpt-4o' || 'MiniMax-M2.7-highspeed' }}
|
|
||||||
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
|
|
||||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Verify admin token present
|
|
||||||
run: |
|
|
||||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
|
||||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
|
|
||||||
exit 2
|
|
||||||
fi
|
|
||||||
echo "Admin token present ✓"
|
|
||||||
|
|
||||||
- name: Verify LLM key present
|
|
||||||
run: |
|
|
||||||
# Per-runtime key check — claude-code uses MiniMax; hermes /
|
|
||||||
# langgraph (operator-dispatched only) use OpenAI. Hard-fail
|
|
||||||
# rather than soft-skip per #2578's lesson — empty key
|
|
||||||
# silently falls through to the wrong SECRETS_JSON branch and
|
|
||||||
# produces a confusing auth error 5 min later instead of the
|
|
||||||
# clean "secret missing" message at the top.
|
|
||||||
case "${E2E_RUNTIME}" in
|
|
||||||
claude-code)
|
|
||||||
# Either MiniMax OR direct-Anthropic works — first
|
|
||||||
# non-empty wins in the test script's secrets-injection
|
|
||||||
# priority chain.
|
|
||||||
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
|
||||||
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
|
|
||||||
required_secret_value="${E2E_MINIMAX_API_KEY}"
|
|
||||||
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
|
|
||||||
required_secret_name="MOLECULE_STAGING_ANTHROPIC_API_KEY"
|
|
||||||
required_secret_value="${E2E_ANTHROPIC_API_KEY}"
|
|
||||||
else
|
|
||||||
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY or MOLECULE_STAGING_ANTHROPIC_API_KEY"
|
|
||||||
required_secret_value=""
|
|
||||||
fi
|
|
||||||
;;
|
|
||||||
langgraph|hermes)
|
|
||||||
required_secret_name="MOLECULE_STAGING_OPENAI_API_KEY"
|
|
||||||
required_secret_value="${E2E_OPENAI_API_KEY:-}"
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
|
|
||||||
required_secret_name=""
|
|
||||||
required_secret_value="present"
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
|
|
||||||
echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — workspaces will fail at boot with 'No provider API key found'"
|
|
||||||
exit 2
|
|
||||||
fi
|
|
||||||
echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"
|
|
||||||
|
|
||||||
- name: CP staging health preflight
|
|
||||||
run: |
|
|
||||||
code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
|
|
||||||
if [ "$code" != "200" ]; then
|
|
||||||
echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Staging CP healthy ✓"
|
|
||||||
|
|
||||||
- name: Run full-lifecycle E2E
|
|
||||||
id: e2e
|
|
||||||
run: bash tests/e2e/test_staging_full_saas.sh
|
|
||||||
|
|
||||||
# Belt-and-braces teardown: the test script itself installs a trap
|
|
||||||
# for EXIT/INT/TERM, but if the GH runner itself is cancelled (e.g.
|
|
||||||
# someone pushes a new commit and workflow concurrency is set to
|
|
||||||
# cancel), the trap may not fire. This `always()` step runs even on
|
|
||||||
# cancellation and attempts the delete a second time. The admin
|
|
||||||
# DELETE endpoint is idempotent so double-invoking is safe.
|
|
||||||
- name: Teardown safety net (runs on cancel/failure)
|
|
||||||
if: always()
|
|
||||||
env:
|
|
||||||
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
run: |
|
|
||||||
# Best-effort: find any e2e-YYYYMMDD-* orgs matching this run and
|
|
||||||
# nuke them. Catches the case where the script died before
|
|
||||||
# exporting its slug.
|
|
||||||
set +e
|
|
||||||
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
|
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
|
||||||
| python3 -c "
|
|
||||||
import json, sys, os, datetime
|
|
||||||
run_id = os.environ.get('GITHUB_RUN_ID', '')
|
|
||||||
d = json.load(sys.stdin)
|
|
||||||
# ONLY sweep slugs from *this* CI run. Previously the filter was
|
|
||||||
# f'e2e-{today}-' which stomped on parallel CI runs AND any manual
|
|
||||||
# E2E probes a dev was running against staging (incident 2026-04-21
|
|
||||||
# 15:02Z: this workflow's safety net deleted an unrelated manual
|
|
||||||
# run's tenant 1s after it hit 'running').
|
|
||||||
# Sweep both today AND yesterday's UTC dates so a run that crosses
|
|
||||||
# midnight still matches its own slug — see the 2026-04-26→27
|
|
||||||
# canvas-safety-net incident for the same bug class.
|
|
||||||
today = datetime.date.today()
|
|
||||||
yesterday = today - datetime.timedelta(days=1)
|
|
||||||
dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
|
|
||||||
if run_id:
|
|
||||||
prefixes = tuple(f'e2e-{d}-{run_id}-' for d in dates)
|
|
||||||
else:
|
|
||||||
prefixes = tuple(f'e2e-{d}-' for d in dates)
|
|
||||||
candidates = [o['slug'] for o in d.get('orgs', [])
|
|
||||||
if any(o.get('slug','').startswith(p) for p in prefixes)
|
|
||||||
and o.get('instance_status') not in ('purged',)]
|
|
||||||
print('\n'.join(candidates))
|
|
||||||
" 2>/dev/null)
|
|
||||||
# Per-slug verified DELETE (was `>/dev/null || true` — see
|
|
||||||
# molecule-controlplane#420). Surface non-2xx as a workflow
|
|
||||||
# warning naming the leaked slug; don't exit 1 (sweeper is
|
|
||||||
# the safety net within ~45 min).
|
|
||||||
leaks=()
|
|
||||||
for slug in $orgs; do
|
|
||||||
echo "Safety-net teardown: $slug"
|
|
||||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
|
||||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
|
||||||
set +e
|
|
||||||
curl -sS -o /tmp/saas-cleanup.out -w "%{http_code}" \
|
|
||||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d "{\"confirm\":\"$slug\"}" >/tmp/saas-cleanup.code
|
|
||||||
set -e
|
|
||||||
code=$(cat /tmp/saas-cleanup.code 2>/dev/null || echo "000")
|
|
||||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
|
||||||
echo "[teardown] deleted $slug (HTTP $code)"
|
|
||||||
else
|
|
||||||
echo "::warning::saas teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/saas-cleanup.out 2>/dev/null)"
|
|
||||||
leaks+=("$slug")
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
if [ ${#leaks[@]} -gt 0 ]; then
|
|
||||||
echo "::warning::saas teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
|
||||||
fi
|
|
||||||
exit 0
|
|
||||||
@ -1,166 +0,0 @@
|
|||||||
name: E2E Staging Sanity (leak-detection self-check)
|
|
||||||
|
|
||||||
# Ported from .github/workflows/e2e-staging-sanity.yml on 2026-05-11 per
|
|
||||||
# RFC internal#219 §1 sweep.
|
|
||||||
#
|
|
||||||
# Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch:` (Gitea 1.22.6 finicky on bare dispatch).
|
|
||||||
# - `actions/github-script@v9` issue-open block replaced with curl
|
|
||||||
# calls to the Gitea REST API (/api/v1/repos/.../issues|comments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL set.
|
|
||||||
# - `continue-on-error: true` on the job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
# Periodic assertion that the teardown safety nets in e2e-staging-saas
|
|
||||||
# and staging-smoke (formerly canary-staging) actually work. Runs the
|
|
||||||
# E2E harness with E2E_INTENTIONAL_FAILURE=1, which poisons the tenant
|
|
||||||
# admin token after the org is provisioned. The workspace-provision
|
|
||||||
# step then fails, the script exits non-zero, and the EXIT trap +
|
|
||||||
# workflow always()-step must still tear down cleanly.
|
|
||||||
|
|
||||||
on:
|
|
||||||
schedule:
|
|
||||||
- cron: '0 6 * * 1'
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: e2e-staging-sanity
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
issues: write
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
sanity:
|
|
||||||
name: Intentional-failure teardown sanity
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
timeout-minutes: 20
|
|
||||||
|
|
||||||
env:
|
|
||||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
|
||||||
# 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
|
|
||||||
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
|
|
||||||
# internal#322 — see this PR for the cross-workflow sweep.
|
|
||||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
E2E_MODE: smoke
|
|
||||||
E2E_RUNTIME: hermes
|
|
||||||
E2E_RUN_ID: "sanity-${{ github.run_id }}"
|
|
||||||
E2E_INTENTIONAL_FAILURE: "1"
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Verify admin token present
|
|
||||||
run: |
|
|
||||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
|
||||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
|
|
||||||
exit 2
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Inverted assertion: the run MUST fail. If it passes, the
|
|
||||||
# E2E_INTENTIONAL_FAILURE path is broken.
|
|
||||||
- name: Run harness — expecting exit !=0
|
|
||||||
id: harness
|
|
||||||
run: |
|
|
||||||
set +e
|
|
||||||
bash tests/e2e/test_staging_full_saas.sh
|
|
||||||
rc=$?
|
|
||||||
echo "harness_rc=$rc" >> "$GITHUB_OUTPUT"
|
|
||||||
if [ "$rc" = "1" ]; then
|
|
||||||
echo "OK Harness failed as expected (rc=1); teardown trap ran, leak-check passed"
|
|
||||||
exit 0
|
|
||||||
elif [ "$rc" = "0" ]; then
|
|
||||||
echo "::error::Harness succeeded under E2E_INTENTIONAL_FAILURE=1 — the poisoning path is broken"
|
|
||||||
exit 1
|
|
||||||
elif [ "$rc" = "4" ]; then
|
|
||||||
echo "::error::LEAK DETECTED (rc=4) — teardown failed to clean up the org. Safety net broken."
|
|
||||||
exit 4
|
|
||||||
else
|
|
||||||
echo "::error::Unexpected rc=$rc — neither clean-failure nor leak. Investigate harness."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Open issue if safety net is broken (Gitea API)
|
|
||||||
if: failure()
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
|
|
||||||
RUN_ID: ${{ github.run_id }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
API="${SERVER_URL%/}/api/v1"
|
|
||||||
TITLE="E2E teardown safety net broken"
|
|
||||||
RUN_URL="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
|
|
||||||
|
|
||||||
BODY_JSON=$(jq -nc --arg t "$TITLE" --arg run "$RUN_URL" '
|
|
||||||
{title: $t,
|
|
||||||
body: ("The weekly sanity run (E2E_INTENTIONAL_FAILURE=1) did not exit as expected. This means one of:\n - poisoning did not actually cause failure (test harness regression), OR\n - teardown left an orphan org (leak detection caught a real bug)\n\nRun: " + $run + "\n\nThis is higher priority than a canary failure — the whole E2E safety net cannot be trusted until this is resolved.")}')
|
|
||||||
|
|
||||||
EXISTING=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
|
|
||||||
"${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
|
|
||||||
| jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number' | head -1)
|
|
||||||
|
|
||||||
if [ -n "$EXISTING" ]; then
|
|
||||||
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
|
|
||||||
"${API}/repos/${REPO}/issues/${EXISTING}/comments" \
|
|
||||||
-d "$(jq -nc --arg run "$RUN_URL" '{body: ("Still broken. " + $run)}')" >/dev/null
|
|
||||||
echo "Commented on existing issue #${EXISTING}"
|
|
||||||
else
|
|
||||||
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
|
|
||||||
"${API}/repos/${REPO}/issues" -d "$BODY_JSON" >/dev/null
|
|
||||||
echo "Filed new issue"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Belt-and-braces: if teardown left anything behind, nuke it here
|
|
||||||
# so we don't bleed staging quota.
|
|
||||||
- name: Teardown safety net
|
|
||||||
if: always()
|
|
||||||
env:
|
|
||||||
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set +e
|
|
||||||
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
|
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
|
||||||
| python3 -c "
|
|
||||||
import json, sys
|
|
||||||
d = json.load(sys.stdin)
|
|
||||||
today = __import__('datetime').date.today().strftime('%Y%m%d')
|
|
||||||
# Match both the new e2e-smoke- prefix (post-2026-05-11 rename)
|
|
||||||
# and the legacy e2e-canary- prefix for one rollout cycle so
|
|
||||||
# any in-flight org provisioned under the old prefix on an
|
|
||||||
# older runner checkout still gets cleaned up. Remove the
|
|
||||||
# canary fallback after one week of no-old-prefix observations.
|
|
||||||
prefixes = (f'e2e-smoke-{today}-sanity-', f'e2e-canary-{today}-sanity-')
|
|
||||||
candidates = [o['slug'] for o in d.get('orgs', [])
|
|
||||||
if any(o.get('slug','').startswith(p) for p in prefixes)
|
|
||||||
and o.get('status') not in ('purged',)]
|
|
||||||
print('\n'.join(candidates))
|
|
||||||
" 2>/dev/null)
|
|
||||||
leaks=()
|
|
||||||
for slug in $orgs; do
|
|
||||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
|
||||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
|
||||||
set +e
|
|
||||||
curl -sS -o /tmp/sanity-cleanup.out -w "%{http_code}" \
|
|
||||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d "{\"confirm\":\"$slug\"}" >/tmp/sanity-cleanup.code
|
|
||||||
set -e
|
|
||||||
code=$(cat /tmp/sanity-cleanup.code 2>/dev/null || echo "000")
|
|
||||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
|
||||||
echo "[teardown] deleted $slug (HTTP $code)"
|
|
||||||
else
|
|
||||||
echo "::warning::sanity teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/sanity-cleanup.out 2>/dev/null)"
|
|
||||||
leaks+=("$slug")
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
if [ ${#leaks[@]} -gt 0 ]; then
|
|
||||||
echo "::warning::sanity teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
|
||||||
fi
|
|
||||||
exit 0
|
|
||||||
@ -1,105 +0,0 @@
|
|||||||
# gate-check-v3 — automated PR gate detector
|
|
||||||
#
|
|
||||||
# Runs on every open PR (push/synchronize) and hourly via cron.
|
|
||||||
# Posts a structured [gate-check-v3] STATUS: comment on the PR.
|
|
||||||
#
|
|
||||||
# Inputs:
|
|
||||||
# PR_NUMBER — set via ${{ github.event.pull_request.number }} from the trigger
|
|
||||||
# POST_COMMENT — "true" to post/update comment on PR
|
|
||||||
#
|
|
||||||
# Gating logic (MVP signals 1,2,3,6):
|
|
||||||
# 1. Author-aware agent-tag comment scan
|
|
||||||
# 2. REQUEST_CHANGES reviews state machine
|
|
||||||
# 3. Staleness detection (SOP-12: review.commit_id != PR.head_sha + >1 working day)
|
|
||||||
# 6. CI required-checks awareness
|
|
||||||
#
|
|
||||||
# Exit code: 0=CLEAR, 1=BLOCKED, 2=ERROR
|
|
||||||
|
|
||||||
name: gate-check-v3
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request_target:
|
|
||||||
types: [opened, edited, synchronize, reopened]
|
|
||||||
schedule:
|
|
||||||
# Hourly: refresh all open PRs
|
|
||||||
- cron: '8 * * * *'
|
|
||||||
# NOTE: `workflow_dispatch.inputs` block intentionally omitted.
|
|
||||||
# Gitea 1.22.6 parser rejects `workflow_dispatch.inputs.X` with
|
|
||||||
# "unknown on type" — it mis-treats the inputs sub-keys as top-level
|
|
||||||
# `on:` event types. Dropping the inputs block restores parsing.
|
|
||||||
# Manual dispatch from the Gitea UI works without the inputs schema
|
|
||||||
# (github.event.inputs.X returns empty); the script falls back to
|
|
||||||
# iterating all open PRs when PR_NUMBER is empty.
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
# read: contents — for checkout (base ref, not PR head for security)
|
|
||||||
# read: pull-requests — for reading PR info via API
|
|
||||||
# write: pull-requests — for posting/updating gate-check comments
|
|
||||||
# Without this the token cannot POST/PATCH /issues/comments → 403.
|
|
||||||
contents: read
|
|
||||||
pull-requests: write
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
gate-check:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
continue-on-error: true # Never block on our own detector failing
|
|
||||||
steps:
|
|
||||||
- name: Check out BASE ref (never PR-head under pull_request_target)
|
|
||||||
# pull_request_target runs with repo secrets-context, so checking out
|
|
||||||
# the PR HEAD would execute PR-branch gate_check.py with secrets.
|
|
||||||
# Fix: always load gate_check.py from the trusted base/default ref.
|
|
||||||
# Bug-1 (self-loop exclusion) + Bug-3 (403→exit0) from #547 are
|
|
||||||
# kept; only this checkout-ref regresses to pre-#547 behavior.
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
ref: ${{ github.event.pull_request.base.sha || github.ref_name }}
|
|
||||||
|
|
||||||
- name: Run gate-check-v3 (single PR mode)
|
|
||||||
if: github.event_name == 'pull_request_target' || github.event.inputs.pr_number != ''
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
|
||||||
PR_NUMBER: ${{ github.event.pull_request.number || github.event.inputs.pr_number }}
|
|
||||||
POST_COMMENT: ${{ github.event.inputs.post_comment || 'true' }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
python3 tools/gate-check-v3/gate_check.py \
|
|
||||||
--repo "${{ github.repository }}" \
|
|
||||||
--pr "$PR_NUMBER" \
|
|
||||||
$([ "$POST_COMMENT" = "true" ] && echo "--post-comment")
|
|
||||||
echo "verdict=$?" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Run gate-check-v3 (all open PRs — cron mode)
|
|
||||||
if: github.event_name == 'schedule'
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
# Fetch all open PRs and run gate-check on each
|
|
||||||
# socket.setdefaulttimeout(15): defence-in-depth for missing SOP_TIER_CHECK_TOKEN.
|
|
||||||
# gate_check.py uses timeout=15 on every urlopen call; this catches the
|
|
||||||
# inline Python polling loop too (issue #603).
|
|
||||||
pr_numbers=$(python3 -c "
|
|
||||||
import socket, urllib.request, json, os
|
|
||||||
socket.setdefaulttimeout(15)
|
|
||||||
token = os.environ['GITEA_TOKEN']
|
|
||||||
req = urllib.request.Request(
|
|
||||||
'https://git.moleculesai.app/api/v1/repos/${{ github.repository }}/pulls?state=open&limit=100',
|
|
||||||
headers={'Authorization': f'token {token}', 'Accept': 'application/json'}
|
|
||||||
)
|
|
||||||
with urllib.request.urlopen(req) as r:
|
|
||||||
prs = json.loads(r.read())
|
|
||||||
for pr in prs:
|
|
||||||
print(pr['number'])
|
|
||||||
")
|
|
||||||
for pr in $pr_numbers; do
|
|
||||||
echo "Checking PR #$pr..."
|
|
||||||
python3 tools/gate-check-v3/gate_check.py \
|
|
||||||
--repo "${{ github.repository }}" \
|
|
||||||
--pr "$pr" \
|
|
||||||
--post-comment \
|
|
||||||
|| true
|
|
||||||
done
|
|
||||||
@ -1,282 +0,0 @@
|
|||||||
name: Handlers Postgres Integration
|
|
||||||
|
|
||||||
# Ported from .github/workflows/handlers-postgres-integration.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
|
|
||||||
# Real-Postgres integration tests for workspace-server/internal/handlers/.
|
|
||||||
# Triggered on every PR/push that touches the handlers package.
|
|
||||||
#
|
|
||||||
# Why this workflow exists
|
|
||||||
# ------------------------
|
|
||||||
# Strict-sqlmock unit tests pin which SQL statements fire — they're fast
|
|
||||||
# and let us iterate without a DB. But sqlmock CANNOT detect bugs that
|
|
||||||
# depend on the row state AFTER the SQL runs. The result_preview-lost
|
|
||||||
# bug shipped to staging in PR #2854 because every unit test was
|
|
||||||
# satisfied with "an UPDATE statement fired" — none verified the row's
|
|
||||||
# preview field actually landed. The local-postgres E2E that retrofit
|
|
||||||
# self-review caught it took 2 minutes to set up and would have caught
|
|
||||||
# the bug at PR-time.
|
|
||||||
#
|
|
||||||
# Why this workflow does NOT use `services: postgres:` (Class B fix)
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Our act_runner config has `container.network: host` (operator host
|
|
||||||
# /opt/molecule/runners/config.yaml), which act_runner applies to BOTH
|
|
||||||
# the job container AND every service container. With host-net, two
|
|
||||||
# concurrent runs of this workflow both try to bind 0.0.0.0:5432 — the
|
|
||||||
# second postgres FATALs with `could not create any TCP/IP sockets:
|
|
||||||
# Address in use`, and Docker auto-removes it (act_runner sets
|
|
||||||
# AutoRemove:true on service containers). By the time the migrations
|
|
||||||
# step runs `psql`, the postgres container is gone, hence
|
|
||||||
# `Connection refused` then `failed to remove container: No such
|
|
||||||
# container` at cleanup time.
|
|
||||||
#
|
|
||||||
# Per-job `container.network` override is silently ignored by
|
|
||||||
# act_runner — `--network and --net in the options will be ignored.`
|
|
||||||
# appears in the runner log. Documented constraint.
|
|
||||||
#
|
|
||||||
# So we sidestep `services:` entirely. The job container still uses
|
|
||||||
# host-net (inherited from runner config; required for cache server
|
|
||||||
# discovery on the bridge IP 172.18.0.17:42631). We launch a sibling
|
|
||||||
# postgres on the existing `molecule-core-net` bridge with a
|
|
||||||
# UNIQUE name per run — `pg-handlers-${RUN_ID}-${RUN_ATTEMPT}` — and
|
|
||||||
# read its bridge IP via `docker inspect`. A host-net job container
|
|
||||||
# can reach a bridge-net container directly via the bridge IP (verified
|
|
||||||
# manually on operator host 2026-05-08).
|
|
||||||
#
|
|
||||||
# Trade-offs vs. the original `services:` shape:
|
|
||||||
# + No host-port collision; N parallel runs share the bridge cleanly
|
|
||||||
# + `if: always()` cleanup runs even on test-step failure
|
|
||||||
# - One more step in the workflow (+~3 lines)
|
|
||||||
# - Requires `molecule-core-net` to exist on the operator host
|
|
||||||
# (it does; declared in docker-compose.yml + docker-compose.infra.yml)
|
|
||||||
#
|
|
||||||
# Class B Hongming-owned CICD red sweep, 2026-05-08.
|
|
||||||
#
|
|
||||||
# Cost: ~30s job (postgres pull from cache + go build + 4 tests).
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
pull_request:
|
|
||||||
branches: [main, staging]
|
|
||||||
concurrency:
|
|
||||||
group: handlers-pg-integ-${{ github.event.pull_request.head.sha || github.sha }}
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
detect-changes:
|
|
||||||
name: detect-changes
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
outputs:
|
|
||||||
handlers: ${{ steps.filter.outputs.handlers }}
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
- id: filter
|
|
||||||
# Inline replacement for dorny/paths-filter — see e2e-api.yml.
|
|
||||||
run: |
|
|
||||||
BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
|
|
||||||
if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
|
|
||||||
BASE="${{ github.event.pull_request.base.sha }}"
|
|
||||||
fi
|
|
||||||
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
|
|
||||||
echo "handlers=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
if ! git cat-file -e "$BASE" 2>/dev/null; then
|
|
||||||
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
if ! git cat-file -e "$BASE" 2>/dev/null; then
|
|
||||||
echo "handlers=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
CHANGED=$(git diff --name-only "$BASE" HEAD)
|
|
||||||
if echo "$CHANGED" | grep -qE '^(workspace-server/internal/handlers/|workspace-server/internal/wsauth/|workspace-server/migrations/|\.gitea/workflows/handlers-postgres-integration\.yml$)'; then
|
|
||||||
echo "handlers=true" >> "$GITHUB_OUTPUT"
|
|
||||||
else
|
|
||||||
echo "handlers=false" >> "$GITHUB_OUTPUT"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Single-job-with-per-step-if pattern: always runs to satisfy the
|
|
||||||
# required-check name on branch protection; real work gates on the
|
|
||||||
# paths filter. See ci.yml's Platform (Go) for the same shape.
|
|
||||||
integration:
|
|
||||||
name: Handlers Postgres Integration
|
|
||||||
needs: detect-changes
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
env:
|
|
||||||
# Unique name per run so concurrent jobs don't collide on the
|
|
||||||
# bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
|
|
||||||
# workflow_dispatch reruns of the same run_id.
|
|
||||||
PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
|
|
||||||
# Bridge network already exists on the operator host (declared
|
|
||||||
# in docker-compose.yml + docker-compose.infra.yml).
|
|
||||||
PG_NETWORK: molecule-core-net
|
|
||||||
defaults:
|
|
||||||
run:
|
|
||||||
working-directory: workspace-server
|
|
||||||
steps:
|
|
||||||
- if: needs.detect-changes.outputs.handlers != 'true'
|
|
||||||
working-directory: .
|
|
||||||
run: echo "No handlers/migrations changes — skipping; this job always runs to satisfy the required-check name."
|
|
||||||
|
|
||||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
|
||||||
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
|
||||||
with:
|
|
||||||
go-version: 'stable'
|
|
||||||
|
|
||||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
|
||||||
name: Start sibling Postgres on bridge network
|
|
||||||
working-directory: .
|
|
||||||
run: |
|
|
||||||
# Sanity: the bridge network must exist on the operator host.
|
|
||||||
# Hard-fail loud if it doesn't — easier to spot than a silent
|
|
||||||
# auto-create that diverges from the rest of the stack.
|
|
||||||
if ! docker network inspect "${PG_NETWORK}" >/dev/null 2>&1; then
|
|
||||||
echo "::error::Bridge network '${PG_NETWORK}' missing on operator host. Re-run docker-compose.infra.yml or check ops handbook."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# If a stale container with the same name exists (rerun on
|
|
||||||
# the same run_id), wipe it first.
|
|
||||||
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
|
|
||||||
|
|
||||||
docker run -d \
|
|
||||||
--name "${PG_NAME}" \
|
|
||||||
--network "${PG_NETWORK}" \
|
|
||||||
--health-cmd "pg_isready -U postgres" \
|
|
||||||
--health-interval 5s \
|
|
||||||
--health-timeout 5s \
|
|
||||||
--health-retries 10 \
|
|
||||||
-e POSTGRES_PASSWORD=test \
|
|
||||||
-e POSTGRES_DB=molecule \
|
|
||||||
postgres:15-alpine >/dev/null
|
|
||||||
|
|
||||||
# Read back the bridge IP. Always present immediately after
|
|
||||||
# `docker run -d` for bridge networks.
|
|
||||||
PG_HOST=$(docker inspect "${PG_NAME}" \
|
|
||||||
--format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}")
|
|
||||||
if [ -z "${PG_HOST}" ]; then
|
|
||||||
echo "::error::Could not resolve PG_HOST for ${PG_NAME} on ${PG_NETWORK}"
|
|
||||||
docker logs "${PG_NAME}" || true
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV"
|
|
||||||
echo "INTEGRATION_DB_URL=postgres://postgres:test@${PG_HOST}:5432/molecule?sslmode=disable" >> "$GITHUB_ENV"
|
|
||||||
echo "Started ${PG_NAME} at ${PG_HOST}:5432"
|
|
||||||
|
|
||||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
|
||||||
name: Apply migrations to Postgres service
|
|
||||||
env:
|
|
||||||
PGPASSWORD: test
|
|
||||||
run: |
|
|
||||||
# Wait for postgres to actually accept connections. Docker's
|
|
||||||
# health-cmd handles container-side readiness, but the wire
|
|
||||||
# to the bridge IP is best-tested with pg_isready directly.
|
|
||||||
for i in {1..15}; do
|
|
||||||
if pg_isready -h "${PG_HOST}" -p 5432 -U postgres -q; then break; fi
|
|
||||||
echo "waiting for postgres at ${PG_HOST}:5432..."; sleep 2
|
|
||||||
done
|
|
||||||
|
|
||||||
# Apply every .up.sql in lexicographic order with
|
|
||||||
# ON_ERROR_STOP=0 — failing migrations are SKIPPED rather than
|
|
||||||
# blocking the suite. This handles the current schema state
|
|
||||||
# where a few historical migrations (e.g. 017_memories_fts_*)
|
|
||||||
# depend on tables that were later renamed/dropped and so
|
|
||||||
# cannot replay from scratch. The migrations that DO succeed
|
|
||||||
# land their tables, which is sufficient for the integration
|
|
||||||
# tests in handlers/.
|
|
||||||
#
|
|
||||||
# Why not maintain a curated allowlist: every new migration
|
|
||||||
# touching a handlers/-tested table would have to update this
|
|
||||||
# workflow. With apply-all-or-skip, a future migration that
|
|
||||||
# adds a column to delegations runs automatically (its base
|
|
||||||
# table 049_delegations.up.sql already succeeded above it in
|
|
||||||
# the order). Operators only need to revisit this if the
|
|
||||||
# migration chain becomes legitimately replayable end-to-end.
|
|
||||||
#
|
|
||||||
# Per-migration result is logged so a failed migration that
|
|
||||||
# SHOULD have been replayable surfaces in the CI log instead
|
|
||||||
# of silently failing.
|
|
||||||
# Apply both *.sql (legacy, lives next to its module) and
|
|
||||||
# *.up.sql (newer up/down convention) in a single
|
|
||||||
# lexicographically-sorted pass. Excluding *.down.sql so the
|
|
||||||
# newest-naming-convention pairs don't undo themselves mid-run.
|
|
||||||
# Pre-#149-followup this loop only globbed *.up.sql, which
|
|
||||||
# silently skipped 001_workspaces.sql + 009_activity_logs.sql
|
|
||||||
# — fine while no integration test depended on those tables,
|
|
||||||
# not fine once a cross-table atomicity test came in.
|
|
||||||
set +e
|
|
||||||
for migration in $(ls migrations/*.sql 2>/dev/null | grep -v '\.down\.sql$' | sort); do
|
|
||||||
if psql -h "${PG_HOST}" -U postgres -d molecule -v ON_ERROR_STOP=1 \
|
|
||||||
-f "$migration" >/dev/null 2>&1; then
|
|
||||||
echo "✓ $(basename "$migration")"
|
|
||||||
else
|
|
||||||
echo "⊘ $(basename "$migration") (skipped — see comment in workflow)"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
set -e
|
|
||||||
|
|
||||||
# Sanity: the delegations + workspaces + activity_logs tables
|
|
||||||
# MUST exist for the integration tests to be meaningful. Hard-
|
|
||||||
# fail if any didn't land — that would be a real regression we
|
|
||||||
# want loud.
|
|
||||||
for tbl in delegations workspaces activity_logs pending_uploads; do
|
|
||||||
if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
|
|
||||||
-c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
|
|
||||||
| grep -q 1; then
|
|
||||||
echo "::error::$tbl table missing after migration replay — handler integration tests would be meaningless"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "✓ $tbl table present"
|
|
||||||
done
|
|
||||||
|
|
||||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
|
||||||
name: Run integration tests
|
|
||||||
run: |
|
|
||||||
# INTEGRATION_DB_URL is exported by the start-postgres step;
|
|
||||||
# points at the per-run bridge IP, not 127.0.0.1, so concurrent
|
|
||||||
# workflow runs don't fight over a host-net 5432 port.
|
|
||||||
go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_"
|
|
||||||
|
|
||||||
- if: failure() && needs.detect-changes.outputs.handlers == 'true'
|
|
||||||
name: Diagnostic dump on failure
|
|
||||||
env:
|
|
||||||
PGPASSWORD: test
|
|
||||||
run: |
|
|
||||||
echo "::group::postgres container status"
|
|
||||||
docker ps -a --filter "name=${PG_NAME}" --format '{{.Status}} {{.Names}}' || true
|
|
||||||
docker logs "${PG_NAME}" 2>&1 | tail -50 || true
|
|
||||||
echo "::endgroup::"
|
|
||||||
echo "::group::delegations table state"
|
|
||||||
psql -h "${PG_HOST}" -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
|
|
||||||
echo "::endgroup::"
|
|
||||||
|
|
||||||
- if: always() && needs.detect-changes.outputs.handlers == 'true'
|
|
||||||
name: Stop sibling Postgres
|
|
||||||
working-directory: .
|
|
||||||
run: |
|
|
||||||
# always() so containers don't leak when migrations or tests
|
|
||||||
# fail. The cleanup is best-effort: if the container is
|
|
||||||
# already gone (e.g. concurrent rerun race), don't fail the job.
|
|
||||||
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
|
|
||||||
echo "Cleaned up ${PG_NAME}"
|
|
||||||
@ -1,302 +0,0 @@
|
|||||||
name: Harness Replays
|
|
||||||
|
|
||||||
# Ported from .github/workflows/harness-replays.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
|
|
||||||
# Boots tests/harness (production-shape compose topology with TenantGuard,
|
|
||||||
# /cp/* proxy, canvas proxy, real production Dockerfile.tenant) and runs
|
|
||||||
# every replay under tests/harness/replays/. Fails the PR if any replay
|
|
||||||
# fails.
|
|
||||||
#
|
|
||||||
# Why this exists: 2026-04-30 we shipped #2398 which added /buildinfo as
|
|
||||||
# a public route in router.go but forgot to add it to TenantGuard's
|
|
||||||
# allowlist. The handler-level test in buildinfo_test.go constructed a
|
|
||||||
# minimal gin engine without TenantGuard — green. The harness's
|
|
||||||
# buildinfo-stale-image.sh replay would have caught it (cf-proxy doesn't
|
|
||||||
# inject X-Molecule-Org-Id, so the curl path is identical to production's
|
|
||||||
# redeploy verifier), but no one ran the harness pre-merge. The bug
|
|
||||||
# shipped; the redeploy verifier silently soft-warned every tenant as
|
|
||||||
# "unreachable" for ~1 day before being noticed.
|
|
||||||
#
|
|
||||||
# This gate makes "did you actually run the harness?" a CI invariant
|
|
||||||
# instead of a memory-discipline thing.
|
|
||||||
#
|
|
||||||
# Trigger model — match e2e-api.yml: always FIRES on push/pull_request
|
|
||||||
# to staging+main, real work is gated per-step on detect-changes output.
|
|
||||||
# One job → one check run → branch-protection-clean (the SKIPPED-in-set
|
|
||||||
# trap from PR #2264 is documented in e2e-api.yml's e2e-api job comment).
|
|
||||||
|
|
||||||
"on":
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
paths:
|
|
||||||
- 'workspace-server/**'
|
|
||||||
- 'canvas/**'
|
|
||||||
- 'tests/harness/**'
|
|
||||||
- '.gitea/workflows/harness-replays.yml'
|
|
||||||
pull_request:
|
|
||||||
branches: [main, staging]
|
|
||||||
paths:
|
|
||||||
- 'workspace-server/**'
|
|
||||||
- 'canvas/**'
|
|
||||||
- 'tests/harness/**'
|
|
||||||
- '.gitea/workflows/harness-replays.yml'
|
|
||||||
concurrency:
|
|
||||||
# Per-SHA grouping. Per-ref kept hitting the auto-promote-staging
|
|
||||||
# cancellation deadlock — see e2e-api.yml's concurrency block for
|
|
||||||
# the 2026-04-28 incident that codified this pattern.
|
|
||||||
group: harness-replays-${{ github.event.pull_request.head.sha || github.sha }}
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
detect-changes:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
outputs:
|
|
||||||
run: ${{ steps.decide.outputs.run }}
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
# Shallow clone — we use the Gitea Compare API for changed-file
|
|
||||||
# detection, not local git diff. The base SHA is supplied via
|
|
||||||
# GitHub event variables, so no local history is needed.
|
|
||||||
fetch-depth: 1
|
|
||||||
- id: decide
|
|
||||||
env:
|
|
||||||
# Pass via env block — env values bypass shell quoting so single
|
|
||||||
# quotes in merge-commit messages (e.g. "Merge pull request 'fix: ...'
|
|
||||||
# from branch into main") cannot break the bash parser. The prior
|
|
||||||
# `echo '${{ toJSON(...) }}'` form broke on every main-push because
|
|
||||||
# every main commit is a merge commit with single quotes in the
|
|
||||||
# message body — the embedded `'` ended the single-quoted shell string
|
|
||||||
# mid-JSON, and a subsequent `(` (e.g. in `(#523)`) was parsed as a
|
|
||||||
# subshell, causing "syntax error near unexpected token `('".
|
|
||||||
COMMITS_JSON: ${{ toJSON(github.event.commits) }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# workflow_dispatch: always run (manual trigger)
|
|
||||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
|
||||||
echo "run=true" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "debug=manual-trigger" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Determine changed files.
|
|
||||||
# workflow_dispatch: always run.
|
|
||||||
# pull_request: use Compare API (branch-to-branch works fine).
|
|
||||||
# push: use github.event.commits array (Compare API rejects SHA-to-branch).
|
|
||||||
# new-branch: run everything.
|
|
||||||
if [ "${{ github.event_name }}" = "pull_request" ]; then
|
|
||||||
BASE="${{ github.event.pull_request.base.ref }}"
|
|
||||||
HEAD="${{ github.event.pull_request.head.ref }}"
|
|
||||||
elif [ -n "${{ github.event.before }}" ] && \
|
|
||||||
! echo "${{ github.event.before }}" | grep -qE '^0+$'; then
|
|
||||||
# Push event: extract changed files from github.event.commits array.
|
|
||||||
# Gitea Compare API rejects SHA-to-branch comparisons (BaseNotExist),
|
|
||||||
# so we use the commits array instead. This array contains all commits
|
|
||||||
# in the push, each with their added/removed/modified file lists.
|
|
||||||
printf '%s' "$COMMITS_JSON" \
|
|
||||||
| bash .gitea/scripts/push-commits-diff-files.py \
|
|
||||||
> .push-diff-files.txt 2>/dev/null || true
|
|
||||||
DIFF_FILES=$(cat .push-diff-files.txt 2>/dev/null || true)
|
|
||||||
if [ -n "$DIFF_FILES" ] && echo "$DIFF_FILES" | grep -qE '^workspace-server/|^canvas/|^tests/harness/|^.gitea/workflows/harness-replays\.yml$'; then
|
|
||||||
echo "run=true" >> "$GITHUB_OUTPUT"
|
|
||||||
else
|
|
||||||
echo "run=false" >> "$GITHUB_OUTPUT"
|
|
||||||
fi
|
|
||||||
echo "debug=push-files=$DIFF_FILES" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
else
|
|
||||||
# New branch or github.event.before unavailable — run everything.
|
|
||||||
echo "run=true" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "debug=new-branch-fallback" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Call Gitea Compare API (pull_request path only — branch-to-branch).
|
|
||||||
# Push uses github.event.commits array above.
|
|
||||||
RESP=$(curl -sS --fail --max-time 30 \
|
|
||||||
-H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
|
|
||||||
-H "Accept: application/json" \
|
|
||||||
"$GITHUB_SERVER_URL/api/v1/repos/$GITHUB_REPOSITORY/compare/$BASE...$HEAD")
|
|
||||||
DIFF_FILES=$(echo "$RESP" | bash .gitea/scripts/compare-api-diff-files.py 2>/dev/null || true)
|
|
||||||
|
|
||||||
echo "debug=diff-base=$BASE diff-files=$DIFF_FILES" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
if echo "$DIFF_FILES" | grep -qE '^workspace-server/|^canvas/|^tests/harness/|^.gitea/workflows/harness-replays\.yml$'; then
|
|
||||||
echo "run=true" >> "$GITHUB_OUTPUT"
|
|
||||||
else
|
|
||||||
echo "run=false" >> "$GITHUB_OUTPUT"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# ONE job that always runs. Real work is gated per-step on
|
|
||||||
# detect-changes.outputs.run so an unrelated PR (e.g. doc-only
|
|
||||||
# change to molecule-controlplane wired here later) emits the
|
|
||||||
# required check without spending CI cycles. Single-job pattern
|
|
||||||
# matches e2e-api.yml — see that workflow's comment for why a
|
|
||||||
# job-level `if: false` would block branch protection via the
|
|
||||||
# SKIPPED-in-set bug.
|
|
||||||
harness-replays:
|
|
||||||
needs: detect-changes
|
|
||||||
name: Harness Replays
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
timeout-minutes: 30
|
|
||||||
steps:
|
|
||||||
- name: No-op pass (paths filter excluded this commit)
|
|
||||||
if: needs.detect-changes.outputs.run != 'true'
|
|
||||||
run: |
|
|
||||||
echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
|
|
||||||
echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."
|
|
||||||
echo "::notice::Debug: ${{ needs.detect-changes.outputs.debug }}"
|
|
||||||
|
|
||||||
- if: needs.detect-changes.outputs.run == 'true'
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
# Log what files were detected so future failures include the diff.
|
|
||||||
- name: Log detected changes
|
|
||||||
if: needs.detect-changes.outputs.run == 'true'
|
|
||||||
run: |
|
|
||||||
echo "::notice::detect-changes debug: ${{ needs.detect-changes.outputs.debug }}"
|
|
||||||
|
|
||||||
# github-app-auth sibling-checkout removed 2026-05-07 (#157):
|
|
||||||
# the plugin was dropped + Dockerfile.tenant no longer COPYs it.
|
|
||||||
|
|
||||||
# Pre-clone manifest deps before docker compose builds the tenant
|
|
||||||
# image (Task #173 followup — same pattern as
|
|
||||||
# publish-workspace-server-image.yml's "Pre-clone manifest deps"
|
|
||||||
# step).
|
|
||||||
#
|
|
||||||
# Why pre-clone here too: tests/harness/compose.yml builds tenant-alpha
|
|
||||||
# and tenant-beta from workspace-server/Dockerfile.tenant with
|
|
||||||
# context=../.. (repo root). That Dockerfile expects
|
|
||||||
# .tenant-bundle-deps/{workspace-configs-templates,org-templates,plugins}
|
|
||||||
# to be present at build context root (post-#173 it COPYs from there
|
|
||||||
# instead of running an in-image clone — the in-image clone failed
|
|
||||||
# with "could not read Username for https://git.moleculesai.app"
|
|
||||||
# because there's no auth path inside the build sandbox).
|
|
||||||
#
|
|
||||||
# Without this step harness-replays fails before any replay runs,
|
|
||||||
# with `failed to calculate checksum of ref ...
|
|
||||||
# "/.tenant-bundle-deps/plugins": not found`. Caught by run #892
|
|
||||||
# (main, 2026-05-07T20:28:53Z) and run #964 (staging — same
|
|
||||||
# symptom, different root cause: staging still has the in-image
|
|
||||||
# clone path, hits the auth error directly).
|
|
||||||
#
|
|
||||||
# 2026-05-08 sub-finding (#192): the clone step ALSO fails when
|
|
||||||
# any referenced workspace-template repo is private and the
|
|
||||||
# AUTO_SYNC_TOKEN bearer (devops-engineer persona) lacks read
|
|
||||||
# access. Root cause: 5 of 9 workspace-template repos
|
|
||||||
# (openclaw, codex, crewai, deepagents, gemini-cli) had been
|
|
||||||
# marked private with no team grant. Resolution: flipped them
|
|
||||||
# to public per `feedback_oss_first_repo_visibility_default`
|
|
||||||
# (the OSS surface should be public). Layer-3 (customer-private +
|
|
||||||
# marketplace third-party repos) tracked separately in
|
|
||||||
# internal#102.
|
|
||||||
#
|
|
||||||
# Token shape matches publish-workspace-server-image.yml: AUTO_SYNC_TOKEN
|
|
||||||
# is the devops-engineer persona PAT, NOT the founder PAT (per
|
|
||||||
# `feedback_per_agent_gitea_identity_default`). clone-manifest.sh
|
|
||||||
# embeds it as basic-auth for the duration of the clones and strips
|
|
||||||
# .git directories — the token never enters the resulting image.
|
|
||||||
- name: Pre-clone manifest deps
|
|
||||||
if: needs.detect-changes.outputs.run == 'true'
|
|
||||||
env:
|
|
||||||
MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
|
|
||||||
echo "::warning::AUTO_SYNC_TOKEN not set — using anonymous clone (repos are public per manifest.json OSS contract)"
|
|
||||||
fi
|
|
||||||
mkdir -p .tenant-bundle-deps
|
|
||||||
# Strip JSON5 comments before jq parsing — Integration Tester appends
|
|
||||||
# `// Triggered by ...` which breaks `jq` in clone-manifest.sh.
|
|
||||||
sed '/^[[:space:]]*\/\//d' manifest.json > .manifest-stripped.json
|
|
||||||
bash scripts/clone-manifest.sh \
|
|
||||||
.manifest-stripped.json \
|
|
||||||
.tenant-bundle-deps/workspace-configs-templates \
|
|
||||||
.tenant-bundle-deps/org-templates \
|
|
||||||
.tenant-bundle-deps/plugins
|
|
||||||
# Sanity-check counts so a silent partial clone fails fast
|
|
||||||
# instead of producing a half-empty image.
|
|
||||||
ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
|
|
||||||
org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
|
|
||||||
plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
|
|
||||||
echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
|
|
||||||
|
|
||||||
- name: Install Python deps for replays
|
|
||||||
# peer-discovery-404 (and future replays) eval Python against the
|
|
||||||
# running tenant — importing workspace/a2a_client.py pulls in
|
|
||||||
# httpx. tests/harness/requirements.txt holds just the HTTP-client
|
|
||||||
# surface to keep CI install fast (~3s) vs the full
|
|
||||||
# workspace/requirements.txt (~30s).
|
|
||||||
if: needs.detect-changes.outputs.run == 'true'
|
|
||||||
run: pip install -r tests/harness/requirements.txt
|
|
||||||
|
|
||||||
- name: Run all replays against the harness
|
|
||||||
# run-all-replays.sh: boot via up.sh → seed via seed.sh → run
|
|
||||||
# every replays/*.sh → tear down via down.sh on EXIT (trap).
|
|
||||||
# Non-zero exit on any replay failure.
|
|
||||||
#
|
|
||||||
# KEEP_UP=1: without this, the script's trap-on-EXIT tears
|
|
||||||
# down containers immediately on failure, leaving the dump
|
|
||||||
# step below with nothing to dump (verified on PR #2410's
|
|
||||||
# first run — tenant became unhealthy, trap fired, dump
|
|
||||||
# step saw empty containers). Keeping them up lets the
|
|
||||||
# failure path collect tenant/cp-stub/cf-proxy logs. The
|
|
||||||
# always-run "Force teardown" step does the actual cleanup.
|
|
||||||
if: needs.detect-changes.outputs.run == 'true'
|
|
||||||
working-directory: tests/harness
|
|
||||||
env:
|
|
||||||
KEEP_UP: "1"
|
|
||||||
run: ./run-all-replays.sh
|
|
||||||
|
|
||||||
- name: Dump compose logs on failure
|
|
||||||
# SECRETS_ENCRYPTION_KEY: docker compose validates the entire compose
|
|
||||||
# file even for read-only `logs` calls. up.sh generates a per-run key
|
|
||||||
# and exports it to its OWN shell — this step runs in a fresh shell
|
|
||||||
# that wouldn't see it, so without a placeholder the validate step
|
|
||||||
# errors before logs print (verified against PR #2492's first run:
|
|
||||||
# "required variable SECRETS_ENCRYPTION_KEY is missing a value").
|
|
||||||
# A placeholder is fine — we're only reading log streams, not booting.
|
|
||||||
if: failure() && needs.detect-changes.outputs.run == 'true'
|
|
||||||
working-directory: tests/harness
|
|
||||||
env:
|
|
||||||
SECRETS_ENCRYPTION_KEY: dump-logs-placeholder
|
|
||||||
run: |
|
|
||||||
echo "=== docker compose ps ==="
|
|
||||||
docker compose -f compose.yml ps || true
|
|
||||||
echo "=== tenant-alpha logs ==="
|
|
||||||
docker compose -f compose.yml logs tenant-alpha || true
|
|
||||||
echo "=== tenant-beta logs ==="
|
|
||||||
docker compose -f compose.yml logs tenant-beta || true
|
|
||||||
echo "=== cp-stub logs ==="
|
|
||||||
docker compose -f compose.yml logs cp-stub || true
|
|
||||||
echo "=== cf-proxy logs ==="
|
|
||||||
docker compose -f compose.yml logs cf-proxy || true
|
|
||||||
echo "=== postgres-alpha logs (last 100) ==="
|
|
||||||
docker compose -f compose.yml logs --tail 100 postgres-alpha || true
|
|
||||||
echo "=== postgres-beta logs (last 100) ==="
|
|
||||||
docker compose -f compose.yml logs --tail 100 postgres-beta || true
|
|
||||||
|
|
||||||
- name: Force teardown
|
|
||||||
# We pass KEEP_UP=1 to run-all-replays.sh so the dump step
|
|
||||||
# above sees real containers — that means we own teardown
|
|
||||||
# explicitly here. Always run.
|
|
||||||
if: always() && needs.detect-changes.outputs.run == 'true'
|
|
||||||
working-directory: tests/harness
|
|
||||||
run: ./down.sh || true
|
|
||||||
@ -1,120 +0,0 @@
|
|||||||
name: lint-continue-on-error-tracking
|
|
||||||
|
|
||||||
# Tier 2e hard-gate lint (per internal#350) — every
|
|
||||||
# `continue-on-error: true` in `.gitea/workflows/*.yml` must carry a
|
|
||||||
# `# mc#NNNN` or `# internal#NNNN` tracker comment within 2 lines,
|
|
||||||
# the referenced issue must be OPEN, and ≤14 days old.
|
|
||||||
#
|
|
||||||
# Why this exists
|
|
||||||
# ---------------
|
|
||||||
# `continue-on-error: true` on `platform-build` had been hiding
|
|
||||||
# mc#664-class regressions for ~3 weeks before #656 surfaced them on
|
|
||||||
# 2026-05-12. A 14-day cap on tracker age forces a review cycle and
|
|
||||||
# surfaces mask-drift within at most 14 days of the original defect.
|
|
||||||
# Each `continue-on-error: true` gets a paper trail — close or renew.
|
|
||||||
#
|
|
||||||
# How the gate works
|
|
||||||
# ------------------
|
|
||||||
# 1. Walk `.gitea/workflows/*.yml` via PyYAML's line-tracking loader
|
|
||||||
# (per `feedback_behavior_based_ast_gates`) and find every job
|
|
||||||
# whose `continue-on-error` evaluates truthy (`true` or string
|
|
||||||
# `"true"` — Gitea's evaluator coerces strings).
|
|
||||||
# 2. For each, scan ±2 lines of the directive's source line for a
|
|
||||||
# `# mc#NNNN` or `# internal#NNNN` comment. Inline-trailing
|
|
||||||
# comments on the directive line count.
|
|
||||||
# 3. For each tracker reference, GET the issue from the Gitea API.
|
|
||||||
# Validate: exists, `state == open`, `created_at` ≤ MAX_AGE_DAYS.
|
|
||||||
# 4. Aggregate ALL violations (not short-circuit) and exit 1 if any.
|
|
||||||
#
|
|
||||||
# Triggers
|
|
||||||
# --------
|
|
||||||
# Runs on PR events (paths-filter on `.gitea/workflows/**`) AND on
|
|
||||||
# a daily schedule. PR runs catch the violation at introduction time.
|
|
||||||
# Schedule runs catch the AGE-EXPIRY class: a tracker that was ≤14d
|
|
||||||
# old when the PR landed but is now 20d old, with the underlying
|
|
||||||
# defect still unfixed. Per `feedback_chained_defects_in_never_tested_workflows`,
|
|
||||||
# scheduled drift detection is the second half of the gate.
|
|
||||||
#
|
|
||||||
# Phase contract (RFC internal#219 §1 ladder)
|
|
||||||
# -------------------------------------------
|
|
||||||
# Lands at `continue-on-error: true` (Phase 3 — surface broken shapes
|
|
||||||
# without blocking). The pre-existing `continue-on-error: true`
|
|
||||||
# directives on `main` will all violate this lint at first
|
|
||||||
# (intentional — they're the masked defects this lint exists to
|
|
||||||
# surface). Each must be triaged: file a fresh tracker comment,
|
|
||||||
# close-and-flip, or document the deliberate keep-mask in a fresh
|
|
||||||
# 14-day-renewable tracker. After main is clean for 3 days,
|
|
||||||
# follow-up PR flips this workflow's continue-on-error to false.
|
|
||||||
# Tracking: internal#350.
|
|
||||||
#
|
|
||||||
# Cross-links
|
|
||||||
# -----------
|
|
||||||
# - internal#350 (the RFC that specs this lint)
|
|
||||||
# - mc#664 (the empirical masked-3-weeks case)
|
|
||||||
# - feedback_chained_defects_in_never_tested_workflows
|
|
||||||
# - feedback_behavior_based_ast_gates
|
|
||||||
# - feedback_strict_root_only_after_class_a
|
|
||||||
#
|
|
||||||
# Auth: DRIFT_BOT_TOKEN — same persona used by ci-required-drift.yml
|
|
||||||
# (provisioned under internal#329). Auto-injected GITHUB_TOKEN is
|
|
||||||
# insufficient because `internal#NNN` references cross repositories
|
|
||||||
# (molecule-core → molecule-ai/internal).
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
paths:
|
|
||||||
- '.gitea/workflows/**'
|
|
||||||
- '.gitea/scripts/lint_continue_on_error_tracking.py'
|
|
||||||
- 'tests/test_lint_continue_on_error_tracking.py'
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
paths:
|
|
||||||
- '.gitea/workflows/**'
|
|
||||||
- '.gitea/scripts/lint_continue_on_error_tracking.py'
|
|
||||||
schedule:
|
|
||||||
# Daily at 13:11 UTC — off-peak, prime-staggered from the other
|
|
||||||
# Tier-2 lint schedules (ci-required-drift runs hourly :00).
|
|
||||||
- cron: '11 13 * * *'
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: lint-coe-tracking-${{ github.event.pull_request.number || github.ref }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
lint:
|
|
||||||
name: lint-continue-on-error-tracking
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
timeout-minutes: 10
|
|
||||||
# Phase 3 (RFC #219 §1): surface masked defects without blocking
|
|
||||||
# PRs. Pre-existing continue-on-error: true directives on main
|
|
||||||
# all violate this lint at first — intentional. Flip to false
|
|
||||||
# follow-up after main is clean for 3 days. internal#350.
|
|
||||||
continue-on-error: true
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
||||||
with:
|
|
||||||
python-version: '3.12'
|
|
||||||
- name: Install PyYAML
|
|
||||||
run: python -m pip install --quiet 'PyYAML==6.0.2'
|
|
||||||
- name: Run lint-continue-on-error-tracking
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.DRIFT_BOT_TOKEN }}
|
|
||||||
GITEA_HOST: git.moleculesai.app
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
INTERNAL_REPO: molecule-ai/internal
|
|
||||||
WORKFLOWS_DIR: .gitea/workflows
|
|
||||||
MAX_AGE_DAYS: '14'
|
|
||||||
run: python3 .gitea/scripts/lint_continue_on_error_tracking.py
|
|
||||||
- name: Run lint-continue-on-error-tracking unit tests
|
|
||||||
run: |
|
|
||||||
python -m pip install --quiet pytest
|
|
||||||
python3 -m pytest tests/test_lint_continue_on_error_tracking.py -v
|
|
||||||
@ -1,104 +0,0 @@
|
|||||||
name: Lint curl status-code capture
|
|
||||||
|
|
||||||
# Ported from .github/workflows/lint-curl-status-capture.yml on 2026-05-11
|
|
||||||
# per RFC internal#219 §1 sweep.
|
|
||||||
#
|
|
||||||
# Differences from the GitHub version:
|
|
||||||
# - on.paths and the lint scanner target .gitea/workflows/**.yml (the
|
|
||||||
# active Gitea workflow directory) instead of .github/workflows/**.yml
|
|
||||||
# (which the rest of this sweep is emptying out).
|
|
||||||
# - Self-skip path updated to the .gitea/ version of this file.
|
|
||||||
# - Dropped `merge_group:` trigger.
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL set per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on the job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
# Pins the workflow-bash anti-pattern that produced "HTTP 000000" on the
|
|
||||||
# 2026-05-04 redeploy-tenants-on-main run for sha 2b862f6:
|
|
||||||
#
|
|
||||||
# HTTP_CODE=$(curl ... -w '%{http_code}' ... || echo "000")
|
|
||||||
#
|
|
||||||
# When curl exits non-zero (connection reset -> 56, --fail-with-body 4xx/5xx
|
|
||||||
# -> 22), the `-w '%{http_code}'` already wrote a status to stdout — usually
|
|
||||||
# "000" for connection failures or the actual code for HTTP errors. The
|
|
||||||
# `|| echo "000"` then fires AND appends ANOTHER "000" to the captured
|
|
||||||
# stdout, producing values like "000000" or "409000" that fail string
|
|
||||||
# comparisons against "200" while looking superficially right.
|
|
||||||
#
|
|
||||||
# Same class of bug the synth-E2E §7c gate hit twice (PRs #2779/#2783 +
|
|
||||||
# #2797). Memory: feedback_curl_status_capture_pollution.md.
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
paths: ['.gitea/workflows/**']
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
paths: ['.gitea/workflows/**']
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
scan:
|
|
||||||
name: Scan workflows for curl status-capture pollution
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
|
|
||||||
# the PR. Follow-up PR flips this off after surfaced defects are
|
|
||||||
# triaged.
|
|
||||||
continue-on-error: true
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
- name: Find curl ... -w '%{http_code}' ... || echo "000" subshells
|
|
||||||
run: |
|
|
||||||
set -uo pipefail
|
|
||||||
# Multi-line aware: look for `$(curl ... -w '%{http_code}' ... || echo "000")`
|
|
||||||
# subshell where the entire command-substitution wraps a curl that
|
|
||||||
# ends with `|| echo "000"`. Must distinguish from the SAFE shape
|
|
||||||
# `$(cat tempfile 2>/dev/null || echo "000")` — `cat` with a missing
|
|
||||||
# tempfile produces empty stdout, no pollution.
|
|
||||||
python3 <<'PY'
|
|
||||||
import os, re, sys, glob
|
|
||||||
|
|
||||||
BAD_FILES = []
|
|
||||||
|
|
||||||
# Match the buggy substitution across newlines: $(curl ... -w '%{http_code}' ... || echo "000")
|
|
||||||
# The `\\n` is the bash line-continuation that lets curl flags span lines.
|
|
||||||
# We collapse continuation lines first, then look for the single-line bad pattern.
|
|
||||||
PATTERN = re.compile(
|
|
||||||
r'\$\(\s*curl\b[^)]*-w\s*[\'"]%\{http_code\}[\'"][^)]*\|\|\s*echo\s+"000"\s*\)',
|
|
||||||
re.DOTALL,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Self-skip: this lint workflow contains the literal anti-pattern in
|
|
||||||
# its own docstring — that's intentional, not a bug.
|
|
||||||
SELF = ".gitea/workflows/lint-curl-status-capture.yml"
|
|
||||||
|
|
||||||
for f in sorted(glob.glob(".gitea/workflows/*.yml")):
|
|
||||||
if f == SELF:
|
|
||||||
continue
|
|
||||||
with open(f) as fh:
|
|
||||||
content = fh.read()
|
|
||||||
# Collapse bash line-continuations (\\\n + leading whitespace)
|
|
||||||
# into a single logical line so the regex can see the full
|
|
||||||
# curl invocation as one chunk.
|
|
||||||
flat = re.sub(r'\\\s*\n\s*', ' ', content)
|
|
||||||
for m in PATTERN.finditer(flat):
|
|
||||||
BAD_FILES.append((f, m.group(0)[:120]))
|
|
||||||
|
|
||||||
if not BAD_FILES:
|
|
||||||
print("OK No curl-status-capture pollution patterns detected")
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
print(f"::error::Found {len(BAD_FILES)} curl-status-capture pollution site(s):")
|
|
||||||
for f, snippet in BAD_FILES:
|
|
||||||
print(f"::error file={f}::Curl status-capture pollution: '|| echo \"000\"' inside a $(curl ... -w '%{{http_code}}' ...) subshell. On non-2xx or connection failure, curl's -w writes a status, then exits non-zero, then the || echo appends another '000' — producing 'HTTP 000000' or '409000' that fails comparisons silently. Fix: route -w into a tempfile so the exit code can't pollute stdout. See memory feedback_curl_status_capture_pollution.md.")
|
|
||||||
print(f" matched: {snippet}...")
|
|
||||||
print()
|
|
||||||
print("Fix template:")
|
|
||||||
print(' set +e')
|
|
||||||
print(' curl ... -w \'%{http_code}\' >code.txt 2>/dev/null')
|
|
||||||
print(' set -e')
|
|
||||||
print(' HTTP_CODE=$(cat code.txt 2>/dev/null)')
|
|
||||||
print(' [ -z "$HTTP_CODE" ] && HTTP_CODE="000"')
|
|
||||||
sys.exit(1)
|
|
||||||
PY
|
|
||||||
@ -1,132 +0,0 @@
|
|||||||
name: lint-mask-pr-atomicity
|
|
||||||
|
|
||||||
# Tier 2d hard-gate lint (per internal#350) — blocks PRs that touch
|
|
||||||
# `.gitea/workflows/ci.yml` and modify ONLY ONE of {continue-on-error,
|
|
||||||
# all-required.sentinel.needs} without a `Paired: #NNN` reference in
|
|
||||||
# the PR body or in a commit message.
|
|
||||||
#
|
|
||||||
# Why this exists
|
|
||||||
# ---------------
|
|
||||||
# PR#665 (interim `continue-on-error: true` on `platform-build`) and
|
|
||||||
# PR#668 (sentinel-`needs` demotion of the same job) were designed as a
|
|
||||||
# pair but merged solo — #665 landed at 04:47Z 2026-05-12, #668 was
|
|
||||||
# still open at 05:07Z when the main-red watchdog (#674) fired. Result:
|
|
||||||
# ~20 minutes of `main` red and a cascade of false-positives on
|
|
||||||
# unrelated PRs. This lint structurally prevents that class.
|
|
||||||
#
|
|
||||||
# How the gate works
|
|
||||||
# ------------------
|
|
||||||
# 1. The workflow runs on every PR whose diff touches ci.yml (paths
|
|
||||||
# filter). It is NOT a required check on `main` because the rule is
|
|
||||||
# diff-based — running it on PRs that don't touch ci.yml would
|
|
||||||
# produce a `pending` status forever (per
|
|
||||||
# `feedback_path_filtered_workflow_cant_be_required`).
|
|
||||||
# 2. The script reads `BASE_SHA:ci.yml` and `HEAD_SHA:ci.yml`, parses
|
|
||||||
# both via PyYAML AST (per `feedback_behavior_based_ast_gates` — no
|
|
||||||
# grep, no regex on the raw text — so a YAML-shape refactor still
|
|
||||||
# detects).
|
|
||||||
# 3. Walks `jobs.*.continue-on-error` on each side; flags any value
|
|
||||||
# diff. Reads `jobs.all-required.needs` on each side; flags any
|
|
||||||
# set diff (order-insensitive — `needs:` is engine-unordered).
|
|
||||||
# 4. If both predicates fired → atomic, OK. If neither → no risk, OK.
|
|
||||||
# If exactly one fired → require `Paired: #NNN` in PR body OR in
|
|
||||||
# any commit message between base..head; else fail.
|
|
||||||
#
|
|
||||||
# Phase contract (RFC internal#219 §1 ladder)
|
|
||||||
# -------------------------------------------
|
|
||||||
# This workflow lands at `continue-on-error: true` (Phase 3 — surface
|
|
||||||
# regressions without blocking PRs while the rule beds in).
|
|
||||||
# Follow-up PR flips to `false` once we have ≥3 days of clean runs on
|
|
||||||
# `main` and no false-positives. Tracking issue: internal#350.
|
|
||||||
#
|
|
||||||
# Cross-links
|
|
||||||
# -----------
|
|
||||||
# - internal#350 (the RFC that specs this lint)
|
|
||||||
# - PR#665 / PR#668 (the empirical split-pair)
|
|
||||||
# - mc#664 (the main-red incident the split caused)
|
|
||||||
# - feedback_strict_root_only_after_class_a
|
|
||||||
# - feedback_behavior_based_ast_gates
|
|
||||||
#
|
|
||||||
# Auth: only needs the auto-injected GITHUB_TOKEN (read-only, repo
|
|
||||||
# scope). No DRIFT_BOT_TOKEN needed — Tier 2d does NOT call
|
|
||||||
# branch_protections (Tier 2g/f do).
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened, edited]
|
|
||||||
# `edited` is included because the rule depends on PR_BODY: a user
|
|
||||||
# may add `Paired: #NNN` after first push to satisfy the lint. The
|
|
||||||
# rerun on `edited` lets the PR turn green without an empty
|
|
||||||
# commit. Gitea 1.22.6 fires `edited` on body changes — verified
|
|
||||||
# via gitea-source/models/issues/pull_list.go::triggerNewPRWebhook.
|
|
||||||
paths:
|
|
||||||
- '.gitea/workflows/ci.yml'
|
|
||||||
- '.gitea/scripts/lint_mask_pr_atomicity.py'
|
|
||||||
- '.gitea/workflows/lint-mask-pr-atomicity.yml'
|
|
||||||
- 'tests/test_lint_mask_pr_atomicity.py'
|
|
||||||
|
|
||||||
env:
|
|
||||||
# Belt-and-suspenders against the runner-default trap
|
|
||||||
# (feedback_act_runner_github_server_url). Runners are configured
|
|
||||||
# with this env via /opt/molecule/runners/config.yaml, but pinning
|
|
||||||
# at the workflow level protects against a runner regenerated
|
|
||||||
# without the config file.
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
pull-requests: read
|
|
||||||
|
|
||||||
# Per-PR concurrency — re-pushes cancel previous runs to keep the
|
|
||||||
# queue short. The lint is cheap (one git show + log + a YAML parse).
|
|
||||||
concurrency:
|
|
||||||
group: lint-mask-pr-atomicity-${{ github.event.pull_request.number || github.ref }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
scan:
|
|
||||||
name: lint-mask-pr-atomicity
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
timeout-minutes: 5
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken shapes without blocking
|
|
||||||
# PRs. Follow-up PR flips this to `false` once recent runs on main
|
|
||||||
# are confirmed clean (eat-our-own-dogfood discipline mirrors
|
|
||||||
# PR#673's same-shape comment). Tracking: internal#350.
|
|
||||||
continue-on-error: true
|
|
||||||
steps:
|
|
||||||
- name: Check out PR head with full history (need base SHA blobs)
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
# `git show <base-sha>:<path>` needs the base SHA's blobs.
|
|
||||||
# Shallow=1 would miss it. Same rationale as PR#673 and
|
|
||||||
# check-migration-collisions.yml.
|
|
||||||
fetch-depth: 0
|
|
||||||
- name: Set up Python (PyYAML for AST parsing)
|
|
||||||
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
||||||
with:
|
|
||||||
python-version: '3.12'
|
|
||||||
- name: Install PyYAML
|
|
||||||
# Same pin as ci-required-drift.yml + the rest of the Tier 2
|
|
||||||
# lint family — keep runner-cache hits uniform.
|
|
||||||
run: python -m pip install --quiet 'PyYAML==6.0.2'
|
|
||||||
- name: Ensure base ref is reachable locally
|
|
||||||
# fetch-depth=0 usually pulls the base too, but explicit-fetch
|
|
||||||
# is cheap insurance against runner-version drift (matches the
|
|
||||||
# comment in check-migration-collisions.yml and PR#673).
|
|
||||||
run: |
|
|
||||||
git fetch origin "${{ github.event.pull_request.base.ref }}" || true
|
|
||||||
- name: Run lint-mask-pr-atomicity
|
|
||||||
env:
|
|
||||||
BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
|
||||||
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
|
|
||||||
# PR body — the script greps for `Paired: #NNN`.
|
|
||||||
PR_BODY: ${{ github.event.pull_request.body }}
|
|
||||||
CI_WORKFLOW_PATH: .gitea/workflows/ci.yml
|
|
||||||
SENTINEL_JOB_KEY: all-required
|
|
||||||
run: python3 .gitea/scripts/lint_mask_pr_atomicity.py
|
|
||||||
- name: Run lint-mask-pr-atomicity unit tests
|
|
||||||
# Run the test suite in-CI so the lint's own behaviour is
|
|
||||||
# verified on every change. Matches lint-workflow-yaml.yml.
|
|
||||||
run: |
|
|
||||||
python -m pip install --quiet pytest
|
|
||||||
python3 -m pytest tests/test_lint_mask_pr_atomicity.py -v
|
|
||||||
@ -1,141 +0,0 @@
|
|||||||
name: Lint pre-flip continue-on-error
|
|
||||||
|
|
||||||
# Pre-merge gate: blocks PRs that flip `continue-on-error: true → false`
|
|
||||||
# on any job in `.gitea/workflows/*.yml` WITHOUT proof that the affected
|
|
||||||
# job's recent runs on the target branch (PR base) are actually green.
|
|
||||||
#
|
|
||||||
# Empirical class: PR #656 / mc#664. PR #656 (RFC internal#219 Phase 4)
|
|
||||||
# flipped 5 platform-build-class jobs `continue-on-error: true → false`
|
|
||||||
# on the basis of a "verified green on main via combined-status check".
|
|
||||||
# But that "green" was the LIE the prior `continue-on-error: true`
|
|
||||||
# produced: Gitea Quirk #10 (internal#342 + dup #287) — a failed step
|
|
||||||
# inside a `continue-on-error: true` job rolls up to a `success`
|
|
||||||
# job-level status. The precondition the PR claimed to verify was
|
|
||||||
# structurally fooled by the bug being flipped.
|
|
||||||
#
|
|
||||||
# mc#664 captured the surfaced defects (2 mutually-masked regressions):
|
|
||||||
# - Class 1: sqlmock helper drift since 2f36bb9a (24 days old)
|
|
||||||
# - Class 2: OFFSEC-001 contract collision since 7d1a189f (1 day old)
|
|
||||||
#
|
|
||||||
# Codified 04:35Z as hongming-pc2 charter §SOP-N rule (e)
|
|
||||||
# "run-log-grep-before-flip" — now structurally enforced here at PR
|
|
||||||
# time, ahead of merge.
|
|
||||||
#
|
|
||||||
# How the gate works:
|
|
||||||
# 1. Read every `.gitea/workflows/*.yml` at the PR base SHA AND at
|
|
||||||
# the PR head SHA via `git show <sha>:<path>` (no checkout
|
|
||||||
# needed).
|
|
||||||
# 2. Parse both sides via PyYAML AST (NOT grep — per
|
|
||||||
# `feedback_behavior_based_ast_gates`). Walk `jobs.<key>.
|
|
||||||
# continue-on-error` on each side. A flip is base=true,
|
|
||||||
# head=false.
|
|
||||||
# 3. For each flipped job, render the commit-status context as
|
|
||||||
# `"{workflow.name} / {job.name or job.key} (push)"` — that's
|
|
||||||
# how Gitea Actions emits the per-context status on `main`/
|
|
||||||
# `staging` runs.
|
|
||||||
# 4. Pull last 5 commits on the PR base branch, fetch combined
|
|
||||||
# commit-status per commit, scan for the target context. For
|
|
||||||
# each match, fetch the run log via the web-UI route
|
|
||||||
# `{server_url}/{repo}/actions/runs/{run_id}/jobs/{job_idx}/logs`
|
|
||||||
# (per `reference_gitea_actions_log_fetch` —
|
|
||||||
# Gitea 1.22.6 lacks REST `/actions/runs/*`; web-UI is the
|
|
||||||
# only working path, see also
|
|
||||||
# `reference_gitea_1_22_6_lacks_rest_rerun_endpoints`).
|
|
||||||
# 5. Grep each log for `--- FAIL`, `FAIL\s`, `::error::`. If
|
|
||||||
# the status is `success` but the log shows any of these,
|
|
||||||
# the job was masked. Block the PR with `::error::`.
|
|
||||||
#
|
|
||||||
# Graceful-degrade contract (per task halt-conditions):
|
|
||||||
# - Log fetch 404 (act_runner pruned the log, transient outage):
|
|
||||||
# emit `::warning::` "log unavailable" — does NOT block.
|
|
||||||
# - Zero recent runs of the flipped job's context on the base
|
|
||||||
# branch (newly added workflow): emit `::warning::` "no run
|
|
||||||
# history to verify" — allow the flip. Chicken-and-egg
|
|
||||||
# exemption.
|
|
||||||
# - YAML parse error in one of the workflow files: warn-only,
|
|
||||||
# don't block — the YAML lint workflows catch this separately.
|
|
||||||
#
|
|
||||||
# Cross-links: PR#656, mc#664, PR#665 (interim re-mask),
|
|
||||||
# Quirk #10 (internal#342 + dup #287), hongming-pc2 charter
|
|
||||||
# §SOP-N rule (e), feedback_strict_root_only_after_class_a,
|
|
||||||
# feedback_no_shared_persona_token_use.
|
|
||||||
#
|
|
||||||
# Phase contract (RFC internal#219 §1 ladder):
|
|
||||||
# - This workflow lands at `continue-on-error: true` (Phase 3 —
|
|
||||||
# surface defects without blocking). Follow-up PR flips it to
|
|
||||||
# `false` ONLY after this workflow's own recent runs on `main`
|
|
||||||
# are confirmed clean — exactly the discipline the workflow
|
|
||||||
# itself enforces. Eat your own dogfood.
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
paths:
|
|
||||||
- '.gitea/workflows/**'
|
|
||||||
- '.gitea/scripts/lint_pre_flip_continue_on_error.py'
|
|
||||||
- '.gitea/workflows/lint-pre-flip-continue-on-error.yml'
|
|
||||||
|
|
||||||
env:
|
|
||||||
# Per `feedback_act_runner_github_server_url` — without this,
|
|
||||||
# actions/checkout and friends default to github.com → break.
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
# Need read on the API to pull combined commit-status + commit list
|
|
||||||
# for the base branch. The job-log fetch uses the same token via
|
|
||||||
# the web-UI route (Gitea 1.22.6 accepts `Authorization: token ...`
|
|
||||||
# there).
|
|
||||||
pull-requests: read
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: lint-pre-flip-coe-${{ github.event.pull_request.head.sha || github.sha }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
scan:
|
|
||||||
name: Verify continue-on-error flips have run-log proof
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
timeout-minutes: 8
|
|
||||||
# Phase 3 (RFC internal#219 §1): surface broken flips without blocking
|
|
||||||
# the PR yet. Follow-up flips this to `false` once the workflow itself
|
|
||||||
# has clean recent runs on main. mc#664 interim — remove when CoE→false.
|
|
||||||
continue-on-error: true # mc#664
|
|
||||||
steps:
|
|
||||||
- name: Check out PR head (full history for base-SHA access)
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
# `git show <base-sha>:<path>` needs the base SHA's blobs.
|
|
||||||
# Shallow=1 would miss it. Same rationale as
|
|
||||||
# check-migration-collisions.yml.
|
|
||||||
fetch-depth: 0
|
|
||||||
- name: Set up Python (PyYAML for AST parsing)
|
|
||||||
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
||||||
with:
|
|
||||||
python-version: '3.12'
|
|
||||||
- name: Install PyYAML
|
|
||||||
# Same pin as ci-required-drift.yml — keep dependencies
|
|
||||||
# uniform so a Gitea runner cache hits across both jobs.
|
|
||||||
run: python -m pip install --quiet 'PyYAML==6.0.2'
|
|
||||||
- name: Ensure base ref is reachable locally
|
|
||||||
# `actions/checkout@v6 fetch-depth=0` usually pulls the base
|
|
||||||
# too, but explicit-fetch is cheap insurance against the
|
|
||||||
# form-of-ref differences across Gitea runner versions
|
|
||||||
# (mirrors the comment in check-migration-collisions.yml).
|
|
||||||
run: |
|
|
||||||
git fetch origin "${{ github.event.pull_request.base.ref }}" || true
|
|
||||||
- name: Run lint
|
|
||||||
env:
|
|
||||||
# Auto-injected by Gitea Actions; sufficient scope for
|
|
||||||
# combined-status + commit-list + log fetch via web-UI
|
|
||||||
# route. NO repo-admin needed (unlike the
|
|
||||||
# branch_protections endpoint).
|
|
||||||
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
GITEA_HOST: git.moleculesai.app
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
BASE_REF: ${{ github.event.pull_request.base.ref }}
|
|
||||||
BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
|
||||||
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
|
|
||||||
# Last 5 commits on the base branch is the spec default.
|
|
||||||
RECENT_COMMITS_N: '5'
|
|
||||||
run: python3 .gitea/scripts/lint_pre_flip_continue_on_error.py
|
|
||||||
@ -1,96 +0,0 @@
|
|||||||
# lint-required-no-paths — structural enforcement of
|
|
||||||
# `feedback_path_filtered_workflow_cant_be_required`.
|
|
||||||
#
|
|
||||||
# Fails the PR if ANY workflow whose status-check context appears in
|
|
||||||
# `branch_protections/main.status_check_contexts` carries a
|
|
||||||
# `paths:` or `paths-ignore:` filter in its `on:` block.
|
|
||||||
#
|
|
||||||
# Why this exists:
|
|
||||||
# A required-check workflow with a paths filter silently degrades the
|
|
||||||
# merge gate. If a PR's diff doesn't touch the filter, the workflow
|
|
||||||
# never fires; Gitea (1.22.6) reports the required context as
|
|
||||||
# `pending` (NOT `skipped == success`), so the PR cannot merge. For a
|
|
||||||
# docs-only PR against `paths: ['**.go']`, the PR is wedged forever.
|
|
||||||
#
|
|
||||||
# Previously prevented only by reviewer vigilance + the saved memory
|
|
||||||
# `feedback_path_filtered_workflow_cant_be_required`. This workflow
|
|
||||||
# makes it a hard CI gate.
|
|
||||||
#
|
|
||||||
# Forward-compat scope:
|
|
||||||
# Today (2026-05-11) molecule-core/main protects 3 contexts:
|
|
||||||
# - "Secret scan / Scan diff for credential-shaped strings (pull_request)"
|
|
||||||
# - "sop-tier-check / tier-check (pull_request)"
|
|
||||||
# - "CI / all-required (pull_request)"
|
|
||||||
# Per RFC#324 Step 2 the required-list expands to ~5 contexts
|
|
||||||
# (qa-review, security-review added). Each new required context's
|
|
||||||
# workflow must remain unconditional. This lint pins that contract.
|
|
||||||
#
|
|
||||||
# Meta-required-check:
|
|
||||||
# This workflow ITSELF deliberately has NO `paths:` filter on its `on:`
|
|
||||||
# block — otherwise a paths-non-matching PR could bypass the check.
|
|
||||||
# Self-evident from this file: only `pull_request` types + no paths.
|
|
||||||
#
|
|
||||||
# Auth:
|
|
||||||
# `GET /repos/.../branch_protections/{branch}` requires repo-admin
|
|
||||||
# role in Gitea 1.22.6. The workflow-default `GITHUB_TOKEN` is
|
|
||||||
# non-admin (read-only), so we re-use `DRIFT_BOT_TOKEN` (same persona
|
|
||||||
# that powers `ci-required-drift.yml` — verified working there).
|
|
||||||
# If `DRIFT_BOT_TOKEN` becomes unavailable, the script exits 0 with a
|
|
||||||
# loud `::error::` rather than red-X every PR — token-scope issues
|
|
||||||
# should be fixed at the token, not surfaced as a gate failure on
|
|
||||||
# every unrelated PR.
|
|
||||||
#
|
|
||||||
# Behavior-based gate per `feedback_behavior_based_ast_gates`:
|
|
||||||
# YAML AST walk (PyYAML), NOT grep. Workflow renames, formatting
|
|
||||||
# changes (block-scalar vs flow-style), or moving `paths:` between
|
|
||||||
# `pull_request:` and `pull_request_target:` all still detect.
|
|
||||||
#
|
|
||||||
# IMPORTANT — Gitea 1.22.6 parser quirk per
|
|
||||||
# `feedback_gitea_workflow_dispatch_inputs_unsupported`: do NOT add an
|
|
||||||
# `inputs:` block to `workflow_dispatch:` — Gitea 1.22.6 rejects the
|
|
||||||
# entire workflow as "unknown on type" and it registers for ZERO events.
|
|
||||||
|
|
||||||
name: lint-required-no-paths
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
# Read protection + read local YAML. No writes.
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
# Only one in-flight run per PR — re-pushes cancel the previous run to
|
|
||||||
# keep the queue short. Required-list reads are cheap (one GET); the
|
|
||||||
# cancellation is just hygiene.
|
|
||||||
concurrency:
|
|
||||||
group: lint-required-no-paths-${{ github.event.pull_request.number || github.ref }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
lint:
|
|
||||||
name: lint-required-no-paths
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
timeout-minutes: 5
|
|
||||||
steps:
|
|
||||||
- name: Check out repo (we read the workflow YAML files locally)
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
- name: Set up Python (PyYAML for AST parsing)
|
|
||||||
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
||||||
with:
|
|
||||||
python-version: '3.12'
|
|
||||||
- name: Install PyYAML
|
|
||||||
run: python -m pip install --quiet 'PyYAML==6.0.2'
|
|
||||||
- name: Run lint-required-no-paths
|
|
||||||
env:
|
|
||||||
# DRIFT_BOT_TOKEN is owned by mc-drift-bot, a least-privilege
|
|
||||||
# Gitea persona with repo-admin role for branch_protections
|
|
||||||
# read. Same secret used by ci-required-drift.yml — see that
|
|
||||||
# workflow's header for provisioning trail (internal#329).
|
|
||||||
GITEA_TOKEN: ${{ secrets.DRIFT_BOT_TOKEN }}
|
|
||||||
GITEA_HOST: git.moleculesai.app
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
BRANCH: main
|
|
||||||
WORKFLOWS_DIR: .gitea/workflows
|
|
||||||
run: python3 .gitea/scripts/lint-required-no-paths.py
|
|
||||||
@ -1,75 +0,0 @@
|
|||||||
name: Lint workflow YAML (Gitea-1.22.6-hostile shapes)
|
|
||||||
|
|
||||||
# Tier-2 hard-gate lint (RFC internal#219 §1, charter §SOP-N rule (m)).
|
|
||||||
# Catches six Gitea-1.22.6-hostile workflow-YAML shapes BEFORE they reach
|
|
||||||
# `main`. Each rule maps to a documented incident in saved memory:
|
|
||||||
#
|
|
||||||
# 1. workflow_dispatch.inputs — feedback_gitea_workflow_dispatch_inputs_unsupported
|
|
||||||
# (2026-05-11 PyPI freeze 24h)
|
|
||||||
# 2. on: workflow_run — task #81 (Gitea 1.22.6 lacks the event)
|
|
||||||
# 3. name: containing "/" — breaks status-context tokenization
|
|
||||||
# 4. cross-file name collision — status-reaper rev1 fail-loud class
|
|
||||||
# 5. cross-repo uses: org/r/p@r — feedback_gitea_cross_repo_uses_blocked
|
|
||||||
# (DEFAULT_ACTIONS_URL=github → 404)
|
|
||||||
# 6. (WARN) api.github.com refs — feedback_act_runner_github_server_url
|
|
||||||
# without workflow-level GITHUB_SERVER_URL
|
|
||||||
#
|
|
||||||
# Empirical history this hardens against:
|
|
||||||
# - status-reaper rev1 caught rule-4 (name-collision) class
|
|
||||||
# - sop-tier-refire DOA'd on rule-2 (workflow_run partial)
|
|
||||||
# - #319 bootstrap-paradox (chained-defect class, related)
|
|
||||||
# - internal#329 dispatcher race (adjacent)
|
|
||||||
# - 2026-05-11 publish-runtime: rule-1, 24h PyPI freeze
|
|
||||||
#
|
|
||||||
# Triggers:
|
|
||||||
# - pull_request: pre-merge gate — block hostile shapes before they land
|
|
||||||
# - push: post-merge regression detection — catch direct-to-main edits
|
|
||||||
#
|
|
||||||
# Per RFC internal#219 §1 contract: continue-on-error: true during the
|
|
||||||
# surface-broken-shapes phase. Follow-up PR flips off after surfaced
|
|
||||||
# defects are triaged. The push-trigger ensures we catch regressions
|
|
||||||
# even if the pull_request gate is bypassed by branch-protection drift.
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
paths:
|
|
||||||
- '.gitea/workflows/**'
|
|
||||||
- '.gitea/scripts/lint-workflow-yaml.py'
|
|
||||||
- 'tests/test_lint_workflow_yaml.py'
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
paths:
|
|
||||||
- '.gitea/workflows/**'
|
|
||||||
- '.gitea/scripts/lint-workflow-yaml.py'
|
|
||||||
- 'tests/test_lint_workflow_yaml.py'
|
|
||||||
|
|
||||||
# Belt-and-suspenders against runner default
|
|
||||||
# (feedback_act_runner_github_server_url).
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
lint:
|
|
||||||
name: Lint workflow YAML for Gitea-1.22.6-hostile shapes
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken shapes without blocking PRs.
|
|
||||||
# Follow-up PR flips this off after the 4 existing-on-main rule-2
|
|
||||||
# (workflow_run) violations are migrated to a supported trigger.
|
|
||||||
continue-on-error: true
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
|
||||||
with:
|
|
||||||
python-version: '3.11'
|
|
||||||
|
|
||||||
- name: Install PyYAML
|
|
||||||
run: pip install --quiet 'PyYAML>=6.0'
|
|
||||||
|
|
||||||
- name: Lint .gitea/workflows/*.yml
|
|
||||||
run: python3 .gitea/scripts/lint-workflow-yaml.py
|
|
||||||
|
|
||||||
- name: Run lint-workflow-yaml unit tests
|
|
||||||
run: |
|
|
||||||
pip install --quiet pytest
|
|
||||||
python3 -m pytest tests/test_lint_workflow_yaml.py -v
|
|
||||||
@ -1,104 +0,0 @@
|
|||||||
# main-red-watchdog — hourly sentinel for post-merge CI red on `main`.
|
|
||||||
#
|
|
||||||
# RFC: hongming "main NEVER goes red" directive, Option C of the four-
|
|
||||||
# option ladder (B = auto-revert is explicitly rejected per
|
|
||||||
# `feedback_no_such_thing_as_flakes` + `feedback_fix_root_not_symptom`).
|
|
||||||
# Tracking issue: molecule-core#420.
|
|
||||||
#
|
|
||||||
# What it does:
|
|
||||||
# 1. GET branches/main → HEAD SHA
|
|
||||||
# 2. GET commits/{SHA}/status → combined status
|
|
||||||
# 3. If combined is `failure` (or any individual status is `failure`):
|
|
||||||
# open or PATCH an idempotent `[main-red] {repo}: {SHA[:10]}` issue
|
|
||||||
# with each failed context + target_url + description.
|
|
||||||
# 4. If combined is `success` and a prior `[main-red] ...` issue exists,
|
|
||||||
# close it with a "main returned to green at SHA ..." comment.
|
|
||||||
# 5. Emit a Loki-shaped JSON line via `logger -t main-red-watchdog` for
|
|
||||||
# `reference_obs_stack_phase1` ingestion via Vector.
|
|
||||||
#
|
|
||||||
# What it does NOT do:
|
|
||||||
# - Auto-revert anything. Option B is rejected by directive.
|
|
||||||
# - Mutate branch protection. (See AGENTS.md boundaries.)
|
|
||||||
# - Fail the workflow on red. The issue IS the alarm — failing the
|
|
||||||
# watchdog would create a silent-loop where a flake in the watchdog
|
|
||||||
# itself hides actual main-red signal. Exit 0 unless api() raises
|
|
||||||
# ApiError (transient Gitea outage → fail loudly per
|
|
||||||
# `feedback_api_helper_must_raise_not_return_dict`).
|
|
||||||
#
|
|
||||||
# Pattern source: molecule-controlplane `0adf2098`'s ci-required-drift.yml
|
|
||||||
# (just merged 2026-05-11). Same shape (cron + dispatch + sidecar Python +
|
|
||||||
# idempotent-by-title issue), simpler scope (1 source, not 3).
|
|
||||||
|
|
||||||
name: main-red-watchdog
|
|
||||||
|
|
||||||
# IMPORTANT — Gitea 1.22.6 parser quirk per
|
|
||||||
# `feedback_gitea_workflow_dispatch_inputs_unsupported`: do NOT add an
|
|
||||||
# `inputs:` block here. Gitea 1.22.6 rejects the whole workflow as
|
|
||||||
# "unknown on type" when `workflow_dispatch.inputs.X` is present. Revisit
|
|
||||||
# when Gitea ≥ 1.23 is fleet-wide.
|
|
||||||
on:
|
|
||||||
# SCHEDULE RE-ENABLED 2026-05-12 rev3 — interim disable (mc#645) reverted alongside
|
|
||||||
# status-reaper rev3 (widen-window). Job-level timeout-minutes raised 5 → 15 below
|
|
||||||
# to absorb runner-saturation latency without spurious cancels (the original cascade
|
|
||||||
# cause). If runner-saturation root persists, the dedicated-runner-label split
|
|
||||||
# remains the structural next step (tracked separately).
|
|
||||||
schedule:
|
|
||||||
# Hourly at :05 — task spec calls for "off-zero" (`5 * * * *`),
|
|
||||||
# offset from :17 (ci-required-drift) and :00 (peak cron load).
|
|
||||||
- cron: '5 * * * *'
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
# Read commit status + branch ref + issues; write issues (open/PATCH/close).
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
issues: write
|
|
||||||
|
|
||||||
# Workflow-scoped serialisation — two simultaneous runs would race on the
|
|
||||||
# `[main-red] {SHA}` open/PATCH path. Idempotent by title, but parallel
|
|
||||||
# POSTs can produce duplicates before the title search dedup wins.
|
|
||||||
concurrency:
|
|
||||||
group: main-red-watchdog
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
watchdog:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# rev3 (2026-05-12, mc#645 revert): raised 5 → 15 to absorb runner-saturation
|
|
||||||
# latency. Original 5min cap was producing 124-style cancels under load,
|
|
||||||
# which fed the very `[main-red]` issues this workflow files (self-poisoning).
|
|
||||||
# 15min is still well below Gitea-default 6h job ceiling; if a real hang
|
|
||||||
# occurs the issue-file path is still the alarm surface.
|
|
||||||
timeout-minutes: 15
|
|
||||||
steps:
|
|
||||||
- name: Check out repo (script lives at .gitea/scripts/)
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Set up Python (stdlib only — no PyYAML needed here)
|
|
||||||
# The script uses stdlib urllib + json. No PyYAML required (CP's
|
|
||||||
# drift detector needs it for AST parsing; we don't). Pin to the
|
|
||||||
# same 3.12 hermetic interpreter CP uses so the test/runtime
|
|
||||||
# versions stay aligned across watchdog suites.
|
|
||||||
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
||||||
with:
|
|
||||||
python-version: '3.12'
|
|
||||||
|
|
||||||
- name: Run main-red watchdog
|
|
||||||
env:
|
|
||||||
# GITEA_TOKEN reads commit status + writes issues. Falls back
|
|
||||||
# to the auto-injected GITHUB_TOKEN if the org-level secret
|
|
||||||
# isn't set (transitional repos), matching the same pattern
|
|
||||||
# used by deploy-pipeline.yml + ci-required-drift.yml.
|
|
||||||
GITEA_TOKEN: ${{ secrets.GITEA_TOKEN || secrets.GITHUB_TOKEN }}
|
|
||||||
GITEA_HOST: git.moleculesai.app
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
# Branch under watch. `main` per directive; staging not
|
|
||||||
# included here — staging green is a separate gate
|
|
||||||
# (`feedback_staging_e2e_merge_gate`).
|
|
||||||
WATCH_BRANCH: 'main'
|
|
||||||
# Issue label applied on file/open. `tier:high` exists in the
|
|
||||||
# molecule-core label set (verified 2026-05-11, label id 9).
|
|
||||||
# Rationale for high: main red blocks the promotion train and
|
|
||||||
# poisons every PR's auto-rebase base; treat as a fire even
|
|
||||||
# if intermittent.
|
|
||||||
RED_LABEL: 'tier:high'
|
|
||||||
run: python3 .gitea/scripts/main-red-watchdog.py
|
|
||||||
@ -1,146 +0,0 @@
|
|||||||
name: publish-canvas-image
|
|
||||||
|
|
||||||
# Ported from .github/workflows/publish-canvas-image.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
# - **Open question for review**: this workflow pushes the canvas
|
|
||||||
# image to `ghcr.io`. GHCR was retired during the 2026-05-06
|
|
||||||
# Gitea migration in favor of ECR (per staging-verify.yml header
|
|
||||||
# notes). The image may not be consumable post-migration. Two
|
|
||||||
# options for follow-up: (a) retarget to
|
|
||||||
# `153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas`,
|
|
||||||
# or (b) retire this workflow entirely and route canvas deploys
|
|
||||||
# via the operator-host build path. tier:low + continue-on-error
|
|
||||||
# means failed pushes do not block PRs.
|
|
||||||
#
|
|
||||||
|
|
||||||
# Builds and pushes the canvas Docker image to GHCR whenever a commit lands
|
|
||||||
# on main that touches canvas code. Previously canvas changes were visible in
|
|
||||||
# CI (npm run build passed) but the live container was never updated —
|
|
||||||
# operators had to manually run `docker compose build canvas` each time.
|
|
||||||
#
|
|
||||||
# Mirror of publish-platform-image.yml, adapted for the Next.js canvas layer.
|
|
||||||
# See that workflow for inline notes on macOS Keychain isolation and QEMU.
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main]
|
|
||||||
paths:
|
|
||||||
# Only rebuild when canvas source changes — saves GHA minutes on
|
|
||||||
# platform-only / docs-only / MCP-only merges.
|
|
||||||
- 'canvas/**'
|
|
||||||
- '.gitea/workflows/publish-canvas-image.yml'
|
|
||||||
# NOTE (Gitea port): the original GitHub workflow had a
|
|
||||||
# `workflow_dispatch:` manual trigger for the
|
|
||||||
# non-canvas-merge-but-need-fresh-image scenario. Dropped in the
|
|
||||||
# Gitea port (1.22.6 parser-finicky). Manual rebuilds require
|
|
||||||
# pushing an empty commit to canvas/ or running the operator-host
|
|
||||||
# build directly.
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
packages: write # required to push to ghcr.io/${{ github.repository_owner }}/*
|
|
||||||
|
|
||||||
env:
|
|
||||||
IMAGE_NAME: ghcr.io/molecule-ai/canvas
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
build-and-push:
|
|
||||||
name: Build & push canvas image
|
|
||||||
# REVERTED (infra/revert-docker-runner-label): `runs-on: ubuntu-latest` restored.
|
|
||||||
# The `docker` label is not registered on any act_runner. `runs-on: [ubuntu-latest, docker]`
|
|
||||||
# causes jobs to queue indefinitely with zero eligible runners — strictly worse than the
|
|
||||||
# pre-#599 coin-flip (50% success rate). Once the `docker` label is registered on
|
|
||||||
# ≥2 runners, re-apply the fix from #599 (infra/docker-runner-label).
|
|
||||||
# See issue #576 + infra-lead pulse ~00:30Z.
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Log in to GHCR
|
|
||||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3
|
|
||||||
with:
|
|
||||||
registry: ghcr.io
|
|
||||||
username: ${{ github.actor }}
|
|
||||||
password: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
|
|
||||||
- name: Set up Docker Buildx
|
|
||||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
|
|
||||||
|
|
||||||
# Health check: verify Docker daemon is accessible before attempting any
|
|
||||||
# build steps. This fails loudly at step 1 when the runner's docker.sock
|
|
||||||
# is inaccessible rather than silently continuing to the build step
|
|
||||||
# where docker build fails deep in ECR auth with a cryptic error.
|
|
||||||
- name: Verify Docker daemon access
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
echo "::group::Docker daemon health check"
|
|
||||||
echo "Runner: ${HOSTNAME:-unknown}"
|
|
||||||
docker info 2>&1 | head -5 || {
|
|
||||||
echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
|
|
||||||
echo "::error::Runner: ${HOSTNAME:-unknown}"
|
|
||||||
echo "::error::Check: (1) daemon running, (2) runner user in docker group, (3) sock perms 660+"
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
echo "Docker daemon OK"
|
|
||||||
echo "::endgroup::"
|
|
||||||
|
|
||||||
- name: Compute tags
|
|
||||||
id: tags
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Resolve build args
|
|
||||||
id: build_args
|
|
||||||
# Priority: workflow_dispatch input > repo secret > hardcoded default.
|
|
||||||
# NEXT_PUBLIC_* env vars are baked into the JS bundle at build time by
|
|
||||||
# Next.js — they cannot be changed at runtime without a full rebuild.
|
|
||||||
# For local docker-compose deployments the defaults (localhost:8080)
|
|
||||||
# work as-is; production deployments should set CANVAS_PLATFORM_URL
|
|
||||||
# and CANVAS_WS_URL as repository secrets.
|
|
||||||
#
|
|
||||||
# Inputs are passed via env vars (not direct ${{ }} interpolation) to
|
|
||||||
# prevent shell injection from workflow_dispatch string inputs.
|
|
||||||
shell: bash
|
|
||||||
env:
|
|
||||||
INPUT_PLATFORM_URL: ${{ github.event.inputs.platform_url }}
|
|
||||||
SECRET_PLATFORM_URL: ${{ secrets.CANVAS_PLATFORM_URL }}
|
|
||||||
INPUT_WS_URL: ${{ github.event.inputs.ws_url }}
|
|
||||||
SECRET_WS_URL: ${{ secrets.CANVAS_WS_URL }}
|
|
||||||
run: |
|
|
||||||
PLATFORM_URL="${INPUT_PLATFORM_URL:-${SECRET_PLATFORM_URL:-http://localhost:8080}}"
|
|
||||||
WS_URL="${INPUT_WS_URL:-${SECRET_WS_URL:-ws://localhost:8080/ws}}"
|
|
||||||
|
|
||||||
echo "platform_url=${PLATFORM_URL}" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "ws_url=${WS_URL}" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Build & push canvas image to GHCR
|
|
||||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
|
|
||||||
with:
|
|
||||||
context: ./canvas
|
|
||||||
file: ./canvas/Dockerfile
|
|
||||||
platforms: linux/amd64
|
|
||||||
push: true
|
|
||||||
build-args: |
|
|
||||||
NEXT_PUBLIC_PLATFORM_URL=${{ steps.build_args.outputs.platform_url }}
|
|
||||||
NEXT_PUBLIC_WS_URL=${{ steps.build_args.outputs.ws_url }}
|
|
||||||
tags: |
|
|
||||||
${{ env.IMAGE_NAME }}:latest
|
|
||||||
${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }}
|
|
||||||
cache-from: type=gha
|
|
||||||
cache-to: type=gha,mode=max
|
|
||||||
labels: |
|
|
||||||
org.opencontainers.image.source=https://github.com/${{ github.repository }}
|
|
||||||
org.opencontainers.image.revision=${{ github.sha }}
|
|
||||||
org.opencontainers.image.description=Molecule AI canvas (Next.js 15 + React Flow)
|
|
||||||
@ -1,149 +0,0 @@
|
|||||||
name: publish-runtime-autobump
|
|
||||||
|
|
||||||
# Auto-bump-on-workspace-edit half of the publish pipeline.
|
|
||||||
#
|
|
||||||
# Why this file exists (issue #351):
|
|
||||||
# Gitea Actions does not correctly disambiguate `paths:` from `tags:`
|
|
||||||
# when both are bundled under a single `on.push` key. The result is
|
|
||||||
# that tag pushes get filtered out and `publish-runtime.yml` never
|
|
||||||
# fires — `action_run` rows: 0. This was unnoticed pre-2026-05-11
|
|
||||||
# because PYPI_TOKEN was absent (publishes would have failed anyway).
|
|
||||||
#
|
|
||||||
# Split design:
|
|
||||||
# - publish-runtime.yml : on.push.tags only (the publisher)
|
|
||||||
# - publish-runtime-autobump.yml: on.push.branches+paths (this file — the version-bumper)
|
|
||||||
#
|
|
||||||
# This file computes the next version from PyPI's latest, pushes a
|
|
||||||
# `runtime-v$VERSION` tag, and exits. The tag push then triggers
|
|
||||||
# publish-runtime.yml via its tags-only trigger.
|
|
||||||
#
|
|
||||||
# Concurrency: shares the `publish-runtime` group with publish-runtime.yml
|
|
||||||
# so concurrent workspace pushes serialize at the bump step. Without
|
|
||||||
# this, two pushes minutes apart could both read PyPI latest=0.1.129
|
|
||||||
# and try to tag 0.1.130 simultaneously, only one of which would land.
|
|
||||||
|
|
||||||
on:
|
|
||||||
# Run on PR pushes to post a success status so Gitea can merge the PR.
|
|
||||||
# All steps use continue-on-error: true so operational failures
|
|
||||||
# (PyPI unreachable, DISPATCH_TOKEN missing) do not block merge.
|
|
||||||
pull_request:
|
|
||||||
paths:
|
|
||||||
- "workspace/**"
|
|
||||||
# Bump-and-tag on main/staging push (the actual operational trigger).
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
- staging
|
|
||||||
paths:
|
|
||||||
- "workspace/**"
|
|
||||||
# Manual dispatch — useful when Gitea Actions API (/actions/*) is
|
|
||||||
# unreachable (e.g. act_runner 404 on Gitea 1.22.6) and we cannot
|
|
||||||
# re-trigger via curl.
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: write # required to push tags back
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: publish-runtime
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
# PR-validation path: always succeeds so Gitea can merge workflow-only PRs.
|
|
||||||
# Operational failures (PyPI unreachable, missing DISPATCH_TOKEN) are
|
|
||||||
# surfaced via continue-on-error: true rather than blocking the merge.
|
|
||||||
# The actual bump work happens on the main/staging push after merge.
|
|
||||||
pr-validate:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
continue-on-error: true # do not block PR merge on operational failures
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
fetch-depth: 1
|
|
||||||
|
|
||||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
|
||||||
with:
|
|
||||||
python-version: "3.11"
|
|
||||||
|
|
||||||
- name: Validate PyPI connectivity (best-effort)
|
|
||||||
run: |
|
|
||||||
set -eu
|
|
||||||
echo "=== Checking PyPI accessibility ==="
|
|
||||||
LATEST=$(curl -fsS --retry 3 --max-time 10 \
|
|
||||||
https://pypi.org/pypi/molecule-ai-workspace-runtime/json \
|
|
||||||
| python -c "import sys,json; print(json.load(sys.stdin)['info']['version'])" \
|
|
||||||
|| echo "PyPI unreachable (non-blocking for PR validation)")
|
|
||||||
echo "Latest: ${LATEST:-unknown}"
|
|
||||||
|
|
||||||
# Actual bump-and-tag: runs on main/staging pushes, posts real success/failure.
|
|
||||||
# No continue-on-error — operational failures here trip the main-red
|
|
||||||
# watchdog, which is the desired signal for infrastructure degradation.
|
|
||||||
bump-and-tag:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Only fire on push events (main/staging after PR merge). Pull_request
|
|
||||||
# events are handled by pr-validate above; we do NOT bump on every
|
|
||||||
# push-synchronize because that would race with the PR head.
|
|
||||||
#
|
|
||||||
# NOTE: the prior condition `github.event.pull_request.base.ref == ''`
|
|
||||||
# was broken — on a PR-merge push in Gitea Actions, the pull_request
|
|
||||||
# context is still attached (base.ref='main'), so the condition always
|
|
||||||
# evaluated to false and bump-and-tag was permanently skipped.
|
|
||||||
if: github.event_name == 'push'
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
fetch-depth: 1
|
|
||||||
|
|
||||||
- name: Fetch tags for collision check
|
|
||||||
run: git fetch origin --tags --depth=1
|
|
||||||
|
|
||||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
|
||||||
with:
|
|
||||||
python-version: "3.11"
|
|
||||||
|
|
||||||
- name: Compute next version from PyPI latest
|
|
||||||
id: bump
|
|
||||||
run: |
|
|
||||||
set -eu
|
|
||||||
LATEST=$(curl -fsS --retry 3 https://pypi.org/pypi/molecule-ai-workspace-runtime/json \
|
|
||||||
| python -c "import sys,json; print(json.load(sys.stdin)['info']['version'])")
|
|
||||||
MAJOR=$(echo "$LATEST" | cut -d. -f1)
|
|
||||||
MINOR=$(echo "$LATEST" | cut -d. -f2)
|
|
||||||
PATCH=$(echo "$LATEST" | cut -d. -f3)
|
|
||||||
VERSION="${MAJOR}.${MINOR}.$((PATCH+1))"
|
|
||||||
echo "PyPI latest=$LATEST -> next=$VERSION"
|
|
||||||
if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+$'; then
|
|
||||||
echo "::error::computed version $VERSION does not match PEP 440 X.Y.Z"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
if git tag --list | grep -qx "runtime-v$VERSION"; then
|
|
||||||
echo "::error::tag runtime-v$VERSION already exists in this repo. Manual intervention required (PyPI and Gitea tag history are out of sync)."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Push runtime-v$VERSION tag
|
|
||||||
env:
|
|
||||||
DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
|
|
||||||
VERSION: ${{ steps.bump.outputs.version }}
|
|
||||||
GITEA_URL: https://git.moleculesai.app
|
|
||||||
run: |
|
|
||||||
set -eu
|
|
||||||
if [ -z "$DISPATCH_TOKEN" ]; then
|
|
||||||
echo "::error::DISPATCH_TOKEN secret is not set — needed to push the tag back to molecule-core."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
git config user.name "publish-runtime autobump"
|
|
||||||
git config user.email "publish-runtime@moleculesai.app"
|
|
||||||
git tag -a "runtime-v$VERSION" \
|
|
||||||
-m "Auto-bump on workspace/** edit on $GITHUB_REF" \
|
|
||||||
-m "Triggered by: $GITHUB_REF @ $GITHUB_SHA" \
|
|
||||||
-m "publish-runtime.yml will pick up this tag and upload to PyPI"
|
|
||||||
# Push via DISPATCH_TOKEN (a Gitea PAT). Using the bot identity
|
|
||||||
# ensures the resulting tag-push event is dispatched to
|
|
||||||
# publish-runtime.yml; act_runner's default GITHUB_TOKEN cannot
|
|
||||||
# trigger downstream workflows.
|
|
||||||
git remote set-url origin "${GITEA_URL#https://}"
|
|
||||||
git remote set-url origin "https://x-access-token:${DISPATCH_TOKEN}@${GITEA_URL#https://}/molecule-ai/molecule-core.git"
|
|
||||||
git push origin "runtime-v$VERSION"
|
|
||||||
echo "✓ pushed runtime-v$VERSION — publish-runtime.yml should fire next"
|
|
||||||
@ -1,339 +0,0 @@
|
|||||||
name: publish-runtime
|
|
||||||
|
|
||||||
# Gitea Actions port of .github/workflows/publish-runtime.yml.
|
|
||||||
#
|
|
||||||
# Ported 2026-05-10 (issue #206). Key differences from the GitHub version:
|
|
||||||
# - Gitea Actions reads .gitea/workflows/, not .github/workflows/
|
|
||||||
# - Dropped `environment: pypi-publish` — Gitea Actions does not support
|
|
||||||
# named environments or OIDC trusted publishers
|
|
||||||
# - Replaced `pypa/gh-action-pypi-publish@release/v1` (OIDC) with
|
|
||||||
# `twine upload` using PYPI_TOKEN secret — same mechanism as a local
|
|
||||||
# `python -m twine upload` with a PyPI token
|
|
||||||
# - Replaced `github.ref_name` (GitHub-only) with `${GITHUB_REF#refs/tags/}`
|
|
||||||
# — Gitea Actions exposes github.ref (the full ref) but not ref_name
|
|
||||||
# - Dropped `merge_group` trigger (Gitea has no merge queue)
|
|
||||||
#
|
|
||||||
# 2026-05-10 (issue #348): originally restored `staging`/`main` branch +
|
|
||||||
# `workspace/**` path-filter trigger in PR #349.
|
|
||||||
#
|
|
||||||
# 2026-05-11 (issue #351): REVERTED the branches+paths trigger from THIS
|
|
||||||
# file. Bundling `paths` with `tags` under a single `on.push` key caused
|
|
||||||
# Gitea Actions to never dispatch the workflow for tag-push events (0
|
|
||||||
# runs in `action_run` for workflow_id='publish-runtime.yml' since the
|
|
||||||
# port, including the runtime-v1.0.0 tag — which is why PyPI is still at
|
|
||||||
# 0.1.129 despite a v1.0.0 Gitea tag existing).
|
|
||||||
#
|
|
||||||
# The auto-bump-on-workspace-edit trigger now lives in
|
|
||||||
# `.gitea/workflows/publish-runtime-autobump.yml`. That file computes the
|
|
||||||
# next version from PyPI's latest and pushes a `runtime-v$VERSION` tag,
|
|
||||||
# which THIS file then picks up via the tags-only trigger below.
|
|
||||||
#
|
|
||||||
# This decoupling means Gitea's path-vs-tag evaluator never has to
|
|
||||||
# disambiguate — each file has a single unambiguous trigger shape.
|
|
||||||
#
|
|
||||||
# PyPI publishing: requires PYPI_TOKEN repository secret (or org-level secret).
|
|
||||||
# Set via: repo Settings → Actions → Variables and Secrets → New Secret.
|
|
||||||
# The token should be a PyPI API token scoped to molecule-ai-workspace-runtime.
|
|
||||||
#
|
|
||||||
# The DISPATCH_TOKEN cascade (git push to template repos) is unchanged —
|
|
||||||
# it uses the Gitea API directly and was already Gitea-compatible.
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
tags:
|
|
||||||
- "runtime-v*"
|
|
||||||
workflow_dispatch:
|
|
||||||
# 2026-05-11 (root cause of #351 / 0 runs ever):
|
|
||||||
# Gitea 1.22.6's workflow parser rejects `workflow_dispatch.inputs.version`
|
|
||||||
# with "unknown on type" — it mis-treats the inputs sub-keys as top-level
|
|
||||||
# `on:` event types. Log line:
|
|
||||||
# actions/workflows.go:DetectWorkflows() [W] ignore invalid workflow
|
|
||||||
# "publish-runtime.yml": unknown on type: map["version": {...}]
|
|
||||||
# That `[W] ignore invalid workflow` is silent UX — the workflow never
|
|
||||||
# registers, so it never fires for ANY event (push.tags included).
|
|
||||||
# Removing the inputs block restores parsing. Manual dispatch from the
|
|
||||||
# Gitea UI now triggers the PyPI auto-bump fallback in `Derive version`
|
|
||||||
# below (no `inputs.version` to read).
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
# Serialize publishes so two concurrent tag pushes don't both compute
|
|
||||||
# "latest+1" and race on PyPI upload. The second one waits.
|
|
||||||
concurrency:
|
|
||||||
group: publish-runtime
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
publish:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
outputs:
|
|
||||||
version: ${{ steps.version.outputs.version }}
|
|
||||||
wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }}
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
|
||||||
with:
|
|
||||||
python-version: "3.11"
|
|
||||||
cache: pip
|
|
||||||
|
|
||||||
- name: Derive version (tag or PyPI auto-bump)
|
|
||||||
id: version
|
|
||||||
run: |
|
|
||||||
if echo "$GITHUB_REF" | grep -q "^refs/tags/runtime-v"; then
|
|
||||||
# Tag is `runtime-vX.Y.Z` — strip the prefix.
|
|
||||||
VERSION="${GITHUB_REF#refs/tags/runtime-v}"
|
|
||||||
else
|
|
||||||
# workflow_dispatch path (no inputs supported on Gitea 1.22.6) or
|
|
||||||
# any other non-tag trigger: derive from PyPI latest + patch bump.
|
|
||||||
LATEST=$(curl -fsS --retry 3 https://pypi.org/pypi/molecule-ai-workspace-runtime/json \
|
|
||||||
| python -c "import sys,json; print(json.load(sys.stdin)['info']['version'])")
|
|
||||||
MAJOR=$(echo "$LATEST" | cut -d. -f1)
|
|
||||||
MINOR=$(echo "$LATEST" | cut -d. -f2)
|
|
||||||
PATCH=$(echo "$LATEST" | cut -d. -f3)
|
|
||||||
VERSION="${MAJOR}.${MINOR}.$((PATCH+1))"
|
|
||||||
echo "Auto-bumped from PyPI latest $LATEST -> $VERSION"
|
|
||||||
fi
|
|
||||||
if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+(\.dev[0-9]+|rc[0-9]+|a[0-9]+|b[0-9]+|\.post[0-9]+)?$'; then
|
|
||||||
echo "::error::version $VERSION does not match PEP 440"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "Publishing molecule-ai-workspace-runtime $VERSION"
|
|
||||||
|
|
||||||
- name: Install build tooling
|
|
||||||
run: pip install build twine
|
|
||||||
|
|
||||||
- name: Build package from workspace/
|
|
||||||
run: |
|
|
||||||
python scripts/build_runtime_package.py \
|
|
||||||
--version "${{ steps.version.outputs.version }}" \
|
|
||||||
--out "${{ runner.temp }}/runtime-build"
|
|
||||||
|
|
||||||
- name: Build wheel + sdist
|
|
||||||
working-directory: ${{ runner.temp }}/runtime-build
|
|
||||||
run: python -m build
|
|
||||||
|
|
||||||
- name: Capture wheel SHA256 for cascade content-verification
|
|
||||||
id: wheel_hash
|
|
||||||
working-directory: ${{ runner.temp }}/runtime-build
|
|
||||||
run: |
|
|
||||||
set -eu
|
|
||||||
WHEEL=$(ls dist/*.whl 2>/dev/null | head -1)
|
|
||||||
if [ -z "$WHEEL" ]; then
|
|
||||||
echo "::error::No .whl in dist/ — \`python -m build\` must have failed silently"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
HASH=$(sha256sum "$WHEEL" | awk '{print $1}')
|
|
||||||
echo "wheel_sha256=${HASH}" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "Local wheel SHA256 (pre-upload): ${HASH}"
|
|
||||||
echo "Wheel filename: $(basename "$WHEEL")"
|
|
||||||
|
|
||||||
- name: Verify package contents (sanity)
|
|
||||||
working-directory: ${{ runner.temp }}/runtime-build
|
|
||||||
run: |
|
|
||||||
python -m twine check dist/*
|
|
||||||
python -m venv /tmp/smoke
|
|
||||||
/tmp/smoke/bin/pip install --quiet dist/*.whl
|
|
||||||
/tmp/smoke/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
|
|
||||||
|
|
||||||
- name: Publish to PyPI
|
|
||||||
# working-directory matches the preceding Build/Verify steps. Without
|
|
||||||
# this, twine runs from the default workspace checkout dir where
|
|
||||||
# `dist/` doesn't exist and fails with:
|
|
||||||
# ERROR InvalidDistribution: Cannot find file (or expand pattern): 'dist/*'
|
|
||||||
# Caught on the first-ever successful dispatch of this workflow
|
|
||||||
# (run 5097, 2026-05-11 02:08Z) — every other step in the publish
|
|
||||||
# job already had this working-directory; Publish was missing it.
|
|
||||||
working-directory: ${{ runner.temp }}/runtime-build
|
|
||||||
env:
|
|
||||||
# PYPI_TOKEN: repository secret scoped to molecule-ai-workspace-runtime.
|
|
||||||
# Set via: Settings → Actions → Variables and Secrets → New Secret.
|
|
||||||
# Format: pypi-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
||||||
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
|
|
||||||
run: |
|
|
||||||
if [ -z "$PYPI_TOKEN" ]; then
|
|
||||||
echo "::error::PYPI_TOKEN secret is not set — set it at Settings → Actions → Variables and Secrets → New Secret."
|
|
||||||
echo "::error::Required format: pypi-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
python -m twine upload \
|
|
||||||
--repository pypi \
|
|
||||||
--username __token__ \
|
|
||||||
--password "$PYPI_TOKEN" \
|
|
||||||
dist/*
|
|
||||||
|
|
||||||
cascade:
|
|
||||||
needs: publish
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- name: Wait for PyPI to propagate the new version
|
|
||||||
env:
|
|
||||||
RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
|
|
||||||
EXPECTED_SHA256: ${{ needs.publish.outputs.wheel_sha256 }}
|
|
||||||
run: |
|
|
||||||
set -eu
|
|
||||||
if [ -z "$EXPECTED_SHA256" ]; then
|
|
||||||
echo "::error::publish job did not expose wheel_sha256 — cannot verify wheel content. Refusing to fan out cascade."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
python -m venv /tmp/propagation-probe
|
|
||||||
PROBE=/tmp/propagation-probe/bin
|
|
||||||
$PROBE/pip install --upgrade --quiet pip
|
|
||||||
for i in $(seq 1 30); do
|
|
||||||
if $PROBE/pip install \
|
|
||||||
--quiet \
|
|
||||||
--no-cache-dir \
|
|
||||||
--force-reinstall \
|
|
||||||
--no-deps \
|
|
||||||
"molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
|
|
||||||
>/dev/null 2>&1; then
|
|
||||||
INSTALLED=$($PROBE/pip show molecule-ai-workspace-runtime 2>/dev/null \
|
|
||||||
| awk -F': ' '/^Version:/{print $2}')
|
|
||||||
if [ "$INSTALLED" = "$RUNTIME_VERSION" ]; then
|
|
||||||
echo "✓ PyPI resolved $RUNTIME_VERSION (install check)"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
if [ $i -eq 30 ]; then
|
|
||||||
echo "::error::pip install --no-cache-dir molecule-ai-workspace-runtime==${RUNTIME_VERSION} never resolved within ~5 min."
|
|
||||||
echo "::error::Refusing to fan out cascade against a potentially stale PyPI index."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo " [$i/30] waiting for PyPI to propagate ${RUNTIME_VERSION}..."
|
|
||||||
sleep 4
|
|
||||||
done
|
|
||||||
|
|
||||||
# Stage (b): download wheel + SHA256 compare against what we built.
|
|
||||||
# Catches Fastly stale-content serving old bytes under a new version URL.
|
|
||||||
#
|
|
||||||
# Caught run 5196 (first-ever successful publish, 2026-05-11): the
|
|
||||||
# previous one-liner `HASH=$(pip download ... && sha256sum ...)`
|
|
||||||
# captured pip's stdout (`Collecting molecule-ai-workspace-runtime
|
|
||||||
# ==X.Y.Z`) into HASH, then the SHA comparison failed against the
|
|
||||||
# leaked `Collecting...` string. `2>/dev/null` silences stderr but
|
|
||||||
# NOT stdout; pip writes its progress to stdout by default.
|
|
||||||
# Fix: split into two steps, silence pip's stdout explicitly, capture
|
|
||||||
# only sha256sum's output into HASH.
|
|
||||||
python -m pip download \
|
|
||||||
--no-deps \
|
|
||||||
--no-cache-dir \
|
|
||||||
--dest /tmp/wheel-probe \
|
|
||||||
--quiet \
|
|
||||||
"molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
|
|
||||||
>/dev/null 2>&1
|
|
||||||
HASH=$(sha256sum /tmp/wheel-probe/*.whl | awk '{print $1}')
|
|
||||||
if [ "$HASH" != "$EXPECTED_SHA256" ]; then
|
|
||||||
echo "::error::PyPI propagated $RUNTIME_VERSION but wheel content SHA256 mismatch."
|
|
||||||
echo "::error::Expected: $EXPECTED_SHA256"
|
|
||||||
echo "::error::Got: $HASH"
|
|
||||||
echo "::error::Fastly may be serving stale content. Refusing to fan out cascade."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "✓ PyPI CDN verified (SHA256 match)"
|
|
||||||
|
|
||||||
- name: Fan out via push to .runtime-version
|
|
||||||
env:
|
|
||||||
# Gitea PAT with write:repository scope on the 8 cascade-active
|
|
||||||
# template repos. Used for git push to each template repo's main
|
|
||||||
# branch, which trips their `on: push: branches: [main]` trigger
|
|
||||||
# on publish-image.yml.
|
|
||||||
DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
|
|
||||||
RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
|
|
||||||
run: |
|
|
||||||
set +e # don't abort on a single repo failure — collect them all
|
|
||||||
|
|
||||||
if [ -z "$DISPATCH_TOKEN" ]; then
|
|
||||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
|
||||||
echo "::warning::DISPATCH_TOKEN secret not set — skipping cascade."
|
|
||||||
echo "::warning::set it at Settings → Actions → Variables and Secrets → New Secret."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
echo "::error::DISPATCH_TOKEN secret missing — cascade cannot fan out."
|
|
||||||
echo "::error::PyPI was published, but the 8 template repos will NOT pick up the new version."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
VERSION="$RUNTIME_VERSION"
|
|
||||||
if [ -z "$VERSION" ]; then
|
|
||||||
echo "::error::publish job did not expose a version output"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
GITEA_URL="${GITEA_URL:-https://git.moleculesai.app}"
|
|
||||||
TEMPLATES="claude-code hermes openclaw codex langgraph crewai autogen deepagents gemini-cli"
|
|
||||||
FAILED=""
|
|
||||||
SKIPPED=""
|
|
||||||
|
|
||||||
git config --global user.name "publish-runtime cascade"
|
|
||||||
git config --global user.email "publish-runtime@moleculesai.app"
|
|
||||||
|
|
||||||
WORKDIR="$(mktemp -d)"
|
|
||||||
for tpl in $TEMPLATES; do
|
|
||||||
REPO="molecule-ai/molecule-ai-workspace-template-$tpl"
|
|
||||||
CLONE="$WORKDIR/$tpl"
|
|
||||||
|
|
||||||
HTTP=$(curl -sS -o /dev/null -w "%{http_code}" \
|
|
||||||
-H "Authorization: token $DISPATCH_TOKEN" \
|
|
||||||
"$GITEA_URL/api/v1/repos/$REPO/contents/.github/workflows/publish-image.yml")
|
|
||||||
if [ "$HTTP" = "404" ]; then
|
|
||||||
echo "↷ $tpl has no publish-image.yml — soft-skip"
|
|
||||||
SKIPPED="$SKIPPED $tpl"
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
attempt=0
|
|
||||||
success=false
|
|
||||||
while [ $attempt -lt 3 ]; do
|
|
||||||
attempt=$((attempt + 1))
|
|
||||||
rm -rf "$CLONE"
|
|
||||||
if ! git clone --depth=1 \
|
|
||||||
"https://x-access-token:${DISPATCH_TOKEN}@${GITEA_URL#https://}/$REPO.git" \
|
|
||||||
"$CLONE" >/tmp/clone.log 2>&1; then
|
|
||||||
echo "::warning::clone $tpl attempt $attempt failed: $(tail -n3 /tmp/clone.log)"
|
|
||||||
sleep 2
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
cd "$CLONE"
|
|
||||||
echo "$VERSION" > .runtime-version
|
|
||||||
|
|
||||||
if git diff --quiet -- .runtime-version; then
|
|
||||||
echo "✓ $tpl already at $VERSION — no commit needed"
|
|
||||||
success=true
|
|
||||||
cd - >/dev/null
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
git add .runtime-version
|
|
||||||
git commit -m "chore: pin runtime to $VERSION (publish-runtime cascade)" \
|
|
||||||
-m "Co-Authored-By: publish-runtime cascade <publish-runtime@moleculesai.app>" \
|
|
||||||
>/dev/null
|
|
||||||
|
|
||||||
if git push origin HEAD:main >/tmp/push.log 2>&1; then
|
|
||||||
echo "✓ $tpl pushed $VERSION on attempt $attempt"
|
|
||||||
success=true
|
|
||||||
cd - >/dev/null
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "::warning::push $tpl attempt $attempt failed, pull-rebasing"
|
|
||||||
git pull --rebase origin main >/tmp/rebase.log 2>&1 || true
|
|
||||||
cd - >/dev/null
|
|
||||||
done
|
|
||||||
|
|
||||||
if [ "$success" != "true" ]; then
|
|
||||||
FAILED="$FAILED $tpl"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
rm -rf "$WORKDIR"
|
|
||||||
|
|
||||||
if [ -n "$FAILED" ]; then
|
|
||||||
echo "::error::Cascade incomplete after 3 retries each. Failed:$FAILED"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
if [ -n "$SKIPPED" ]; then
|
|
||||||
echo "Cascade complete: pinned $VERSION. Soft-skipped (no publish-image.yml):$SKIPPED"
|
|
||||||
else
|
|
||||||
echo "Cascade complete: $VERSION pinned across all manifest workspace_templates."
|
|
||||||
fi
|
|
||||||
@ -1,177 +0,0 @@
|
|||||||
name: publish-workspace-server-image
|
|
||||||
|
|
||||||
# Gitea Actions port of .github/workflows/publish-workspace-server-image.yml.
|
|
||||||
#
|
|
||||||
# Ported 2026-05-10 (issue #228). Key differences from the GitHub version:
|
|
||||||
# - Gitea Actions reads .gitea/workflows/, not .github/workflows/
|
|
||||||
# - Dropped `environment:` declarations — Gitea Actions does not support
|
|
||||||
# named environments (used by GitHub OIDC token gates)
|
|
||||||
# - Replaced `github.ref_name` (GitHub-only) with `${GITHUB_REF#refs/heads/}`
|
|
||||||
# — Gitea Actions exposes GITHUB_REF in the same format as GitHub Actions
|
|
||||||
# - docker/setup-buildx-action and aws-actions/configure-aws-credentials are
|
|
||||||
# GitHub Marketplace actions; they are installed by Gitea Actions runners and
|
|
||||||
# work identically here
|
|
||||||
# - All other variables (GITHUB_SHA, GITHUB_REPOSITORY, GITHUB_OUTPUT,
|
|
||||||
# secrets.*) use the same syntax as GitHub Actions
|
|
||||||
#
|
|
||||||
# Image tags produced:
|
|
||||||
# :staging-<sha> — per-commit digest, stable for canary verify
|
|
||||||
# :staging-latest — tracks most recent build on this branch
|
|
||||||
#
|
|
||||||
# ECR target: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/*
|
|
||||||
# Required secrets: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AUTO_SYNC_TOKEN
|
|
||||||
#
|
|
||||||
# mc#711: Docker daemon not accessible on ubuntu-latest runner (molecule-canonical-1
|
|
||||||
# shows client-only in `docker info` — daemon not running). DinD mount is present but
|
|
||||||
# daemon doesn't respond. Fix: add diagnostic step showing socket info so ops can
|
|
||||||
# identify which runners have a live daemon. If no daemon is available, the job
|
|
||||||
# fails fast with actionable output rather than silent deep failure.
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main]
|
|
||||||
paths:
|
|
||||||
- 'workspace-server/**'
|
|
||||||
- 'canvas/**'
|
|
||||||
- 'manifest.json'
|
|
||||||
- 'scripts/**'
|
|
||||||
- '.gitea/workflows/publish-workspace-server-image.yml'
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
# Serialize per-branch so two rapid main pushes don't race the same
|
|
||||||
# :staging-latest tag retag. Allow parallel runs as they produce
|
|
||||||
# different :staging-<sha> tags and last-write-wins on :staging-latest.
|
|
||||||
#
|
|
||||||
# cancel-in-progress: false → in-flight builds finish; the next push's
|
|
||||||
# build queues. This avoids a partially-pushed image.
|
|
||||||
concurrency:
|
|
||||||
group: publish-workspace-server-image-${{ github.ref }}
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
packages: write
|
|
||||||
|
|
||||||
env:
|
|
||||||
IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
|
|
||||||
TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
build-and-push:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Diagnose Docker daemon access
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
echo "::group::Docker daemon diagnosis"
|
|
||||||
echo "Runner: ${HOSTNAME:-unknown}"
|
|
||||||
echo "--- Socket info ---"
|
|
||||||
ls -la /var/run/docker.sock 2>/dev/null || echo "/var/run/docker.sock: not found"
|
|
||||||
stat /var/run/docker.sock 2>/dev/null || true
|
|
||||||
echo "--- User info ---"
|
|
||||||
id
|
|
||||||
echo "--- docker version ---"
|
|
||||||
docker version 2>&1 || true
|
|
||||||
echo "--- docker info (full) ---"
|
|
||||||
docker info 2>&1 || echo "docker info failed: exit $?"
|
|
||||||
echo "::endgroup::"
|
|
||||||
|
|
||||||
# Pre-clone manifest deps before docker build.
|
|
||||||
#
|
|
||||||
# Why: workspace-template-* repos on Gitea are private. The pre-fix
|
|
||||||
# Dockerfile.tenant ran `git clone` inside an in-image stage with no
|
|
||||||
# auth path — every CI build failed. We clone in the trusted CI
|
|
||||||
# context where AUTO_SYNC_TOKEN is available and Dockerfile.tenant
|
|
||||||
# just COPYs from .tenant-bundle-deps/.
|
|
||||||
#
|
|
||||||
# Token: AUTO_SYNC_TOKEN is the devops-engineer persona PAT.
|
|
||||||
# clone-manifest.sh embeds it as basic-auth for the clones, then
|
|
||||||
# strips .git dirs — the token never enters the image.
|
|
||||||
- name: Pre-clone manifest deps
|
|
||||||
env:
|
|
||||||
MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
mkdir -p .tenant-bundle-deps
|
|
||||||
# Strip JSON5 comments before jq parsing — Integration Tester appends
|
|
||||||
# `// Triggered by ...` which breaks `jq` in clone-manifest.sh.
|
|
||||||
sed '/^[[:space:]]*\/\//d' manifest.json > .manifest-stripped.json
|
|
||||||
bash scripts/clone-manifest.sh \
|
|
||||||
.manifest-stripped.json \
|
|
||||||
.tenant-bundle-deps/workspace-configs-templates \
|
|
||||||
.tenant-bundle-deps/org-templates \
|
|
||||||
.tenant-bundle-deps/plugins
|
|
||||||
ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
|
|
||||||
org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
|
|
||||||
plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
|
|
||||||
echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
|
|
||||||
|
|
||||||
- name: Compute tags
|
|
||||||
id: tags
|
|
||||||
run: |
|
|
||||||
echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
# Build + push platform image (inline ECR auth — mirrors the operator-host
|
|
||||||
# approach; credentials come from GITHUB_SECRET_AWS_ACCESS_KEY_ID /
|
|
||||||
# GITHUB_SECRET_AWS_SECRET_ACCESS_KEY in Gitea Actions).
|
|
||||||
# docker buildx bake / build required for `imagetools inspect` digest
|
|
||||||
# capture in the CP pin-update step (RFC internal#229 §X step 4 PR-1).
|
|
||||||
- name: Set up Docker Buildx
|
|
||||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
|
|
||||||
|
|
||||||
- name: Build & push platform image to ECR (staging-<sha> + staging-latest)
|
|
||||||
env:
|
|
||||||
IMAGE_NAME: ${{ env.IMAGE_NAME }}
|
|
||||||
TAG_SHA: staging-${{ steps.tags.outputs.sha }}
|
|
||||||
TAG_LATEST: staging-latest
|
|
||||||
GIT_SHA: ${{ github.sha }}
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
|
||||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
|
||||||
AWS_DEFAULT_REGION: us-east-2
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
ECR_REGISTRY="${IMAGE_NAME%%/*}"
|
|
||||||
aws ecr get-login-password --region us-east-2 | \
|
|
||||||
docker login --username AWS --password-stdin "${ECR_REGISTRY}"
|
|
||||||
docker buildx build \
|
|
||||||
--file ./workspace-server/Dockerfile \
|
|
||||||
--build-arg GIT_SHA="${GIT_SHA}" \
|
|
||||||
--label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
|
|
||||||
--label "org.opencontainers.image.revision=${GIT_SHA}" \
|
|
||||||
--label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
|
|
||||||
--label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
|
|
||||||
--tag "${IMAGE_NAME}:${TAG_SHA}" \
|
|
||||||
--tag "${IMAGE_NAME}:${TAG_LATEST}" \
|
|
||||||
--push .
|
|
||||||
|
|
||||||
# Build + push tenant image (Go platform + Next.js canvas in one image).
|
|
||||||
- name: Build & push tenant image to ECR (staging-<sha> + staging-latest)
|
|
||||||
env:
|
|
||||||
TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
|
|
||||||
TAG_SHA: staging-${{ steps.tags.outputs.sha }}
|
|
||||||
TAG_LATEST: staging-latest
|
|
||||||
GIT_SHA: ${{ github.sha }}
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
|
||||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
|
||||||
AWS_DEFAULT_REGION: us-east-2
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
ECR_REGISTRY="${TENANT_IMAGE_NAME%%/*}"
|
|
||||||
aws ecr get-login-password --region us-east-2 | \
|
|
||||||
docker login --username AWS --password-stdin "${ECR_REGISTRY}"
|
|
||||||
docker buildx build \
|
|
||||||
--file ./workspace-server/Dockerfile.tenant \
|
|
||||||
--build-arg NEXT_PUBLIC_PLATFORM_URL= \
|
|
||||||
--build-arg GIT_SHA="${GIT_SHA}" \
|
|
||||||
--label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
|
|
||||||
--label "org.opencontainers.image.revision=${GIT_SHA}" \
|
|
||||||
--label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
|
|
||||||
--label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
|
|
||||||
--tag "${TENANT_IMAGE_NAME}:${TAG_SHA}" \
|
|
||||||
--tag "${TENANT_IMAGE_NAME}:${TAG_LATEST}" \
|
|
||||||
--push .
|
|
||||||
@ -1,164 +0,0 @@
|
|||||||
# qa-review — non-author APPROVE from the `qa` Gitea team required to merge.
|
|
||||||
#
|
|
||||||
# RFC#324 Step 1 of 5 (workflow-add). Pairs with `security-review.yml` and the
|
|
||||||
# branch-protection flip in Step 2.
|
|
||||||
#
|
|
||||||
# === DESIGN (RFC#324 v1.1 addendum) ===
|
|
||||||
#
|
|
||||||
# A1-α (refire mechanism):
|
|
||||||
# Triggers on:
|
|
||||||
# - `pull_request_target`: opened, synchronize, reopened
|
|
||||||
# → initial status posts when PR opens / re-pushes
|
|
||||||
# - `issue_comment`: /qa-recheck slash-command on the PR
|
|
||||||
# → manual re-fire after a QA reviewer clicks APPROVE
|
|
||||||
# (Gitea 1.22.6 doesn't re-fire on pull_request_review, per
|
|
||||||
# go-gitea/gitea#33700 + feedback_pull_request_review_no_refire)
|
|
||||||
# Workflow name = `qa-review` ; job name = `approved`.
|
|
||||||
# The job's own pass/fail conclusion publishes the status context
|
|
||||||
# `qa-review / approved (<event>)` — NO `POST /statuses` call → NO
|
|
||||||
# write:repository token scope needed. Sidesteps internal#321 defect #2.
|
|
||||||
#
|
|
||||||
# A1.1 (privilege check on slash-comment — INFORMATIONAL ONLY, NOT a gate):
|
|
||||||
# The `issue_comment` event fires for ANY commenter, including
|
|
||||||
# non-collaborators. The original (v1.2) design gated the eval step
|
|
||||||
# behind a collaborator probe → if a non-collaborator commented
|
|
||||||
# /qa-recheck, the eval was `if:`-skipped → the job exited 0 anyway →
|
|
||||||
# the status context published `success` with ZERO real APPROVE.
|
|
||||||
# That was a fail-open: any visitor could green the gate.
|
|
||||||
#
|
|
||||||
# RFC#324 v1.3 §A1.1 correction (option b per hongming-pc 1421):
|
|
||||||
# drop privilege-gating of the evaluation entirely. The eval is
|
|
||||||
# read-only and idempotent — it reads `pulls/{N}/reviews` and
|
|
||||||
# `teams/{id}/members/{u}` (both API-side state that a commenter can't
|
|
||||||
# change). Re-running it on a non-collaborator's comment is harmless
|
|
||||||
# AND correct: if a real team-member APPROVE exists, the eval flips
|
|
||||||
# green; if not, it stays red.
|
|
||||||
#
|
|
||||||
# We KEEP the privilege step as a `::notice::` log line only — useful
|
|
||||||
# for griefer-spotting (one operator spamming /recheck) without
|
|
||||||
# touching the gate. If rate-limiting is needed later, add it as a
|
|
||||||
# separate concern (time-window throttle, not a privilege gate).
|
|
||||||
#
|
|
||||||
# We MUST NOT use `github.event.comment.author_association` (the
|
|
||||||
# field doesn't exist on Gitea 1.22.6 webhook payload — this was
|
|
||||||
# sop-tier-refire's defect #1).
|
|
||||||
#
|
|
||||||
# A4 (no PR-head checkout under pull_request_target):
|
|
||||||
# We check out the BASE ref explicitly so the review-check.sh script is
|
|
||||||
# loaded from trusted source. We NEVER use `ref: ${{ github.event.pull_request.head.sha }}`.
|
|
||||||
# No PR-head code is executed in the runner. Trust boundary preserved.
|
|
||||||
#
|
|
||||||
# A5 (real Gitea team):
|
|
||||||
# `qa` team (id=20) verified by orchestrator preflight 2026-05-11; queried
|
|
||||||
# at run time via /api/v1/teams/20/members/{login}.
|
|
||||||
#
|
|
||||||
# === TOKEN ===
|
|
||||||
#
|
|
||||||
# The workflow reads PR state, PR reviews, and team membership.
|
|
||||||
# Gitea 1.22.6's /api/v1/teams/{id}/members/{u} returns 403 ('Must be a
|
|
||||||
# team member') for tokens whose owner is not in that team. The default
|
|
||||||
# `secrets.GITHUB_TOKEN` is owned by a workflow-scoped identity that is
|
|
||||||
# also not in qa/security teams → also 403.
|
|
||||||
#
|
|
||||||
# Resolution: a dedicated `RFC_324_TEAM_READ_TOKEN` secret, owned by an
|
|
||||||
# identity that IS in both `qa` and `security` teams (Owners-tier
|
|
||||||
# claude-ceo-assistant, or a new service-bot added to both teams).
|
|
||||||
# Provisioning of this secret is tracked as a follow-up issue (filed by
|
|
||||||
# core-devops at PR open).
|
|
||||||
#
|
|
||||||
# Until that secret is provisioned, the job will exit 1 with a clear
|
|
||||||
# 403-on-team-probe error and the `qa-review / approved` status will
|
|
||||||
# stay `failure`. This is the correct fail-closed behavior — the gate
|
|
||||||
# blocks merge until both (a) a QA team member APPROVEs and (b) the
|
|
||||||
# workflow has a token that can confirm their team membership.
|
|
||||||
#
|
|
||||||
# === SLASH-COMMAND CONTRACT ===
|
|
||||||
#
|
|
||||||
# /qa-recheck — re-evaluate the gate (e.g. after an APPROVE lands)
|
|
||||||
#
|
|
||||||
# Open to any PR commenter. The eval is read-only and idempotent, so
|
|
||||||
# unprivileged refires are harmless (RFC#324 v1.3 §A1.1). Collaborator
|
|
||||||
# status is logged for griefer-spotting but does NOT gate execution.
|
|
||||||
|
|
||||||
name: qa-review
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request_target:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
issue_comment:
|
|
||||||
types: [created]
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
pull-requests: read
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
approved:
|
|
||||||
# Gate the job:
|
|
||||||
# - On pull_request_target events: always run.
|
|
||||||
# - On issue_comment events: only when it's a PR comment and the body
|
|
||||||
# contains the slash-command. NO privilege gate at the step level
|
|
||||||
# (RFC#324 v1.3 §A1.1): a non-collaborator's /qa-recheck is fine
|
|
||||||
# because the eval is read-only and idempotent — re-running it
|
|
||||||
# just re-confirms whether a real team-member APPROVE exists.
|
|
||||||
if: |
|
|
||||||
github.event_name == 'pull_request_target' ||
|
|
||||||
(github.event_name == 'issue_comment' &&
|
|
||||||
github.event.issue.pull_request != null &&
|
|
||||||
startsWith(github.event.comment.body, '/qa-recheck'))
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- name: Privilege check (A1.1 — INFORMATIONAL log only, NOT a gate)
|
|
||||||
# RFC#324 v1.3 §A1.1: this step does NOT gate subsequent steps.
|
|
||||||
# It exists solely as a log line for griefer-spotting (one
|
|
||||||
# operator spamming /qa-recheck without merit). Re-running the
|
|
||||||
# read-only eval on a non-collaborator comment is harmless;
|
|
||||||
# gating it would be fail-open (skipped steps still publish
|
|
||||||
# `success` for the job's status context).
|
|
||||||
# Only runs on issue_comment events; pull_request_target has
|
|
||||||
# no comment.user.login so the step is a no-op skip there.
|
|
||||||
if: github.event_name == 'issue_comment'
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.RFC_324_TEAM_READ_TOKEN || secrets.GITHUB_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
login="${{ github.event.comment.user.login }}"
|
|
||||||
# Write token to a mode-600 file so it never appears in curl's argv.
|
|
||||||
# (#541: -H "Authorization: token $TOKEN" puts the secret in /proc/<pid>/cmdline)
|
|
||||||
authfile=$(mktemp)
|
|
||||||
chmod 600 "$authfile"
|
|
||||||
printf 'header = "Authorization: token %s"\n' "$GITEA_TOKEN" > "$authfile"
|
|
||||||
code=$(curl -sS -o /dev/null -w '%{http_code}' -K "$authfile" \
|
|
||||||
"${{ github.server_url }}/api/v1/repos/${{ github.repository }}/collaborators/${login}")
|
|
||||||
rm -f "$authfile"
|
|
||||||
if [ "$code" = "204" ]; then
|
|
||||||
echo "::notice::Recheck from ${login} (collaborator=true)"
|
|
||||||
else
|
|
||||||
echo "::notice::Recheck from ${login} (collaborator=false, HTTP ${code}) — proceeding with read-only eval anyway"
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Check out BASE ref (A4 — never PR-head)
|
|
||||||
# Loads the review-check.sh script from a trusted ref. For
|
|
||||||
# pull_request_target the default checkout is BASE already; we
|
|
||||||
# set ref explicitly for the issue_comment event too so the
|
|
||||||
# script source is always the default-branch version.
|
|
||||||
# NEVER use ref: ${{ github.event.pull_request.head.sha }} —
|
|
||||||
# that would execute PR-head code with secrets-context.
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
ref: ${{ github.event.repository.default_branch }}
|
|
||||||
|
|
||||||
- name: Evaluate qa-review
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.RFC_324_TEAM_READ_TOKEN || secrets.GITHUB_TOKEN }}
|
|
||||||
GITEA_HOST: git.moleculesai.app
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
# PR number lives in different places per event:
|
|
||||||
# pull_request_target → github.event.pull_request.number
|
|
||||||
# issue_comment → github.event.issue.number
|
|
||||||
PR_NUMBER: ${{ github.event.pull_request.number || github.event.issue.number }}
|
|
||||||
TEAM: qa
|
|
||||||
TEAM_ID: '20'
|
|
||||||
REVIEW_CHECK_DEBUG: '0'
|
|
||||||
REVIEW_CHECK_STRICT: '0'
|
|
||||||
run: bash .gitea/scripts/review-check.sh
|
|
||||||
@ -1,181 +0,0 @@
|
|||||||
name: Railway pin audit (drift detection)
|
|
||||||
|
|
||||||
# Ported from .github/workflows/railway-pin-audit.yml on 2026-05-11 per
|
|
||||||
# RFC internal#219 §1 sweep.
|
|
||||||
#
|
|
||||||
# Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch:` (Gitea 1.22.6 trigger handling).
|
|
||||||
# Manual runs go via cron-trigger bump or push the workflow file
|
|
||||||
# itself.
|
|
||||||
# - `actions/github-script@v9` blocks (which call github.rest.* — a
|
|
||||||
# GitHub-specific JS API) replaced with curl calls against the
|
|
||||||
# Gitea REST API (/api/v1/repos/.../issues, .../labels,
|
|
||||||
# .../comments). Same behaviour: open issue on drift, comment on
|
|
||||||
# repeat-drift, close on clean run.
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL set so the curl calls can
|
|
||||||
# derive `git.moleculesai.app` from the runner env (with
|
|
||||||
# hard-coded fallback inside the steps).
|
|
||||||
# - `continue-on-error: true` on the job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
# Daily audit of Railway env vars for drift-prone image-tag pins —
|
|
||||||
# automation-cadence layer over the detection script + regression test
|
|
||||||
# shipped in PR #2168 (#2001 closure).
|
|
||||||
#
|
|
||||||
# Background: on 2026-04-24 a stale `:staging-a14cf86` SHA pin in CP's
|
|
||||||
# TENANT_IMAGE caused 3+ hours of E2E failure with the appearance that
|
|
||||||
# "every fix didn't propagate" — really the tenant image was so old it
|
|
||||||
# didn't read the env vars those fixes produced.
|
|
||||||
#
|
|
||||||
# Cadence: once a day, 13:00 UTC (06:00 PT).
|
|
||||||
#
|
|
||||||
# Secret hardening: per feedback_schedule_vs_dispatch_secrets_hardening,
|
|
||||||
# the schedule trigger HARD-FAILS on missing RAILWAY_AUDIT_TOKEN.
|
|
||||||
|
|
||||||
on:
|
|
||||||
schedule:
|
|
||||||
- cron: '0 13 * * *'
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: railway-pin-audit
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
issues: write
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
audit:
|
|
||||||
name: Audit Railway env vars for drift-prone pins
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
timeout-minutes: 10
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Verify RAILWAY_AUDIT_TOKEN present
|
|
||||||
env:
|
|
||||||
RAILWAY_AUDIT_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
|
|
||||||
id: secret_check
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
if [ -n "${RAILWAY_AUDIT_TOKEN:-}" ]; then
|
|
||||||
echo "have_secret=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
echo "have_secret=false" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "::error::RAILWAY_AUDIT_TOKEN secret missing — schedule trigger requires it. Provision the token (read-only \`variables\` scope on the molecule-platform Railway project) and store as repo secret RAILWAY_AUDIT_TOKEN."
|
|
||||||
exit 1
|
|
||||||
|
|
||||||
- name: Install Railway CLI
|
|
||||||
if: steps.secret_check.outputs.have_secret == 'true'
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
curl -fsSL https://railway.com/install.sh | sh
|
|
||||||
echo "$HOME/.railway/bin" >> "$GITHUB_PATH"
|
|
||||||
|
|
||||||
- name: Verify Railway CLI authenticated
|
|
||||||
if: steps.secret_check.outputs.have_secret == 'true'
|
|
||||||
env:
|
|
||||||
RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
if ! railway whoami >/dev/null 2>&1; then
|
|
||||||
echo "::error::Railway CLI failed to authenticate with RAILWAY_AUDIT_TOKEN — token may be revoked or scoped incorrectly"
|
|
||||||
exit 2
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Link molecule-platform project
|
|
||||||
if: steps.secret_check.outputs.have_secret == 'true'
|
|
||||||
env:
|
|
||||||
RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
railway link --project 7ccc8c68-61f4-42ab-9be5-586eeee11768
|
|
||||||
|
|
||||||
- name: Run drift audit
|
|
||||||
if: steps.secret_check.outputs.have_secret == 'true'
|
|
||||||
id: audit
|
|
||||||
env:
|
|
||||||
RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set +e
|
|
||||||
bash scripts/ops/audit-railway-sha-pins.sh 2>&1 | tee /tmp/audit.log
|
|
||||||
rc=${PIPESTATUS[0]}
|
|
||||||
echo "rc=$rc" >> "$GITHUB_OUTPUT"
|
|
||||||
# Capture the audit log for the issue body.
|
|
||||||
{
|
|
||||||
echo 'log<<AUDIT_EOF'
|
|
||||||
cat /tmp/audit.log
|
|
||||||
echo 'AUDIT_EOF'
|
|
||||||
} >> "$GITHUB_OUTPUT"
|
|
||||||
case "$rc" in
|
|
||||||
0) exit 0 ;;
|
|
||||||
1) echo "::warning::Drift-prone pin(s) detected — issue will be filed"; exit 1 ;;
|
|
||||||
2) echo "::error::Railway CLI auth/link failed mid-script — token or project ID drift"; exit 2 ;;
|
|
||||||
*) echo "::error::Unexpected audit rc=$rc"; exit 1 ;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
- name: Open / update drift issue (Gitea API)
|
|
||||||
if: failure() && steps.audit.outputs.rc == '1'
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
AUDIT_LOG: ${{ steps.audit.outputs.log }}
|
|
||||||
SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
|
|
||||||
RUN_ID: ${{ github.run_id }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
API="${SERVER_URL%/}/api/v1"
|
|
||||||
TITLE="Railway env-var drift detected"
|
|
||||||
RUN_URL="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
|
|
||||||
BODY=$(jq -nc --arg t "$TITLE" --arg log "${AUDIT_LOG:-(log unavailable)}" --arg run "$RUN_URL" '
|
|
||||||
{body: ("Daily Railway pin audit found drift-prone image-tag pins in the molecule-platform Railway project.\n\n**What this means:** an env var (likely on `controlplane`) is pinned to a SHA-shaped or semver tag instead of a floating tag. Same pattern that caused the 2026-04-24 TENANT_IMAGE incident — fix-PRs land but the running service does not pick them up.\n\n**Recovery:** open the Railway dashboard, replace the flagged value with a floating tag (:staging-latest, :main) unless the pin is intentional and documented in the ops runbook.\n\n**Audit output:**\n\n```\n" + $log + "\n```\n\nRun: " + $run + "\n\nCloses automatically when a subsequent daily run reports clean.")}')
|
|
||||||
|
|
||||||
# Look for existing open drift issue with the title.
|
|
||||||
EXISTING=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
|
|
||||||
"${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
|
|
||||||
| jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number' | head -1)
|
|
||||||
|
|
||||||
if [ -n "$EXISTING" ]; then
|
|
||||||
COMMENT_BODY=$(jq -nc --arg log "${AUDIT_LOG:-(log unavailable)}" --arg run "$RUN_URL" \
|
|
||||||
'{body: ("Still drifting. " + $run + "\n\n```\n" + $log + "\n```")}')
|
|
||||||
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
|
|
||||||
"${API}/repos/${REPO}/issues/${EXISTING}/comments" -d "$COMMENT_BODY" >/dev/null
|
|
||||||
echo "Commented on existing issue #${EXISTING}"
|
|
||||||
else
|
|
||||||
CREATE_BODY=$(echo "$BODY" | jq --arg t "$TITLE" '. + {title: $t, labels: []}')
|
|
||||||
NUM=$(curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
|
|
||||||
"${API}/repos/${REPO}/issues" -d "$CREATE_BODY" | jq -r .number)
|
|
||||||
echo "Filed issue #${NUM}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Close stale drift issue on clean run (Gitea API)
|
|
||||||
if: success() && steps.audit.outputs.rc == '0'
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
|
|
||||||
RUN_ID: ${{ github.run_id }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
API="${SERVER_URL%/}/api/v1"
|
|
||||||
TITLE="Railway env-var drift detected"
|
|
||||||
RUN_URL="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
|
|
||||||
|
|
||||||
NUMS=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
|
|
||||||
"${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
|
|
||||||
| jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number')
|
|
||||||
|
|
||||||
for N in $NUMS; do
|
|
||||||
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
|
|
||||||
"${API}/repos/${REPO}/issues/${N}/comments" \
|
|
||||||
-d "$(jq -nc --arg run "$RUN_URL" '{body: ("Daily audit clean — drift resolved. " + $run)}')" >/dev/null
|
|
||||||
curl -fsS -X PATCH -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
|
|
||||||
"${API}/repos/${REPO}/issues/${N}" -d '{"state":"closed"}' >/dev/null
|
|
||||||
echo "Closed #${N}"
|
|
||||||
done
|
|
||||||
@ -1,375 +0,0 @@
|
|||||||
name: redeploy-tenants-on-main

# Ported from .github/workflows/redeploy-tenants-on-main.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
#   per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
#   feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
# - ~~**Gitea workflow_run trigger limitation**~~ FIXED: replaced with
#   push+paths filter per this PR. Gitea 1.22.6 does not support
#   `workflow_run` (task #81). The push trigger fires on every
#   commit to publish-workspace-server-image.yml which is the
#   same signal (only successful runs commit to main). The old
#   job-level `if: github.event.workflow_run.conclusion == 'success'`
#   guard has been removed: on a push event the workflow_run context
#   is empty, so that expression is always false and the job would
#   never run (the staging variant already removed it).
#

# Auto-refresh prod tenant EC2s after every main merge.
#
# Why this workflow exists: publish-workspace-server-image builds and
# pushes a new platform-tenant :<sha> to ECR on every merge to main,
# but running tenants pulled their image once at boot and never re-pull.
# Users see stale code indefinitely.
#
# This workflow closes the gap by calling the control-plane admin
# endpoint that performs a canary-first, batched, health-gated rolling
# redeploy across every live tenant. Implemented in molecule-ai/
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
# (feat/tenant-auto-redeploy, landing alongside this workflow).
#
# Registry: ECR (153263036946.dkr.ecr.us-east-2.amazonaws.com/
# molecule-ai/platform-tenant). GHCR was retired 2026-05-07 during the
# Gitea suspension migration. The staging-verify.yml promote step now
# uses the same redeploy-fleet endpoint (fixes the silent-GHCR gap).
#
# Runtime ordering:
#  1. publish-workspace-server-image completes → new :staging-<sha> in ECR.
#  2. This workflow fires via the push+paths trigger, calls redeploy-fleet
#     with target_tag=staging-<sha>. No CDN propagation wait needed —
#     ECR image manifest is consistent immediately after push.
#  3. Calls redeploy-fleet with canary_slug (if set) and a soak
#     period. Canary proves the image boots; batches follow.
#  4. Any failure aborts the rollout and leaves older tenants on the
#     prior image — safer default than half-and-half state.
#
# Rollback path: re-run this workflow with a specific SHA pinned via
# the workflow_dispatch input. That calls redeploy-fleet with
# target_tag=<sha>, re-pulling the older image on every tenant.

on:
  push:
    branches: [main]
    paths:
      - '.gitea/workflows/publish-workspace-server-image.yml'
  workflow_dispatch:
permissions:
  contents: read
  # No write scopes needed — the workflow hits an external CP endpoint,
  # not the GitHub API.

# Serialize redeploys so two rapid main pushes' redeploys don't overlap
# and cause confusing per-tenant SSM state. Without this, GitHub's
# implicit workflow_run queueing would *probably* serialize them, but
# the explicit block makes the invariant defensible. Mirrors the
# concurrency block on redeploy-tenants-on-staging.yml for shape parity.
#
# cancel-in-progress: false → aborting a half-rolled-out fleet would
# leave tenants stuck on whatever image they happened to be on when
# cancelled. Better to finish the in-flight rollout before starting
# the next one.
concurrency:
  group: redeploy-tenants-on-main
  cancel-in-progress: false

env:
  GITHUB_SERVER_URL: https://git.moleculesai.app

jobs:
  redeploy:
    # NOTE (Gitea port): the old job-level
    #   if: ${{ github.event.workflow_run.conclusion == 'success' }}
    # guard was removed along with the workflow_run trigger. Under the
    # push+paths trigger the workflow_run context does not exist, so the
    # expression evaluated false and silently disabled this job on every
    # run. A push to main implies publish-workspace-server-image
    # completed successfully (only successful runs commit to main), so
    # no replacement guard is needed — same reasoning as the staging
    # variant.
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    timeout-minutes: 25
    steps:
      - name: Note on ECR propagation
        # ECR image manifests are consistent immediately after push — no
        # CDN cache to wait for. The old GHCR-based workflow had a 30s
        # sleep to avoid race conditions; ECR makes that unnecessary.
        run: echo "ECR image available immediately after push — proceeding."

      - name: Compute target tag
        id: tag
        # Resolution order:
        #  1. Operator-supplied input (workflow_dispatch with explicit
        #     tag) → used verbatim. Lets ops pin `latest` for emergency
        #     rollback to last canary-verified digest, or pin a specific
        #     `staging-<sha>` to roll back to a known-good build.
        #  2. Default → `staging-<short_head_sha>`. The just-published
        #     digest. Bypasses the `:latest` retag path that's currently
        #     dead (staging-verify soft-skips without canary fleet, so
        #     the only thing retagging `:latest` today is the manual
        #     promote-latest.yml — last run 2026-04-28). Auto-trigger
        #     from workflow_run uses workflow_run.head_sha; manual
        #     dispatch with no input falls through to github.sha.
        env:
          INPUT_TAG: ${{ inputs.target_tag }}
          HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
        run: |
          set -euo pipefail
          if [ -n "${INPUT_TAG:-}" ]; then
            echo "target_tag=$INPUT_TAG" >> "$GITHUB_OUTPUT"
            echo "Using operator-pinned tag: $INPUT_TAG"
          else
            SHORT="${HEAD_SHA:0:7}"
            echo "target_tag=staging-$SHORT" >> "$GITHUB_OUTPUT"
            echo "Using auto tag: staging-$SHORT (head_sha=$HEAD_SHA)"
          fi

      - name: Call CP redeploy-fleet
        # CP_ADMIN_API_TOKEN must be set as a repo/org secret on
        # molecule-ai/molecule-core, matching the staging/prod CP's
        # CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this
        # repo's secrets for CI.
        env:
          CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }}
          CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
          TARGET_TAG: ${{ steps.tag.outputs.target_tag }}
          CANARY_SLUG: ${{ inputs.canary_slug || 'hongming' }}
          SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
          BATCH_SIZE: ${{ inputs.batch_size || '3' }}
          DRY_RUN: ${{ inputs.dry_run || false }}
        run: |
          set -euo pipefail

          if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
            echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy"
            echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
            exit 1
          fi

          BODY=$(jq -nc \
            --arg tag "$TARGET_TAG" \
            --arg canary "$CANARY_SLUG" \
            --argjson soak "$SOAK_SECONDS" \
            --argjson batch "$BATCH_SIZE" \
            --argjson dry "$DRY_RUN" \
            '{
              target_tag: $tag,
              canary_slug: $canary,
              soak_seconds: $soak,
              batch_size: $batch,
              dry_run: $dry
            }')

          echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
          echo "  body: $BODY"

          HTTP_RESPONSE=$(mktemp)
          HTTP_CODE_FILE=$(mktemp)
          # Route -w into its own tempfile so curl's exit code (e.g. 56
          # on connection-reset, 22 on --fail-with-body 4xx/5xx) can't
          # pollute the captured stdout. The previous inline-substitution
          # shape produced "000000" on connection reset (curl wrote
          # "000" via -w, then the inline echo-fallback appended another
          # "000") — caught on the 2026-05-04 redeploy of sha 2b862f6.
          # set +e/-e keeps the non-zero curl exit from tripping the
          # outer pipeline. See lint-curl-status-capture.yml for the
          # CI gate that pins this fix shape.
          set +e
          curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
            -m 1200 \
            -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
            -H "Content-Type: application/json" \
            -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
            -d "$BODY" >"$HTTP_CODE_FILE"
          set -e
          # Stderr from curl (e.g. dial errors with -sS) goes to the runner
          # log so operators can see WHY a connection failed. Stdout is
          # captured to $HTTP_CODE_FILE because that's where -w writes.
          HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
          [ -z "$HTTP_CODE" ] && HTTP_CODE="000"

          echo "HTTP $HTTP_CODE"
          # Pretty-print when the body is JSON; fall back to raw output
          # when it isn't (e.g. a proxy error page).
          jq . "$HTTP_RESPONSE" || cat "$HTTP_RESPONSE"

          # Pretty-print per-tenant results in the job summary so
          # ops can see which tenants were redeployed without drilling
          # into the raw response.
          {
            echo "## Tenant redeploy fleet"
            echo ""
            echo "**Target tag:** \`$TARGET_TAG\`"
            echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)"
            echo "**Batch size:** $BATCH_SIZE"
            echo "**Dry run:** $DRY_RUN"
            echo "**HTTP:** $HTTP_CODE"
            echo ""
            echo "### Per-tenant result"
            echo ""
            echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
            echo '|------|-------|------------|------|---------|-------|'
            jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
          } >> "$GITHUB_STEP_SUMMARY"

          if [ "$HTTP_CODE" != "200" ]; then
            echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
            exit 1
          fi
          OK=$(jq -r '.ok' "$HTTP_RESPONSE")
          if [ "$OK" != "true" ]; then
            echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
            exit 1
          fi
          echo "::notice::Tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."

          # Stash the response for the verify step. $RUNNER_TEMP outlasts
          # the step boundary; $HTTP_RESPONSE doesn't.
          cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"

      - name: Verify each tenant /buildinfo matches published SHA
        # ROOT FIX FOR #2395.
        #
        # `redeploy-fleet`'s `ssm_status=Success` means "the SSM RPC
        # didn't error" — NOT "the new image is running on the tenant."
        # `:latest` lives in the local Docker daemon's image cache; if
        # the SSM document does `docker compose up -d` without an
        # explicit `docker pull`, the daemon serves the previously-
        # cached digest and the container restarts on stale code.
        # 2026-04-30 incident: hongmingwang's tenant reported
        # ssm_status=Success at 17:00:53Z but kept serving pre-501a42d7
        # chat_files for 30+ min — the lazy-heal fix never reached the
        # user despite green deploy + green redeploy.
        #
        # This step closes the gap by curling each tenant's /buildinfo
        # endpoint (added in workspace-server/internal/buildinfo +
        # /Dockerfile* GIT_SHA build-arg, this PR) and comparing the
        # returned git_sha to the SHA the workflow expects. Mismatches
        # fail the workflow, which is what `ok=true` should have
        # guaranteed all along.
        #
        # When the redeploy was triggered by workflow_dispatch with a
        # specific tag (target_tag != "latest"), the expected SHA may
        # not equal ${{ github.sha }} — in that case we resolve via
        # GHCR's manifest. For workflow_run (default :latest) the
        # workflow_run.head_sha is the SHA that just published.
        env:
          EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
          TARGET_TAG: ${{ steps.tag.outputs.target_tag }}
          # Tenant subdomain template — slugs from the response are
          # appended. Production CP issues `<slug>.moleculesai.app`;
          # staging CP issues `<slug>.staging.moleculesai.app`. This
          # workflow runs on main → prod CP → no `staging.` infix.
          TENANT_DOMAIN: 'moleculesai.app'
        run: |
          set -euo pipefail

          EXPECTED_SHORT="${EXPECTED_SHA:0:7}"
          if [ "$TARGET_TAG" != "latest" ] \
            && [ "$TARGET_TAG" != "$EXPECTED_SHA" ] \
            && [ "$TARGET_TAG" != "staging-$EXPECTED_SHORT" ]; then
            # workflow_dispatch with a pinned tag that isn't the head
            # SHA — operator is rolling back / pinning. Skip the
            # verification because we don't have the expected SHA in
            # this context (would need to crane-inspect the GHCR
            # manifest, which is a follow-up). Failing-open here is
            # safe: the operator chose the tag deliberately.
            #
            # `staging-<short_head_sha>` IS verified — it's the new
            # auto-trigger default (see Compute target tag step) and
            # the digest under that tag SHOULD match EXPECTED_SHA.
            echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
            exit 0
          fi

          RESP="$RUNNER_TEMP/redeploy-response.json"
          if [ ! -s "$RESP" ]; then
            echo "::error::redeploy-response.json missing or empty — verify step ran without a response to read"
            exit 1
          fi

          # Pull only successfully-redeployed tenants. Any tenant that
          # halted the rollout already failed the previous step, so we
          # don't double-count them here.
          mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
          if [ ${#SLUGS[@]} -eq 0 ]; then
            echo "::warning::No tenants reported healthz_ok — nothing to verify"
            exit 0
          fi

          echo "Verifying ${#SLUGS[@]} tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."

          # Two distinct failure modes — STALE (the #2395 bug class, hard-fail)
          # vs UNREACHABLE (teardown race, soft-warn). See the staging variant's
          # comment for the full rationale; same logic applies on prod even
          # though prod has fewer ephemeral tenants — the asymmetry would be a
          # gratuitous fork.
          STALE_COUNT=0
          UNREACHABLE_COUNT=0
          STALE_LINES=()
          UNREACHABLE_LINES=()
          for slug in "${SLUGS[@]}"; do
            URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
            # 30s total: tenant just SSM-restarted, may still be coming
            # up. Retry-on-empty rather than retry-on-status — we want
            # to fail fast on "responded with wrong SHA", not "still
            # warming up".
            BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
            ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
            if [ -z "$ACTUAL_SHA" ]; then
              UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
              UNREACHABLE_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ⚠ unreachable (likely teardown race) |")
              continue
            fi
            if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
              echo "  $slug: ${ACTUAL_SHA:0:7} ✓"
            else
              STALE_COUNT=$((STALE_COUNT + 1))
              STALE_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
            fi
          done

          {
            echo ""
            echo "### Per-tenant /buildinfo verification"
            echo ""
            echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
            echo ""
            if [ $STALE_COUNT -gt 0 ]; then
              echo "**${STALE_COUNT} STALE tenant(s) — these did NOT pick up the new image despite ssm_status=Success:**"
              echo ""
              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
              echo "|------|----------------------|----------|--------|"
              for line in "${STALE_LINES[@]}"; do echo "$line"; done
              echo ""
            fi
            if [ $UNREACHABLE_COUNT -gt 0 ]; then
              echo "**${UNREACHABLE_COUNT} unreachable tenant(s) — likely teardown race (soft-warn, not failing):**"
              echo ""
              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
              echo "|------|----------------------|----------|--------|"
              for line in "${UNREACHABLE_LINES[@]}"; do echo "$line"; done
              echo ""
            fi
            if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
              echo "All ${#SLUGS[@]} tenants returned matching SHA. ✓"
            fi
          } >> "$GITHUB_STEP_SUMMARY"

          if [ $UNREACHABLE_COUNT -gt 0 ]; then
            echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
          fi

          # Belt-and-suspenders sanity floor: same logic as the staging
          # variant — see that file's comment for the full rationale.
          # Floor only applies when fleet >= 4; below that, staging-verify
          # is the actual gate.
          TOTAL_VERIFIED=${#SLUGS[@]}
          if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
            echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
            exit 1
          fi

          if [ $STALE_COUNT -gt 0 ]; then
            echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
            exit 1
          fi

          echo "::notice::Tenant fleet redeploy complete — all reachable tenants on ${EXPECTED_SHA:0:7} (${UNREACHABLE_COUNT} unreachable, soft-warned)."
@ -1,352 +0,0 @@
|
|||||||
name: redeploy-tenants-on-staging
|
|
||||||
|
|
||||||
# Ported from .github/workflows/redeploy-tenants-on-staging.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
# - ~~**Gitea workflow_run trigger limitation**~~ FIXED: replaced with
|
|
||||||
# push+paths filter per this PR. Gitea 1.22.6 does not support
|
|
||||||
# `workflow_run` (task #81). The push trigger fires on every
|
|
||||||
# commit to publish-workspace-server-image.yml which is the
|
|
||||||
#   same signal (only successful runs commit to main). Removed the
#   `workflow_run.conclusion==success` job-level `if`, since a push
#   implies the workflow completed and committed.
|
|
||||||
#
|
|
||||||
|
|
||||||
# Auto-refresh staging tenant EC2s after every staging-branch merge.
|
|
||||||
#
|
|
||||||
# Mirror of redeploy-tenants-on-main.yml, with the staging-CP host and
|
|
||||||
# the :staging-latest tag. Sister workflow exists for prod (rolls
|
|
||||||
# :latest after staging-verify). Both share the same shape — just
|
|
||||||
# different CP_URL + target_tag + admin token secret.
|
|
||||||
#
|
|
||||||
# Why this workflow exists: publish-workspace-server-image now builds
|
|
||||||
# on every staging-branch push (PR #2335), pushing
|
|
||||||
# platform-tenant:staging-latest to GHCR. Existing tenants pulled
|
|
||||||
# their image once at boot and never re-pull, so the new image just
|
|
||||||
# sits unused until the tenant is reprovisioned.
|
|
||||||
#
|
|
||||||
# This workflow closes the gap by calling staging-CP's
|
|
||||||
# /cp/admin/tenants/redeploy-fleet, which performs a canary-first,
|
|
||||||
# batched, health-gated SSM redeploy across every live staging tenant.
|
|
||||||
# Same endpoint shape as prod CP — only the host differs.
|
|
||||||
#
|
|
||||||
# Runtime ordering:
|
|
||||||
# 1. publish-workspace-server-image completes on staging branch →
|
|
||||||
# new :staging-latest in GHCR.
|
|
||||||
# 2. This workflow fires via workflow_run, waits 30s for GHCR's CDN
|
|
||||||
# to propagate the new tag.
|
|
||||||
# 3. Calls redeploy-fleet with no canary (staging IS canary; we don't
|
|
||||||
# need a sub-canary inside it). Soak still applies to the first
|
|
||||||
# tenant in case of bad-deploy detection.
|
|
||||||
# 4. Any failure aborts the rollout and leaves older tenants on the
|
|
||||||
# prior image — safer default than half-and-half state.
|
|
||||||
#
|
|
||||||
# Rollback path: re-run with workflow_dispatch + target_tag=staging-<sha>
|
|
||||||
# of a known-good build.
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [staging]
|
|
||||||
paths:
|
|
||||||
- '.gitea/workflows/publish-workspace-server-image.yml'
|
|
||||||
workflow_dispatch:
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
# No write scopes needed — the workflow hits an external CP endpoint,
|
|
||||||
# not the GitHub API.
|
|
||||||
|
|
||||||
# Serialize per-branch so two rapid staging pushes' redeploys don't
|
|
||||||
# overlap and cause confusing per-tenant SSM state. cancel-in-progress
|
|
||||||
# is false because aborting a half-rolled-out fleet leaves tenants
|
|
||||||
# stuck on whatever image they happened to be on when cancelled.
|
|
||||||
concurrency:
|
|
||||||
group: redeploy-tenants-on-staging
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
redeploy:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
timeout-minutes: 25
|
|
||||||
steps:
|
|
||||||
- name: Wait for GHCR tag propagation
|
|
||||||
# GHCR's edge cache takes ~15-30s to consistently serve the new
|
|
||||||
# :staging-latest manifest after the registry accepts the push.
|
|
||||||
# Same rationale as redeploy-tenants-on-main.yml.
|
|
||||||
run: sleep 30
|
|
||||||
|
|
||||||
- name: Call staging-CP redeploy-fleet
|
|
||||||
# CP_STAGING_ADMIN_API_TOKEN must be set as a repo/org secret
|
|
||||||
# on molecule-ai/molecule-core, matching staging-CP's
|
|
||||||
# CP_ADMIN_API_TOKEN env var (visible in Railway controlplane
|
|
||||||
# / staging environment). Stored separately from the prod
|
|
||||||
# CP_ADMIN_API_TOKEN so a leak of one doesn't auth the other.
|
|
||||||
env:
|
|
||||||
CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
|
|
||||||
CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
TARGET_TAG: ${{ inputs.target_tag || 'staging-latest' }}
|
|
||||||
CANARY_SLUG: ${{ inputs.canary_slug || '' }}
|
|
||||||
SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
|
|
||||||
BATCH_SIZE: ${{ inputs.batch_size || '3' }}
|
|
||||||
DRY_RUN: ${{ inputs.dry_run || false }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Schedule-vs-dispatch hardening (mirrors sweep-cf-orphans
|
|
||||||
# and sweep-cf-tunnels): hard-fail on auto-trigger when the
|
|
||||||
# secret is missing so a misconfigured-repo doesn't silently
|
|
||||||
# serve stale staging tenants. Soft-skip on operator dispatch.
|
|
||||||
if [ -z "${CP_STAGING_ADMIN_API_TOKEN:-}" ]; then
|
|
||||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
|
||||||
echo "::warning::CP_STAGING_ADMIN_API_TOKEN secret not set — skipping redeploy"
|
|
||||||
echo "::warning::Set CP_STAGING_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
|
|
||||||
echo "::notice::Pull the value from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
echo "::error::staging redeploy cannot run — CP_STAGING_ADMIN_API_TOKEN secret missing"
|
|
||||||
echo "::error::set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
BODY=$(jq -nc \
|
|
||||||
--arg tag "$TARGET_TAG" \
|
|
||||||
--arg canary "$CANARY_SLUG" \
|
|
||||||
--argjson soak "$SOAK_SECONDS" \
|
|
||||||
--argjson batch "$BATCH_SIZE" \
|
|
||||||
--argjson dry "$DRY_RUN" \
|
|
||||||
'{
|
|
||||||
target_tag: $tag,
|
|
||||||
canary_slug: $canary,
|
|
||||||
soak_seconds: $soak,
|
|
||||||
batch_size: $batch,
|
|
||||||
dry_run: $dry
|
|
||||||
}')
|
|
||||||
|
|
||||||
echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
|
|
||||||
echo " body: $BODY"
|
|
||||||
|
|
||||||
HTTP_RESPONSE=$(mktemp)
|
|
||||||
HTTP_CODE_FILE=$(mktemp)
|
|
||||||
# Route -w into its own tempfile so curl's exit code (e.g. 56
|
|
||||||
# on connection-reset) can't pollute the captured stdout. The
|
|
||||||
# previous inline-substitution shape produced "000000" on
|
|
||||||
# connection reset — caught on main variant 2026-05-04
|
|
||||||
# redeploying sha 2b862f6. Same fix shape as the synth-E2E
|
|
||||||
# §9c gate (PR #2797). See lint-curl-status-capture.yml for
|
|
||||||
# the CI gate that pins this fix shape.
|
|
||||||
set +e
|
|
||||||
curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
|
|
||||||
-m 1200 \
|
|
||||||
-H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
|
|
||||||
-d "$BODY" >"$HTTP_CODE_FILE"
|
|
||||||
set -e
|
|
||||||
# Stderr from curl (-sS shows dial errors etc.) goes to the
|
|
||||||
# runner log so operators can see WHY a connection failed.
|
|
||||||
HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
|
|
||||||
[ -z "$HTTP_CODE" ] && HTTP_CODE="000"
|
|
||||||
|
|
||||||
echo "HTTP $HTTP_CODE"
|
|
||||||
cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
|
|
||||||
|
|
||||||
{
|
|
||||||
echo "## Staging tenant redeploy fleet"
|
|
||||||
echo ""
|
|
||||||
echo "**Target tag:** \`$TARGET_TAG\`"
|
|
||||||
echo "**Canary:** \`${CANARY_SLUG:-(none — staging is itself the canary)}\` (soak ${SOAK_SECONDS}s)"
|
|
||||||
echo "**Batch size:** $BATCH_SIZE"
|
|
||||||
echo "**Dry run:** $DRY_RUN"
|
|
||||||
echo "**HTTP:** $HTTP_CODE"
|
|
||||||
echo ""
|
|
||||||
echo "### Per-tenant result"
|
|
||||||
echo ""
|
|
||||||
echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
|
|
||||||
echo '|------|-------|------------|------|---------|-------|'
|
|
||||||
jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
|
|
||||||
} >> "$GITHUB_STEP_SUMMARY"
|
|
||||||
|
|
||||||
# Distinguish "real fleet failure" from "E2E teardown race".
|
|
||||||
#
|
|
||||||
# CP returns HTTP 500 + ok=false whenever ANY tenant in the
|
|
||||||
# fleet failed SSM or healthz. In practice the recurring source
|
|
||||||
# of these is ephemeral test tenants being torn down by their
|
|
||||||
# parent E2E run mid-redeploy: the EC2 dies → SSM exit=2 or
|
|
||||||
# healthz timeout → CP marks the fleet failed → this workflow
|
|
||||||
# goes red even though every operator-facing tenant rolled fine.
|
|
||||||
#
|
|
||||||
# Ephemeral slug prefixes (kept in sync with sweep-stale-e2e-orgs.yml
|
|
||||||
# — see that file for the source-of-truth list and rationale):
|
|
||||||
# - e2e-* — canvas/saas/ext E2E suites
|
|
||||||
# - rt-e2e-* — runtime-test harness fixtures (RFC #2251)
|
|
||||||
# Long-lived prefixes that are NOT ephemeral and MUST hard-fail:
|
|
||||||
# demo-prep, dryrun-*, dryrun2-*, plus all human tenant slugs.
|
|
||||||
#
|
|
||||||
# Filter: if HTTP=500/ok=false AND every failed slug matches an
|
|
||||||
# ephemeral prefix, treat as soft-warn and let the verify step
|
|
||||||
# downstream handle unreachable-vs-stale (#2402). Any non-ephemeral
|
|
||||||
# failure or a non-500 HTTP response remains a hard failure.
|
|
||||||
OK=$(jq -r '.ok // "false"' "$HTTP_RESPONSE")
|
|
||||||
FAILED_SLUGS=$(jq -r '
|
|
||||||
.results[]?
|
|
||||||
| select((.healthz_ok != true) or (.ssm_status != "Success"))
|
|
||||||
| .slug' "$HTTP_RESPONSE" 2>/dev/null || true)
|
|
||||||
EPHEMERAL_PREFIX_RE='^(e2e-|rt-e2e-)'
|
|
||||||
NON_EPHEMERAL_FAILED=$(printf '%s\n' "$FAILED_SLUGS" | grep -v '^$' | grep -Ev "$EPHEMERAL_PREFIX_RE" || true)
|
|
||||||
|
|
||||||
if [ "$HTTP_CODE" = "200" ] && [ "$OK" = "true" ]; then
|
|
||||||
: # happy path — fall through to verification
|
|
||||||
elif [ "$HTTP_CODE" = "500" ] && [ -z "$NON_EPHEMERAL_FAILED" ] && [ -n "$FAILED_SLUGS" ]; then
|
|
||||||
COUNT=$(printf '%s\n' "$FAILED_SLUGS" | grep -Ec "$EPHEMERAL_PREFIX_RE" || true)
|
|
||||||
echo "::warning::redeploy-fleet returned HTTP 500 but every failed tenant ($COUNT) is ephemeral (e2e-*/rt-e2e-*) — treating as teardown race, soft-warning."
|
|
||||||
printf '%s\n' "$FAILED_SLUGS" | sed 's/^/::warning:: failed: /'
|
|
||||||
elif [ "$HTTP_CODE" != "200" ]; then
|
|
||||||
echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
|
|
||||||
if [ -n "$NON_EPHEMERAL_FAILED" ]; then
|
|
||||||
echo "::error::non-ephemeral tenant(s) failed:"
|
|
||||||
printf '%s\n' "$NON_EPHEMERAL_FAILED" | sed 's/^/::error:: /'
|
|
||||||
fi
|
|
||||||
exit 1
|
|
||||||
else
|
|
||||||
# HTTP=200 but ok=false (shouldn't happen with current CP
|
|
||||||
# but keep the gate for completeness).
|
|
||||||
echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "::notice::Staging tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."
|
|
||||||
|
|
||||||
cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"
|
|
||||||
|
|
||||||
- name: Verify each staging tenant /buildinfo matches published SHA
|
|
||||||
# Mirror of the verify step in redeploy-tenants-on-main.yml — see
|
|
||||||
# there for the rationale (#2395 root fix). Staging has the same
|
|
||||||
# ssm_status-success-but-stale-image hazard and benefits from the
|
|
||||||
# same gate. Diff: TENANT_DOMAIN includes the `staging.` infix.
|
|
||||||
env:
|
|
||||||
EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
|
|
||||||
TARGET_TAG: ${{ inputs.target_tag || 'staging-latest' }}
|
|
||||||
TENANT_DOMAIN: 'staging.moleculesai.app'
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# staging-latest is the staging-side moving tag; treat it the
|
|
||||||
# same way main treats `latest`. Operator-pinned SHAs skip
|
|
||||||
# verification (see main variant for why).
|
|
||||||
if [ "$TARGET_TAG" != "staging-latest" ] && [ "$TARGET_TAG" != "latest" ] && [ "$TARGET_TAG" != "$EXPECTED_SHA" ]; then
|
|
||||||
echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
RESP="$RUNNER_TEMP/redeploy-response.json"
|
|
||||||
if [ ! -s "$RESP" ]; then
|
|
||||||
echo "::error::redeploy-response.json missing or empty"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
|
|
||||||
if [ ${#SLUGS[@]} -eq 0 ]; then
|
|
||||||
echo "::warning::No staging tenants reported healthz_ok — nothing to verify"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Verifying ${#SLUGS[@]} staging tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."
|
|
||||||
|
|
||||||
# Two distinct failure modes here:
|
|
||||||
# STALE_COUNT — tenant returned a SHA that doesn't match. THIS is
|
|
||||||
# the #2395 bug class: tenant up + serving old code.
|
|
||||||
# Always hard-fail the workflow.
|
|
||||||
# UNREACHABLE_COUNT — tenant didn't respond. Almost always a benign
|
|
||||||
# teardown race: redeploy-fleet snapshot says
|
|
||||||
# healthz_ok=true, then the E2E suite tears the
|
|
||||||
# ephemeral tenant down before this step runs (the
|
|
||||||
# e2e-* fixtures churn 5-10/hour on staging). Soft-
|
|
||||||
# warn so we don't block staging→main on cleanup.
|
|
||||||
# Real "tenant up but unreachable" is caught by CP's
|
|
||||||
# own healthz monitor + the post-redeploy alert; we
|
|
||||||
# don't need to double-count it here.
|
|
||||||
STALE_COUNT=0
|
|
||||||
UNREACHABLE_COUNT=0
|
|
||||||
STALE_LINES=()
|
|
||||||
UNREACHABLE_LINES=()
|
|
||||||
for slug in "${SLUGS[@]}"; do
|
|
||||||
URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
|
|
||||||
BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
|
|
||||||
ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
|
|
||||||
if [ -z "$ACTUAL_SHA" ]; then
|
|
||||||
UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
|
|
||||||
UNREACHABLE_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ⚠ unreachable (likely teardown race) |")
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
|
|
||||||
echo " $slug: ${ACTUAL_SHA:0:7} ✓"
|
|
||||||
else
|
|
||||||
STALE_COUNT=$((STALE_COUNT + 1))
|
|
||||||
STALE_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
{
|
|
||||||
echo ""
|
|
||||||
echo "### Per-tenant /buildinfo verification (staging)"
|
|
||||||
echo ""
|
|
||||||
echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
|
|
||||||
echo ""
|
|
||||||
if [ $STALE_COUNT -gt 0 ]; then
|
|
||||||
echo "**${STALE_COUNT} STALE tenant(s) — these did NOT pick up the new image despite ssm_status=Success:**"
|
|
||||||
echo ""
|
|
||||||
echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
|
|
||||||
echo "|------|----------------------|----------|--------|"
|
|
||||||
for line in "${STALE_LINES[@]}"; do echo "$line"; done
|
|
||||||
echo ""
|
|
||||||
fi
|
|
||||||
if [ $UNREACHABLE_COUNT -gt 0 ]; then
|
|
||||||
echo "**${UNREACHABLE_COUNT} unreachable tenant(s) — likely E2E teardown race (soft-warn, not failing):**"
|
|
||||||
echo ""
|
|
||||||
echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
|
|
||||||
echo "|------|----------------------|----------|--------|"
|
|
||||||
for line in "${UNREACHABLE_LINES[@]}"; do echo "$line"; done
|
|
||||||
echo ""
|
|
||||||
fi
|
|
||||||
if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
|
|
||||||
echo "All ${#SLUGS[@]} staging tenants returned matching SHA. ✓"
|
|
||||||
fi
|
|
||||||
} >> "$GITHUB_STEP_SUMMARY"
|
|
||||||
|
|
||||||
if [ $UNREACHABLE_COUNT -gt 0 ]; then
|
|
||||||
echo "::warning::$UNREACHABLE_COUNT staging tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Belt-and-suspenders sanity floor: if MORE than half the fleet is
|
|
||||||
# unreachable AND the fleet is large enough that "half down" is
|
|
||||||
# statistically meaningful, this is a real outage (e.g. new image
|
|
||||||
# crashes on startup), not a teardown race. Hard-fail.
|
|
||||||
#
|
|
||||||
# Floor only applies when TOTAL_VERIFIED >= 4 — below that, the
|
|
||||||
# staging-verify step is the actual gate for "all tenants down"
|
|
||||||
# detection (it runs against the canary first and aborts the
|
|
||||||
# rollout if the canary fails to come up). Without the >=4 gate,
|
|
||||||
# a 1-tenant fleet (e.g. a single ephemeral e2e-* tenant on a
|
|
||||||
# quiet staging push) would re-flake on the exact teardown-race
|
|
||||||
# condition #2402 fixed: 1 of 1 unreachable = 100% > 50% → fail.
|
|
||||||
TOTAL_VERIFIED=${#SLUGS[@]}
|
|
||||||
if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
|
|
||||||
echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $STALE_COUNT -gt 0 ]; then
|
|
||||||
echo "::error::$STALE_COUNT staging tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "::notice::Staging tenant fleet redeploy complete — all reachable tenants on ${EXPECTED_SHA:0:7} (${UNREACHABLE_COUNT} unreachable, soft-warned)."
|
|
||||||
@ -1,70 +0,0 @@
|
|||||||
name: review-check-tests
|
|
||||||
|
|
||||||
# Runs review-check.sh regression tests on every PR + push that touches
|
|
||||||
# the evaluator script or its test fixtures.
|
|
||||||
#
|
|
||||||
# Follows RFC#324 follow-up (issue #540):
|
|
||||||
# .gitea/scripts/review-check.sh is load-bearing for PR merge gates.
|
|
||||||
# It has ZERO production CI coverage. This workflow closes that gap.
|
|
||||||
#
|
|
||||||
# Design choices:
|
|
||||||
# - Bash test harness (not bats). The existing test_review_check.sh
|
|
||||||
# uses a custom assert_eq/assert_contains framework that is already
|
|
||||||
# working and covers all 13 acceptance criteria (issue #540 §Acceptance).
|
|
||||||
# Converting to bats would be refactoring, not closing the gap.
|
|
||||||
# - No bats dependency: the runner-base image needs no extra tooling.
|
|
||||||
# - continue-on-error: false — these tests must pass; a failure means
|
|
||||||
# the review-gate evaluator is broken and must not be merged.
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
paths:
|
|
||||||
- '.gitea/scripts/review-check.sh'
|
|
||||||
- '.gitea/scripts/tests/test_review_check.sh'
|
|
||||||
- '.gitea/scripts/tests/_review_check_fixture.py'
|
|
||||||
- '.gitea/workflows/review-check-tests.yml'
|
|
||||||
pull_request:
|
|
||||||
branches: [main, staging]
|
|
||||||
paths:
|
|
||||||
- '.gitea/scripts/review-check.sh'
|
|
||||||
- '.gitea/scripts/tests/test_review_check.sh'
|
|
||||||
- '.gitea/scripts/tests/_review_check_fixture.py'
|
|
||||||
- '.gitea/workflows/review-check-tests.yml'
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: ${{ github.workflow }}-${{ github.ref }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
test:
|
|
||||||
name: review-check.sh regression tests
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Install jq
|
|
||||||
# Required for T12 jq-filter test case. Gitea Actions runners (ubuntu-latest
|
|
||||||
# label) do not bundle jq. Install via apt-get first (reliable for Ubuntu
|
|
||||||
# runners with internet access to package mirrors). Falls back to GitHub
|
|
||||||
# binary download. GitHub releases may be blocked on some runner networks
|
|
||||||
# (infra#241 follow-up).
|
|
||||||
continue-on-error: true
|
|
||||||
run: |
|
|
||||||
if apt-get update -qq && apt-get install -y -qq jq; then
|
|
||||||
echo "::notice::jq installed via apt-get: $(jq --version)"
|
|
||||||
elif timeout 120 curl -sSL \
|
|
||||||
"https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-amd64" \
|
|
||||||
-o /usr/local/bin/jq && chmod +x /usr/local/bin/jq; then
|
|
||||||
echo "::notice::jq binary downloaded: $(/usr/local/bin/jq --version)"
|
|
||||||
else
|
|
||||||
echo "::warning::jq install failed — apt-get and GitHub download both failed."
|
|
||||||
fi
|
|
||||||
jq --version 2>/dev/null || echo "::notice::jq not yet available — continuing"
|
|
||||||
|
|
||||||
- name: Run review-check.sh regression suite
|
|
||||||
run: bash .gitea/scripts/tests/test_review_check.sh
|
|
||||||
@ -1,100 +0,0 @@
|
|||||||
name: Runtime Pin Compatibility
|
|
||||||
|
|
||||||
# Ported from .github/workflows/runtime-pin-compat.yml on 2026-05-11 per
|
|
||||||
# RFC internal#219 §1 sweep.
|
|
||||||
#
|
|
||||||
# Differences from the GitHub version:
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue) and
|
|
||||||
# `workflow_dispatch:` (no inputs, but the trigger itself is
|
|
||||||
# parser-rejected when inputs are absent in some Gitea 1.22.x
|
|
||||||
# builds; safest to drop entirely — manual runs go via cron-trigger
|
|
||||||
# bump or push-with-paths-filter).
|
|
||||||
# - on.paths references .gitea/workflows/runtime-pin-compat.yml (this
|
|
||||||
# file) instead of the .github/ one.
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL set.
|
|
||||||
# - `continue-on-error: true` on the job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
# CI gate that prevents the 5-hour staging outage from 2026-04-24 from
|
|
||||||
# recurring (controlplane#253). The original failure mode:
|
|
||||||
# 1. molecule-ai-workspace-runtime 0.1.13 declared `a2a-sdk<1.0` in its
|
|
||||||
# requires_dist metadata (incorrect — it actually imports
|
|
||||||
# a2a.server.routes which only exists in a2a-sdk 1.0+)
|
|
||||||
# 2. `pip install molecule-ai-workspace-runtime` resolved cleanly
|
|
||||||
# 3. `from molecule_runtime.main import main_sync` raised ImportError
|
|
||||||
# 4. Every tenant workspace crashed; the canary tenant caught it but
|
|
||||||
# only after 5 hours of degraded staging
|
|
||||||
#
|
|
||||||
# This workflow installs the CURRENTLY PUBLISHED runtime from PyPI on
|
|
||||||
# top of `workspace/requirements.txt` and smoke-imports. Catches:
|
|
||||||
# - Upstream PyPI yanks
|
|
||||||
# - Bad re-releases of molecule-ai-workspace-runtime
|
|
||||||
# - Already-shipped wheels that stop importing because a transitive
|
|
||||||
# dep moved underneath
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
paths:
|
|
||||||
# Narrow filter: pypi-latest is sensitive only to changes that
|
|
||||||
# affect what we're INSTALLING (requirements.txt) or WHAT THE
|
|
||||||
# CHECK ITSELF DOES (this workflow file). Edits to workspace/
|
|
||||||
# source code don't change what's on PyPI right now, so they
|
|
||||||
# don't change this gate's verdict.
|
|
||||||
- 'workspace/requirements.txt'
|
|
||||||
- '.gitea/workflows/runtime-pin-compat.yml'
|
|
||||||
pull_request:
|
|
||||||
branches: [main, staging]
|
|
||||||
paths:
|
|
||||||
- 'workspace/requirements.txt'
|
|
||||||
- '.gitea/workflows/runtime-pin-compat.yml'
|
|
||||||
# Daily catch for upstream PyPI publishes that break the pin combo
|
|
||||||
# without any change in our repo (e.g. someone re-yanks an a2a-sdk
|
|
||||||
# release or molecule-ai-workspace-runtime publishes a bad bump).
|
|
||||||
schedule:
|
|
||||||
- cron: '0 13 * * *' # 06:00 PT
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: ${{ github.workflow }}-${{ github.ref }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
pypi-latest-install:
|
|
||||||
name: PyPI-latest install + import smoke
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
|
|
||||||
# the PR. Follow-up PR flips this off after surfaced defects are
|
|
||||||
# triaged.
|
|
||||||
continue-on-error: true
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
|
||||||
with:
|
|
||||||
python-version: '3.11'
|
|
||||||
cache: pip
|
|
||||||
cache-dependency-path: workspace/requirements.txt
|
|
||||||
- name: Install runtime + workspace requirements
|
|
||||||
# Install order is load-bearing: install the runtime FIRST so pip
|
|
||||||
# honors whatever a2a-sdk constraint the runtime metadata declares
|
|
||||||
# (this is the surface that broke in 2026-04-24 — runtime declared
|
|
||||||
# `a2a-sdk<1.0` but actually needed >=1.0). The follow-up install
|
|
||||||
# of workspace/requirements.txt then upgrades a2a-sdk to the
|
|
||||||
# constraint our runtime image actually pins. The import smoke
|
|
||||||
# below verifies the upgraded combination is consistent.
|
|
||||||
run: |
|
|
||||||
python -m venv /tmp/venv
|
|
||||||
/tmp/venv/bin/pip install --upgrade pip
|
|
||||||
/tmp/venv/bin/pip install molecule-ai-workspace-runtime
|
|
||||||
/tmp/venv/bin/pip install -r workspace/requirements.txt
|
|
||||||
/tmp/venv/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
|
|
||||||
| grep -E '^(Name|Version):'
|
|
||||||
- name: Smoke import — fail if metadata declares deps that don't satisfy real imports
|
|
||||||
# WORKSPACE_ID is validated at import time by platform_auth.py — EC2
|
|
||||||
# user-data sets it from the cloud-init template; set a placeholder
|
|
||||||
# here so the import smoke doesn't trip on the env-var guard.
|
|
||||||
env:
|
|
||||||
WORKSPACE_ID: 00000000-0000-0000-0000-000000000001
|
|
||||||
run: |
|
|
||||||
/tmp/venv/bin/python -c "from molecule_runtime.main import main_sync; print('runtime imports OK')"
|
|
||||||
@ -1,139 +0,0 @@
|
|||||||
name: Runtime PR-Built Compatibility
|
|
||||||
|
|
||||||
# Ported from .github/workflows/runtime-prbuild-compat.yml on 2026-05-11
|
|
||||||
# per RFC internal#219 §1 sweep.
|
|
||||||
#
|
|
||||||
# Differences from the GitHub version:
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue) and `workflow_dispatch:`
|
|
||||||
# (Gitea 1.22.6 parser-rejects workflow_dispatch with inputs and is
|
|
||||||
# finicky without them).
|
|
||||||
# - `dorny/paths-filter@v4` replaced with inline `git diff` (per PR#372
|
|
||||||
# pattern for ci.yml port).
|
|
||||||
# - on.paths references .gitea/workflows/runtime-prbuild-compat.yml.
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL set.
|
|
||||||
# - `continue-on-error: true` on every job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
# Companion to `runtime-pin-compat.yml`. That workflow tests what's
|
|
||||||
# CURRENTLY PUBLISHED on PyPI; this workflow tests what WOULD BE
|
|
||||||
# PUBLISHED if THIS PR merges.
|
|
||||||
#
|
|
||||||
# Why two workflows: the chicken-and-egg #128 fix added a "PR-built
|
|
||||||
# wheel" job to the original runtime-pin-compat.yml, but both jobs
|
|
||||||
# shared a `paths:` filter that was the union of their needs
|
|
||||||
# (`workspace/**`). That meant the PyPI-latest job ran on every doc
|
|
||||||
# edit even though the upstream PyPI artifact can't change with our
|
|
||||||
# workspace/ source. Splitting the two means each gets a narrow
|
|
||||||
# `paths:` filter that matches the inputs it actually depends on.
|
|
||||||
#
|
|
||||||
# Catches the failure mode where a PR adds an import requiring a newer
|
|
||||||
# SDK than `workspace/requirements.txt` pins:
|
|
||||||
# 1. Pip resolves the existing PyPI wheel + the old SDK pin -> smoke
|
|
||||||
# passes (it imports the OLD main.py from the wheel, not the PR's
|
|
||||||
# new main.py).
|
|
||||||
# 2. Merge -> publish-runtime.yml ships a wheel WITH the new import.
|
|
||||||
# 3. Tenant images redeploy -> all crash on first boot with ImportError.
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
pull_request:
|
|
||||||
branches: [main, staging]
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
# event_name + sha keeps PR sync and the subsequent staging push on the
|
|
||||||
# same SHA from cancelling each other (per feedback_concurrency_group_per_sha).
|
|
||||||
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.pull_request.head.sha || github.sha }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
detect-changes:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
outputs:
|
|
||||||
wheel: ${{ steps.decide.outputs.wheel }}
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
- id: decide
|
|
||||||
run: |
|
|
||||||
# Inline replacement for dorny/paths-filter — same pattern
|
|
||||||
# PR#372's ci.yml port used. Diffs against the PR base or the
|
|
||||||
# previous push SHA, then matches against the wheel-relevant
|
|
||||||
# path set.
|
|
||||||
BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
|
|
||||||
if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
|
|
||||||
BASE="${{ github.event.pull_request.base.sha }}"
|
|
||||||
fi
|
|
||||||
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
|
|
||||||
# New branch or no previous SHA: treat as wheel-relevant.
|
|
||||||
echo "wheel=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
if ! git cat-file -e "$BASE" 2>/dev/null; then
|
|
||||||
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
if ! git cat-file -e "$BASE" 2>/dev/null; then
|
|
||||||
echo "wheel=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
CHANGED=$(git diff --name-only "$BASE" HEAD)
|
|
||||||
if echo "$CHANGED" | grep -qE '^(workspace/|scripts/build_runtime_package\.py$|scripts/wheel_smoke\.py$|\.gitea/workflows/runtime-prbuild-compat\.yml$)'; then
|
|
||||||
echo "wheel=true" >> "$GITHUB_OUTPUT"
|
|
||||||
else
|
|
||||||
echo "wheel=false" >> "$GITHUB_OUTPUT"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# ONE job (no job-level `if:`) that always runs and reports under the
|
|
||||||
# required-check name `PR-built wheel + import smoke`. Real work is
|
|
||||||
# gated per-step on `needs.detect-changes.outputs.wheel`.
|
|
||||||
local-build-install:
|
|
||||||
needs: detect-changes
|
|
||||||
name: PR-built wheel + import smoke
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
steps:
|
|
||||||
- name: No-op pass (paths filter excluded this commit)
|
|
||||||
if: needs.detect-changes.outputs.wheel != 'true'
|
|
||||||
run: |
|
|
||||||
echo "No workspace/ / scripts/{build_runtime_package,wheel_smoke}.py / workflow changes — wheel gate satisfied without rebuilding."
|
|
||||||
echo "::notice::PR-built wheel + import smoke no-op pass (paths filter excluded this commit)."
|
|
||||||
- if: needs.detect-changes.outputs.wheel == 'true'
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
- if: needs.detect-changes.outputs.wheel == 'true'
|
|
||||||
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
|
||||||
with:
|
|
||||||
python-version: '3.11'
|
|
||||||
cache: pip
|
|
||||||
cache-dependency-path: workspace/requirements.txt
|
|
||||||
- name: Install build tooling
|
|
||||||
if: needs.detect-changes.outputs.wheel == 'true'
|
|
||||||
run: pip install build
|
|
||||||
- name: Build wheel from PR source (mirrors publish-runtime.yml)
|
|
||||||
if: needs.detect-changes.outputs.wheel == 'true'
|
|
||||||
# Use a fixed test version so the wheel filename is predictable.
|
|
||||||
# Doesn't reach PyPI — this build is local-only for the smoke.
|
|
||||||
run: |
|
|
||||||
python scripts/build_runtime_package.py \
|
|
||||||
--version "0.0.0.dev0+pin-compat" \
|
|
||||||
--out /tmp/runtime-build
|
|
||||||
cd /tmp/runtime-build && python -m build
|
|
||||||
- name: Install built wheel + workspace requirements
|
|
||||||
if: needs.detect-changes.outputs.wheel == 'true'
|
|
||||||
run: |
|
|
||||||
python -m venv /tmp/venv-built
|
|
||||||
/tmp/venv-built/bin/pip install --upgrade pip
|
|
||||||
/tmp/venv-built/bin/pip install /tmp/runtime-build/dist/*.whl
|
|
||||||
/tmp/venv-built/bin/pip install -r workspace/requirements.txt
|
|
||||||
/tmp/venv-built/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
|
|
||||||
| grep -E '^(Name|Version):'
|
|
||||||
- name: Smoke import the PR-built wheel
|
|
||||||
if: needs.detect-changes.outputs.wheel == 'true'
|
|
||||||
# Same script publish-runtime.yml runs against the to-be-PyPI wheel.
|
|
||||||
run: |
|
|
||||||
/tmp/venv-built/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
|
|
||||||
@ -1,70 +0,0 @@
|
|||||||
name: SECRET_PATTERNS drift lint
|
|
||||||
|
|
||||||
# Ported from .github/workflows/secret-pattern-drift.yml on 2026-05-11
|
|
||||||
# per RFC internal#219 §1 sweep.
|
|
||||||
#
|
|
||||||
# Differences from the GitHub version:
|
|
||||||
# - on.paths references the new canonical .gitea/workflows/secret-scan.yml
|
|
||||||
# (the .github/ copy is removed by Cat A of this sweep).
|
|
||||||
# - CANONICAL_FILE inside scripts/lint_secret_pattern_drift.py was
|
|
||||||
# updated in the same Cat C-1 PR to point at .gitea/workflows/secret-scan.yml.
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL set.
|
|
||||||
# - `continue-on-error: true` on the job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
# Detects when the canonical SECRET_PATTERNS array in
|
|
||||||
# .gitea/workflows/secret-scan.yml diverges from known consumer
|
|
||||||
# mirrors (workspace-runtime's bundled pre-commit hook today; more
|
|
||||||
# can be added as the consumer set grows).
|
|
||||||
#
|
|
||||||
# Why this exists: every side that scans for credentials has its own
|
|
||||||
# copy of the pattern list. They drift — most recently the runtime
|
|
||||||
# hook lagged the canonical by one pattern (sk-cp- / MiniMax F1088),
|
|
||||||
# so a developer's local pre-commit would let a sk-cp- token through
|
|
||||||
# while the org-wide CI scan would refuse it. The cost of that drift
|
|
||||||
# is dev confusion + delayed feedback; the fix is automated detection.
|
|
||||||
#
|
|
||||||
# Triggers:
|
|
||||||
# - schedule: daily 05:00 UTC. Catches drift introduced by edits
|
|
||||||
# to a consumer copy that didn't update canonical here.
|
|
||||||
# - push to main/staging where the canonical or this lint changed:
|
|
||||||
# catches the inverse — canonical updated but consumers not yet
|
|
||||||
# bumped. The lint will fail the push; that's intentional.
|
|
||||||
|
|
||||||
on:
|
|
||||||
schedule:
|
|
||||||
# 05:00 UTC = 22:00 PT / 01:00 ET. Quiet hours so a failure
|
|
||||||
# email lands when humans are starting their day, not
|
|
||||||
# interrupting it.
|
|
||||||
- cron: "0 5 * * *"
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
paths:
|
|
||||||
- ".gitea/workflows/secret-scan.yml"
|
|
||||||
- ".gitea/workflows/secret-pattern-drift.yml"
|
|
||||||
- ".github/scripts/lint_secret_pattern_drift.py"
|
|
||||||
- ".githooks/pre-commit"
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
# Auto-injected GITHUB_TOKEN scoped to read-only. The lint only does git
|
|
||||||
# checkout + HTTPS GETs to public consumer files; no writes to anything.
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
lint:
|
|
||||||
name: Detect SECRET_PATTERNS drift
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
timeout-minutes: 5
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
|
||||||
with:
|
|
||||||
python-version: "3.11"
|
|
||||||
|
|
||||||
- name: Run drift lint
|
|
||||||
run: python3 .github/scripts/lint_secret_pattern_drift.py
|
|
||||||
@ -1,72 +0,0 @@
|
|||||||
# security-review — non-author APPROVE from the `security` Gitea team
|
|
||||||
# required to merge.
|
|
||||||
#
|
|
||||||
# RFC#324 Step 1 of 5 (workflow-add). Mirror of `qa-review.yml`; differs
|
|
||||||
# only in TEAM=security, TEAM_ID=21, and the slash-command name.
|
|
||||||
#
|
|
||||||
# See `qa-review.yml` header for the full A1-α / A1.1 / A4 / A5 design
|
|
||||||
# rationale; everything below is identical in shape.
|
|
||||||
|
|
||||||
name: security-review
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request_target:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
issue_comment:
|
|
||||||
types: [created]
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
pull-requests: read
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
approved:
|
|
||||||
# See qa-review.yml header for full A1-α / A1.1 (v1.3 — informational
|
|
||||||
# log only, NOT a gate) / A4 / A5 design rationale.
|
|
||||||
if: |
|
|
||||||
github.event_name == 'pull_request_target' ||
|
|
||||||
(github.event_name == 'issue_comment' &&
|
|
||||||
github.event.issue.pull_request != null &&
|
|
||||||
startsWith(github.event.comment.body, '/security-recheck'))
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- name: Privilege check (A1.1 — INFORMATIONAL log only, NOT a gate)
|
|
||||||
# RFC#324 v1.3 §A1.1: does NOT gate subsequent steps. See
|
|
||||||
# qa-review.yml for full rationale. Eval is read-only/idempotent
|
|
||||||
# so re-running on a non-collaborator comment is harmless.
|
|
||||||
if: github.event_name == 'issue_comment'
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.RFC_324_TEAM_READ_TOKEN || secrets.GITHUB_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
login="${{ github.event.comment.user.login }}"
|
|
||||||
# Write token to a mode-600 file so it never appears in curl's argv.
|
|
||||||
# (#541: -H "Authorization: token $TOKEN" puts the secret in /proc/<pid>/cmdline)
|
|
||||||
authfile=$(mktemp)
|
|
||||||
chmod 600 "$authfile"
|
|
||||||
printf 'header = "Authorization: token %s"\n' "$GITEA_TOKEN" > "$authfile"
|
|
||||||
code=$(curl -sS -o /dev/null -w '%{http_code}' -K "$authfile" \
|
|
||||||
"${{ github.server_url }}/api/v1/repos/${{ github.repository }}/collaborators/${login}")
|
|
||||||
rm -f "$authfile"
|
|
||||||
if [ "$code" = "204" ]; then
|
|
||||||
echo "::notice::Recheck from ${login} (collaborator=true)"
|
|
||||||
else
|
|
||||||
echo "::notice::Recheck from ${login} (collaborator=false, HTTP ${code}) — proceeding with read-only eval anyway"
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Check out BASE ref (A4 — never PR-head)
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
ref: ${{ github.event.repository.default_branch }}
|
|
||||||
|
|
||||||
- name: Evaluate security-review
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.RFC_324_TEAM_READ_TOKEN || secrets.GITHUB_TOKEN }}
|
|
||||||
GITEA_HOST: git.moleculesai.app
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
PR_NUMBER: ${{ github.event.pull_request.number || github.event.issue.number }}
|
|
||||||
TEAM: security
|
|
||||||
TEAM_ID: '21'
|
|
||||||
REVIEW_CHECK_DEBUG: '0'
|
|
||||||
REVIEW_CHECK_STRICT: '0'
|
|
||||||
run: bash .gitea/scripts/review-check.sh
|
|
||||||
@ -1,121 +0,0 @@
|
|||||||
# sop-checklist-gate — peer-ack merge gate for SOP-checklist items.
|
|
||||||
#
|
|
||||||
# RFC#351 Step 2 of 6 (implementation MVP).
|
|
||||||
#
|
|
||||||
# === DESIGN ===
|
|
||||||
#
|
|
||||||
# Goal: each PR must answer 7 SOP-checklist questions in its body,
|
|
||||||
# and each item must have at least one /sop-ack <slug> comment from
|
|
||||||
# a non-author peer in the required team. BP requires the
|
|
||||||
# `sop-checklist / all-items-acked (pull_request)` status to merge.
|
|
||||||
#
|
|
||||||
# Triggers:
|
|
||||||
# - `pull_request_target`: opened, edited, synchronize, reopened
|
|
||||||
# → fires when PR opens, body is edited (refire — RFC#351 §4),
|
|
||||||
# or new code is pushed (head.sha changes → stale status would
|
|
||||||
# be auto-discarded by BP via dismiss_stale_reviews, but the
|
|
||||||
# status itself is per-SHA so we re-post on the new head).
|
|
||||||
# - `issue_comment`: created, edited, deleted
|
|
||||||
# → fires on any new comment so /sop-ack / /sop-revoke take
|
|
||||||
# effect immediately (Gitea 1.22.6 doesn't refire on
|
|
||||||
# pull_request_review per feedback_pull_request_review_no_refire,
|
|
||||||
# so issue_comment is the canonical refire channel).
|
|
||||||
#
|
|
||||||
# Trust boundary (mirrors RFC#324 §A4 + sop-tier-check security note):
|
|
||||||
# `pull_request_target` (not `pull_request`) — workflow def is loaded
|
|
||||||
# from BASE branch, so a PR cannot rewrite this workflow to exfiltrate
|
|
||||||
# the token. The `actions/checkout` step pins `ref: base.sha` so the
|
|
||||||
# script ALSO comes from BASE. PR-HEAD code is never executed in the
|
|
||||||
# runner.
|
|
||||||
#
|
|
||||||
# Token scope:
|
|
||||||
# - read:repository, read:organization for PR + comments + team probes
|
|
||||||
# - write:repository for POST /statuses/{sha}
|
|
||||||
# - The token owner MUST be a member of every team referenced by the
|
|
||||||
# config's required_teams (else /teams/{id}/members/{login} returns
|
|
||||||
# 403 — see review-check.sh same-gotcha doc). For the MVP we use
|
|
||||||
# the dev-lead token (a member of engineers, managers, qa, security)
|
|
||||||
# via a repo secret `SOP_CHECKLIST_GATE_TOKEN`. Provisioning of that
|
|
||||||
# secret is a follow-up authorization step (separate from this PR).
|
|
||||||
#
|
|
||||||
# Failure mode: tier-aware (RFC#351 open question 2):
|
|
||||||
# - tier:high → state=failure (hard-fail; BP blocks merge)
|
|
||||||
# - tier:medium → state=failure (hard-fail; same)
|
|
||||||
# - tier:low → state=pending (soft-fail; BP can choose to require
|
|
||||||
# this context or skip for low-tier PRs)
|
|
||||||
# - missing/no-tier → state=failure (default-mode: hard — never lower
|
|
||||||
# the bar per feedback_fix_root_not_symptom)
|
|
||||||
#
|
|
||||||
# Slash-command contract (RFC#351 v1 + §A1.1-style notes from RFC#324):
|
|
||||||
#
|
|
||||||
# /sop-ack <slug-or-numeric-alias> [optional note]
|
|
||||||
# — register a peer-ack for one checklist item.
|
|
||||||
# — slug accepts kebab-case, snake_case, or natural-spaces
|
|
||||||
# (all normalize to canonical kebab-case).
|
|
||||||
# — numeric 1..7 maps via config.items[*].numeric_alias.
|
|
||||||
# — most-recent (user, slug) directive wins.
|
|
||||||
#
|
|
||||||
# /sop-revoke <slug-or-numeric-alias> [reason]
|
|
||||||
# — invalidate the commenter's own prior /sop-ack for this slug.
|
|
||||||
# — does NOT affect other peers' acks on the same slug.
|
|
||||||
# — most-recent (user, slug) directive wins, so a later /sop-ack
|
|
||||||
# re-restores the ack.
|
|
||||||
#
|
|
||||||
# The eval is read-only + idempotent (read PR + comments + team
|
|
||||||
# membership, compute, post status). Re-running on any event is safe —
|
|
||||||
# the new status overwrites the previous one for the same context.
|
|
||||||
|
|
||||||
name: sop-checklist-gate
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request_target:
|
|
||||||
types: [opened, edited, synchronize, reopened]
|
|
||||||
issue_comment:
|
|
||||||
types: [created, edited, deleted]
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
pull-requests: read
|
|
||||||
# NOTE: `statuses: write` is the GitHub-Actions name for POST /statuses.
|
|
||||||
# Gitea 1.22.6 may not gate on this permission key (it just checks the
|
|
||||||
# token), but listing it explicitly documents intent for the next
|
|
||||||
# platform-version upgrade.
|
|
||||||
statuses: write
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
gate:
|
|
||||||
# Run on pull_request_target events always. On issue_comment events,
|
|
||||||
# only when the comment is on a PR (issue_comment fires for issues
|
|
||||||
# too) and the body contains one of the slash-commands.
|
|
||||||
if: |
|
|
||||||
github.event_name == 'pull_request_target' ||
|
|
||||||
(github.event_name == 'issue_comment' &&
|
|
||||||
github.event.issue.pull_request != null &&
|
|
||||||
(contains(github.event.comment.body, '/sop-ack') ||
|
|
||||||
contains(github.event.comment.body, '/sop-revoke')))
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- name: Check out BASE ref (trust boundary — never PR-head)
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
# For pull_request_target, the default branch is the trust
|
|
||||||
# anchor. For issue_comment the PR base may differ from the
|
|
||||||
# default branch (PR targeting `staging`), so we use the
|
|
||||||
# default-branch ref explicitly — same approach as
|
|
||||||
# qa-review.yml so the script source is always trusted.
|
|
||||||
ref: ${{ github.event.repository.default_branch }}
|
|
||||||
|
|
||||||
- name: Run sop-checklist-gate
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.SOP_CHECKLIST_GATE_TOKEN || secrets.GITHUB_TOKEN }}
|
|
||||||
PR_NUMBER: ${{ github.event.pull_request.number || github.event.issue.number }}
|
|
||||||
OWNER: ${{ github.repository_owner }}
|
|
||||||
REPO_NAME: ${{ github.event.repository.name }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
python3 .gitea/scripts/sop-checklist-gate.py \
|
|
||||||
--owner "$OWNER" \
|
|
||||||
--repo "$REPO_NAME" \
|
|
||||||
--pr "$PR_NUMBER" \
|
|
||||||
--config .gitea/sop-checklist-config.yaml \
|
|
||||||
--gitea-host git.moleculesai.app
|
|
||||||
@ -1,126 +0,0 @@
|
|||||||
# sop-tier-check — canonical Gitea Actions workflow for §SOP-6 enforcement.
|
|
||||||
#
|
|
||||||
# Logic lives in `.gitea/scripts/sop-tier-check.sh` (extracted 2026-05-09
|
|
||||||
# from the previous inline-bash version). The script is the single source
|
|
||||||
# of truth; this workflow file just sets env + invokes it.
|
|
||||||
#
|
|
||||||
# Copy BOTH files (`.gitea/workflows/sop-tier-check.yml` +
|
|
||||||
# `.gitea/scripts/sop-tier-check.sh`) into any repo that wants the
|
|
||||||
# §SOP-6 PR gate enforced. Pair with branch protection on the protected
|
|
||||||
# branch:
|
|
||||||
# required_status_checks: ["sop-tier-check / tier-check (pull_request)"]
|
|
||||||
# required_approving_reviews: 1
|
|
||||||
# approving_review_teams: ["ceo", "managers", "engineers"]
|
|
||||||
#
|
|
||||||
# Tier → required-team expression (internal#189 AND-composition):
|
|
||||||
# tier:low → engineers,managers,ceo (OR: any one suffices)
|
|
||||||
# tier:medium → managers AND engineers AND qa???,security??? (AND: all required)
|
|
||||||
# tier:high → ceo (OR: single team, wired for AND)
|
|
||||||
#
|
|
||||||
# "???" = teams not yet created in Gitea. When qa + security teams are
|
|
||||||
# added, update TIER_EXPR["tier:medium"] in the script to remove the
|
|
||||||
# markers. PRs already in-flight when qa/security are created continue
|
|
||||||
# to work because their authors explicitly requested those reviews.
|
|
||||||
#
|
|
||||||
# Force-merge: Owners-team override remains available out-of-band via
|
|
||||||
# the Gitea merge API; force-merge writes `incident.force_merge` to
|
|
||||||
# `structure_events` per §Persistent structured logging gate (Phase 3).
|
|
||||||
#
|
|
||||||
# Environment variables:
|
|
||||||
# SOP_DEBUG=1 — per-API-call diagnostic lines. Default: off.
|
|
||||||
# SOP_LEGACY_CHECK=1 — revert to OR-gate for this run. Grace window
|
|
||||||
# for PRs in-flight when AND-composition deployed.
|
|
||||||
# Burn-in: remove after 2026-05-17 (7-day window).
|
|
||||||
#
|
|
||||||
# BURN-IN NOTE (internal#189 Phase 1): continue-on-error: true is set on
|
|
||||||
# the tier-check job below. This prevents AND-composition from blocking
|
|
||||||
# PRs during the 7-day burn-in. After 2026-05-17:
|
|
||||||
# 1. Remove `continue-on-error: true` from this job block.
|
|
||||||
# 2. Update this BURN-IN NOTE comment to mark the window closed.
|
|
||||||
|
|
||||||
name: sop-tier-check
|
|
||||||
|
|
||||||
# SECURITY: triggers MUST use `pull_request_target`, not `pull_request`.
|
|
||||||
# `pull_request_target` loads the workflow definition from the BASE
|
|
||||||
# branch (i.e. `main`), not the PR's HEAD. With `pull_request`, anyone
|
|
||||||
# with write access to a feature branch could rewrite this file in
|
|
||||||
# their PR to dump SOP_TIER_CHECK_TOKEN (org-read scope) to logs and
|
|
||||||
# exfiltrate it. Verified 2026-05-09 against Gitea 1.22.6 —
|
|
||||||
# `pull_request_target` (added in Gitea 1.21 via go-gitea/gitea#25229)
|
|
||||||
# is the documented mitigation.
|
|
||||||
#
|
|
||||||
# This workflow does NOT call `actions/checkout` of PR HEAD code, so no
|
|
||||||
# untrusted code is ever executed in the runner — we only HTTP-call the
|
|
||||||
# Gitea API. If a future change adds a checkout step, it MUST pin to
|
|
||||||
# `${{ github.event.pull_request.base.sha }}` (NOT `head.sha`) to keep
|
|
||||||
# the trust boundary.
|
|
||||||
on:
|
|
||||||
pull_request_target:
|
|
||||||
types: [opened, edited, synchronize, reopened, labeled, unlabeled]
|
|
||||||
pull_request_review:
|
|
||||||
types: [submitted, dismissed, edited]
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
tier-check:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# BURN-IN: continue-on-error prevents AND-composition from blocking
|
|
||||||
# PRs during the 7-day window. Remove after 2026-05-17 (internal#189).
|
|
||||||
continue-on-error: true
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
pull-requests: read
|
|
||||||
steps:
|
|
||||||
- name: Check out base branch (for the script)
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
# Pin to base.sha — pull_request_target's protection only
|
|
||||||
# works if we never check out PR HEAD. Same SHA the workflow
|
|
||||||
# itself was loaded from.
|
|
||||||
ref: ${{ github.event.pull_request.base.sha }}
|
|
||||||
- name: Install jq
|
|
||||||
# Gitea Actions runners (ubuntu-latest label) do not bundle jq.
|
|
||||||
# The sop-tier-check script uses jq for all JSON API parsing.
|
|
||||||
# Install jq before the script runs so sop-tier-check can pass.
|
|
||||||
#
|
|
||||||
# Method: apt-get first (reliable for Ubuntu runners with internet
|
|
||||||
# access to package mirrors). Falls back to GitHub binary download.
|
|
||||||
# GitHub releases may be unreachable from some runner networks
|
|
||||||
# (infra#241 follow-up: GitHub timeout after 3s on 5.78.80.188
|
|
||||||
# runners). The sop-tier-check script has its own fallback as a
|
|
||||||
# third line of defense. continue-on-error: true ensures this step
|
|
||||||
# failing does not block the job.
|
|
||||||
continue-on-error: true
|
|
||||||
run: |
|
|
||||||
# apt-get is the primary method — Ubuntu package mirrors are reliably
|
|
||||||
# reachable from runner containers. GitHub releases may be blocked
|
|
||||||
# or slow on some networks (infra#241 follow-up).
|
|
||||||
if apt-get update -qq && apt-get install -y -qq jq; then
|
|
||||||
echo "::notice::jq installed via apt-get: $(jq --version)"
|
|
||||||
elif timeout 120 curl -sSL \
|
|
||||||
"https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-amd64" \
|
|
||||||
-o /usr/local/bin/jq && chmod +x /usr/local/bin/jq; then
|
|
||||||
echo "::notice::jq binary downloaded: $(/usr/local/bin/jq --version)"
|
|
||||||
else
|
|
||||||
echo "::warning::jq install failed — apt-get and GitHub download both failed."
|
|
||||||
fi
|
|
||||||
jq --version 2>/dev/null || echo "::notice::jq not yet available — script fallback will retry"
|
|
||||||
|
|
||||||
- name: Verify tier label + reviewer team membership
|
|
||||||
# continue-on-error: true at step level — job-level is ignored by Gitea
|
|
||||||
# Actions (quirk #10, internal runbooks). Belt-and-suspenders with
|
|
||||||
# SOP_FAIL_OPEN=1 + || true below.
|
|
||||||
continue-on-error: true
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
|
||||||
GITEA_HOST: git.moleculesai.app
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
|
||||||
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
|
|
||||||
SOP_DEBUG: '0'
|
|
||||||
SOP_LEGACY_CHECK: '0'
|
|
||||||
# SOP_FAIL_OPEN=1 makes the script always exit 0. The UI enforces
|
|
||||||
# the actual merge gate. Combined with continue-on-error: true
|
|
||||||
# above, this step never fails the job regardless of script exit.
|
|
||||||
SOP_FAIL_OPEN: '1'
|
|
||||||
run: |
|
|
||||||
bash .gitea/scripts/sop-tier-check.sh || true
|
|
||||||
@ -1,79 +0,0 @@
|
|||||||
# sop-tier-refire — issue_comment-triggered refire of sop-tier-check.
|
|
||||||
#
|
|
||||||
# Closes internal#292. Gitea 1.22.6 doesn't refire workflows on the
|
|
||||||
# `pull_request_review` event (go-gitea/gitea#33700); the `sop-tier-check`
|
|
||||||
# workflow's review-event subscription is silently dead. The result:
|
|
||||||
# PRs that get their approving review AFTER the tier-check ran on open/
|
|
||||||
# synchronize keep their failing status check forever, and the only way
|
|
||||||
# to merge is the admin force-merge path (audited via `audit-force-merge`
|
|
||||||
# but the audit trail keeps growing; see `feedback_never_admin_merge_bypass`).
|
|
||||||
#
|
|
||||||
# Workaround pattern from `feedback_pull_request_review_no_refire`:
|
|
||||||
# `issue_comment` events DO fire reliably on 1.22.6. When a repo
|
|
||||||
# MEMBER/OWNER/COLLABORATOR comments `/refire-tier-check` on a PR, this
|
|
||||||
# workflow re-runs the sop-tier-check logic and POSTs the resulting
|
|
||||||
# status to the PR head SHA directly. No empty commit, no git history
|
|
||||||
# bloat, no cascade re-fire of every other workflow on the PR.
|
|
||||||
#
|
|
||||||
# SECURITY MODEL:
|
|
||||||
#
|
|
||||||
# 1. `pull_request` exists on the issue (issue_comment fires on issues
|
|
||||||
# AND PRs; we only want PRs).
|
|
||||||
# 2. `comment.author_association` must be MEMBER/OWNER/COLLABORATOR.
|
|
||||||
# Per the internal#292 core-security review (review#1066 ask): anyone
|
|
||||||
# can comment, but only repo collaborators+ can flip the status.
|
|
||||||
# Without this gate, a drive-by commenter on a public-issue-tracker
|
|
||||||
# surface could trigger a status flip.
|
|
||||||
# 3. Comment body must contain `/refire-tier-check` — a slash-command-
|
|
||||||
# shaped trigger (not just any comment word). Prevents accidental
|
|
||||||
# triggering from prose like "we should refire tests" in a review.
|
|
||||||
# 4. This workflow does NOT check out PR HEAD code. Like sop-tier-check,
|
|
||||||
# it only HTTP-calls the Gitea API. Trust boundary preserved.
|
|
||||||
#
|
|
||||||
# Note: `issue_comment` fires from the BASE branch's workflow file. There
|
|
||||||
# is no `pull_request_target` equivalent to set; the trigger inherently
|
|
||||||
# loads the workflow from the default branch.
|
|
||||||
#
|
|
||||||
# Rate-limit: a 1s pre-sleep + a "skip if status posted in last 30s"
|
|
||||||
# guard prevents comment-spam from thrashing the status. See the script.
|
|
||||||
|
|
||||||
name: sop-tier-check refire (issue_comment)
|
|
||||||
|
|
||||||
on:
|
|
||||||
issue_comment:
|
|
||||||
types: [created]
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
refire:
|
|
||||||
# Three gates, all required:
|
|
||||||
# - comment is on a PR (not a plain issue)
|
|
||||||
# - commenter is MEMBER, OWNER, or COLLABORATOR
|
|
||||||
# - comment body contains the slash-command trigger
|
|
||||||
if: |
|
|
||||||
github.event.issue.pull_request != null &&
|
|
||||||
contains(fromJson('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association) &&
|
|
||||||
contains(github.event.comment.body, '/refire-tier-check')
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
pull-requests: read
|
|
||||||
statuses: write
|
|
||||||
steps:
|
|
||||||
- name: Check out base branch (for the script)
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
# Load the script from the default branch (main), matching the
|
|
||||||
# sop-tier-check.yml security model.
|
|
||||||
ref: ${{ github.event.repository.default_branch }}
|
|
||||||
- name: Re-evaluate sop-tier-check and POST status
|
|
||||||
env:
|
|
||||||
# Same org-level secret sop-tier-check.yml + audit-force-merge.yml use.
|
|
||||||
# Fallback to GITHUB_TOKEN with a clear error if missing.
|
|
||||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
|
||||||
GITEA_HOST: git.moleculesai.app
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
PR_NUMBER: ${{ github.event.issue.number }}
|
|
||||||
COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
|
|
||||||
# Set to '1' for diagnostic per-API-call output. Off by default.
|
|
||||||
SOP_DEBUG: '0'
|
|
||||||
run: bash .gitea/scripts/sop-tier-refire.sh
|
|
||||||
@ -1,346 +0,0 @@
|
|||||||
name: Staging SaaS smoke (every 30 min)
|
|
||||||
|
|
||||||
# Renamed from canary-staging.yml on 2026-05-11 per Hongming directive
|
|
||||||
# ("canary naming changed to staging for all"). Originally ported from
|
|
||||||
# .github/workflows/canary-staging.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
|
|
||||||
# Minimum viable health check: provisions one Hermes workspace on a fresh
|
|
||||||
# staging org, sends one A2A message, verifies PONG, tears down. ~8 min
|
|
||||||
# wall clock. Pages on failure by opening a GitHub issue; auto-closes the
|
|
||||||
# issue on the next green run.
|
|
||||||
#
|
|
||||||
# The full-SaaS workflow (e2e-staging-saas.yml) covers the broader surface
|
|
||||||
# but runs only on provisioning-critical pushes + nightly — this one
|
|
||||||
# catches drift in the 30-min window between those runs (AMI health, CF
|
|
||||||
# cert rotation, WorkOS session stability, etc.).
|
|
||||||
#
|
|
||||||
# Lean mode: E2E_MODE=smoke skips the child workspace + HMA memory +
|
|
||||||
# peers/activity checks. One parent workspace + one A2A turn is enough
|
|
||||||
# to signal "SaaS stack end-to-end is alive."
|
|
||||||
|
|
||||||
on:
|
|
||||||
schedule:
|
|
||||||
# Every 30 min. Cron on GitHub-hosted runners has a known drift of
|
|
||||||
# a few minutes under load — that's fine for a smoke check.
|
|
||||||
- cron: '*/30 * * * *'
|
|
||||||
# Serialise with the full-SaaS workflow so they don't contend for the
|
|
||||||
# same org-create quota on staging. Different group key from
|
|
||||||
# e2e-staging-saas since we don't mind queueing smoke runs behind one
|
|
||||||
# full run, but two smoke runs SHOULD queue against each other.
|
|
||||||
concurrency:
|
|
||||||
group: staging-smoke
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
# Needed to open / close the alerting issue.
|
|
||||||
issues: write
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
smoke:
|
|
||||||
name: Staging SaaS smoke
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed
|
|
||||||
# 2026-05-11. The "surface broken workflows without blocking"
|
|
||||||
# rationale was correctly applied to advisory/lint workflows but
|
|
||||||
# wrong for this smoke — it is the 30-min canary cadence for the
|
|
||||||
# entire staging SaaS stack, and silent failure here masks the
|
|
||||||
# exact regressions the smoke exists to surface (AMI rot, CF cert
|
|
||||||
# drift, WorkOS session breakage, secret rotations). Same class of
|
|
||||||
# failure as PR#461 (`sweep-stale-e2e-orgs`) where Phase-3 silent
|
|
||||||
# failure leaked EC2. The four other `e2e-staging-*` workflows
|
|
||||||
# KEEP `continue-on-error: true` per RFC #219 §1 — they are
|
|
||||||
# advisory and matrix-style; this one is the canary. A follow-up
|
|
||||||
# `notify-failure` step below also surfaces breakage to ops even
|
|
||||||
# if branch-protection wiring is adjusted to keep this off the
|
|
||||||
# required-checks list.
|
|
||||||
# 25 min headroom over the 15-min TLS-readiness deadline in
|
|
||||||
# tests/e2e/test_staging_full_saas.sh (#2107). Without the buffer
|
|
||||||
# the job is killed at the wall-clock 15:00 mark BEFORE the bash
|
|
||||||
# `fail` + diagnostic burst can fire, leaving every cancellation
|
|
||||||
# silent. Sibling staging E2E jobs run at 20-45 min — keeping the
|
|
||||||
# smoke tighter than them so a true wedge still surfaces here
|
|
||||||
# first.
|
|
||||||
timeout-minutes: 25
|
|
||||||
|
|
||||||
env:
|
|
||||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
|
||||||
# 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
|
|
||||||
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
|
|
||||||
# internal#322 — see this PR for the cross-workflow sweep.
|
|
||||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
# MiniMax is the smoke's PRIMARY LLM auth path post-2026-05-04.
|
|
||||||
# Switched from hermes+OpenAI after #2578 (the staging OpenAI key
|
|
||||||
# account went over quota and stayed dead for 36+ hours, taking
|
|
||||||
# the smoke red the entire time). claude-code template's
|
|
||||||
# `minimax` provider routes ANTHROPIC_BASE_URL to
|
|
||||||
# api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot —
|
|
||||||
# ~5-10x cheaper per token than gpt-4.1-mini AND on a separate
|
|
||||||
# billing account, so OpenAI quota collapse no longer wedges the
|
|
||||||
# smoke. Mirrors the migration continuous-synth-e2e.yml made on
|
|
||||||
# 2026-05-03 (#265) for the same reason. tests/e2e/test_staging_
|
|
||||||
# full_saas.sh branches SECRETS_JSON on which key is present —
|
|
||||||
# MiniMax wins when set.
|
|
||||||
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
|
|
||||||
# Direct-Anthropic alternative for operators who don't want to
|
|
||||||
# set up a MiniMax account (priority below MiniMax — first
|
|
||||||
# non-empty wins in test_staging_full_saas.sh's secrets-injection
|
|
||||||
# block). See #2578 PR comment for the rationale.
|
|
||||||
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
|
|
||||||
# OpenAI fallback — kept wired so an operator-dispatched run with
|
|
||||||
# E2E_RUNTIME=hermes overridden via workflow_dispatch can still
|
|
||||||
# exercise the OpenAI path without re-editing the workflow.
|
|
||||||
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
|
|
||||||
E2E_MODE: smoke
|
|
||||||
E2E_RUNTIME: claude-code
|
|
||||||
# Pin the smoke to a specific MiniMax model rather than relying
|
|
||||||
# on the per-runtime default (which could resolve to "sonnet" →
|
|
||||||
# direct Anthropic and defeat the cost saving). M2.7-highspeed
|
|
||||||
# is "Token Plan only" but cheap-per-token and fast.
|
|
||||||
E2E_MODEL_SLUG: MiniMax-M2.7-highspeed
|
|
||||||
E2E_RUN_ID: "smoke-${{ github.run_id }}"
|
|
||||||
# Debug-only: when an operator dispatches with keep_on_failure=true,
|
|
||||||
# the smoke script's E2E_KEEP_ORG=1 path skips teardown so the
|
|
||||||
# tenant org + EC2 stay alive for SSM-based log capture. Cron runs
|
|
||||||
# never set this (the input only exists on workflow_dispatch) so
|
|
||||||
# unattended cron always tears down. See molecule-core#129
|
|
||||||
# failure mode #1 — capturing the actual exception requires
|
|
||||||
# docker logs from the live container.
|
|
||||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_on_failure == 'true' && '1' || '0' }}
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Verify admin token present
|
|
||||||
run: |
|
|
||||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
|
||||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
|
|
||||||
exit 2
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Verify LLM key present
|
|
||||||
run: |
|
|
||||||
# Per-runtime key check — claude-code uses MiniMax; hermes /
|
|
||||||
# langgraph (operator-dispatched only) use OpenAI. Hard-fail
|
|
||||||
# rather than soft-skip per the lesson from synth E2E #2578:
|
|
||||||
# an empty key silently falls through to the wrong
|
|
||||||
# SECRETS_JSON branch and the smoke fails 5 min later with
|
|
||||||
# a confusing auth error instead of the clean "secret
|
|
||||||
# missing" message at the top.
|
|
||||||
case "${E2E_RUNTIME}" in
|
|
||||||
claude-code)
|
|
||||||
# Either MiniMax OR direct-Anthropic works — first
|
|
||||||
# non-empty wins in the test script's secrets-injection
|
|
||||||
# priority chain. Operators only need to set ONE of these
|
|
||||||
# secrets; we don't force a choice between them.
|
|
||||||
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
|
||||||
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
|
|
||||||
required_secret_value="${E2E_MINIMAX_API_KEY}"
|
|
||||||
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
|
|
||||||
required_secret_name="MOLECULE_STAGING_ANTHROPIC_API_KEY"
|
|
||||||
required_secret_value="${E2E_ANTHROPIC_API_KEY}"
|
|
||||||
else
|
|
||||||
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY or MOLECULE_STAGING_ANTHROPIC_API_KEY"
|
|
||||||
required_secret_value=""
|
|
||||||
fi
|
|
||||||
;;
|
|
||||||
langgraph|hermes)
|
|
||||||
required_secret_name="MOLECULE_STAGING_OPENAI_API_KEY"
|
|
||||||
required_secret_value="${E2E_OPENAI_API_KEY:-}"
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
|
|
||||||
required_secret_name=""
|
|
||||||
required_secret_value="present"
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
|
|
||||||
echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — A2A will fail at request time with 'No LLM provider configured'"
|
|
||||||
exit 2
|
|
||||||
fi
|
|
||||||
echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"
|
|
||||||
|
|
||||||
- name: Smoke run
|
|
||||||
id: smoke
|
|
||||||
run: bash tests/e2e/test_staging_full_saas.sh
|
|
||||||
|
|
||||||
# Alerting: open a sticky issue on the FIRST failure; comment on
|
|
||||||
# subsequent failures; auto-close on next green. Comment-on-existing
|
|
||||||
# de-duplicates so a single open issue accumulates the streak —
|
|
||||||
# ops sees one issue with N comments rather than N issues.
|
|
||||||
#
|
|
||||||
# Why no consecutive-failures threshold (e.g., wait 3 runs before
|
|
||||||
# filing): the prior threshold check used
|
|
||||||
# `github.rest.actions.listWorkflowRuns()` which Gitea 1.22.6 does
|
|
||||||
# not expose (returns 404). On Gitea Actions the threshold call
|
|
||||||
# ALWAYS failed, breaking the entire alerting step and going days
|
|
||||||
# silent on real regressions (38h+ chronic red on 2026-05-07/08
|
|
||||||
# before this fix; tracked in molecule-core#129). Filing on first
|
|
||||||
# failure is also better UX — we want to know about the first red,
|
|
||||||
# not wait 90 min for it to "count." Real flakes get one issue +
|
|
||||||
# a quick close-on-green; persistent reds accumulate comments.
|
|
||||||
- name: Open issue on failure (Gitea API)
|
|
||||||
if: failure()
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
|
|
||||||
RUN_ID: ${{ github.run_id }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
API="${SERVER_URL%/}/api/v1"
|
|
||||||
# Title kept stable across the canary-staging.yml → staging-smoke.yml
|
|
||||||
# rename (2026-05-11) so any open alert issue from the old name
|
|
||||||
# still title-matches and auto-closes on the next green run.
|
|
||||||
TITLE="Canary failing: staging SaaS smoke"
|
|
||||||
RUN_URL="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
|
|
||||||
|
|
||||||
EXISTING=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
|
|
||||||
"${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
|
|
||||||
| jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number' | head -1)
|
|
||||||
|
|
||||||
if [ -n "$EXISTING" ]; then
|
|
||||||
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
|
|
||||||
"${API}/repos/${REPO}/issues/${EXISTING}/comments" \
|
|
||||||
-d "$(jq -nc --arg run "$RUN_URL" '{body: ("Smoke still failing. " + $run)}')" >/dev/null
|
|
||||||
echo "Commented on existing issue #${EXISTING}"
|
|
||||||
else
|
|
||||||
NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
|
|
||||||
BODY=$(jq -nc --arg t "$TITLE" --arg now "$NOW" --arg run "$RUN_URL" \
|
|
||||||
'{title: $t, body: ("Smoke run failed at " + $now + ".\n\nRun: " + $run + "\n\nThis issue auto-closes on the next green smoke run. Consecutive failures add a comment here rather than a new issue.")}')
|
|
||||||
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
|
|
||||||
"${API}/repos/${REPO}/issues" -d "$BODY" >/dev/null
|
|
||||||
echo "Opened smoke failure issue (first red)"
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Auto-close smoke issue on success (Gitea API)
|
|
||||||
if: success()
|
|
||||||
env:
|
|
||||||
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
|
|
||||||
RUN_ID: ${{ github.run_id }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
API="${SERVER_URL%/}/api/v1"
|
|
||||||
# Title kept stable across the canary-staging.yml → staging-smoke.yml
|
|
||||||
# rename so open alert issues from the old name still match.
|
|
||||||
TITLE="Canary failing: staging SaaS smoke"
|
|
||||||
|
|
||||||
NUMS=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
|
|
||||||
"${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
|
|
||||||
| jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number')
|
|
||||||
|
|
||||||
NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
|
|
||||||
for N in $NUMS; do
|
|
||||||
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
|
|
||||||
"${API}/repos/${REPO}/issues/${N}/comments" \
|
|
||||||
-d "$(jq -nc --arg now "$NOW" '{body: ("Smoke recovered at " + $now + ". Closing.")}')" >/dev/null
|
|
||||||
curl -fsS -X PATCH -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
|
|
||||||
"${API}/repos/${REPO}/issues/${N}" -d '{"state":"closed"}' >/dev/null
|
|
||||||
echo "Closed recovered smoke issue #${N}"
|
|
||||||
done
|
|
||||||
|
|
||||||
- name: Teardown safety net
|
|
||||||
if: always()
|
|
||||||
env:
|
|
||||||
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set +e
|
|
||||||
# Slug prefix matches what test_staging_full_saas.sh emits
|
|
||||||
# in smoke mode:
|
|
||||||
# SLUG="e2e-smoke-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
|
|
||||||
# Earlier (pre-2026-05-11 canary→staging rename) the prefix was
|
|
||||||
# `e2e-canary-`; both prefixes are matched here for one
|
|
||||||
# release cycle so cleanup still catches any in-flight org
|
|
||||||
# provisioned under the old prefix on an older runner that
|
|
||||||
# hasn't picked up the renamed script. Remove the canary
|
|
||||||
# fallback after one week of no-old-prefix observations.
|
|
||||||
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
|
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
|
||||||
| python3 -c "
|
|
||||||
import json, sys, os, datetime
|
|
||||||
run_id = os.environ.get('GITHUB_RUN_ID', '')
|
|
||||||
d = json.load(sys.stdin)
|
|
||||||
# Scope to slugs from THIS smoke run when GITHUB_RUN_ID is
|
|
||||||
# available; the smoke workflow sets E2E_RUN_ID='smoke-\${run_id}'
|
|
||||||
# so the slug suffix is '-smoke-\${run_id}-...'. Mirrors the
|
|
||||||
# full-mode safety net's per-run scoping (e2e-staging-saas.yml)
|
|
||||||
# added after the 2026-04-21 cross-run cleanup incident.
|
|
||||||
# Sweep both today AND yesterday's UTC dates so a run that
|
|
||||||
# crosses midnight still cleans up its own slug — see the
|
|
||||||
# 2026-04-26→27 canvas-safety-net incident.
|
|
||||||
today = datetime.date.today()
|
|
||||||
yesterday = today - datetime.timedelta(days=1)
|
|
||||||
dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
|
|
||||||
if run_id:
|
|
||||||
prefixes = tuple(f'e2e-smoke-{d}-smoke-{run_id}' for d in dates) \
|
|
||||||
+ tuple(f'e2e-canary-{d}-canary-{run_id}' for d in dates)
|
|
||||||
else:
|
|
||||||
prefixes = tuple(f'e2e-smoke-{d}-' for d in dates) \
|
|
||||||
+ tuple(f'e2e-canary-{d}-' for d in dates)
|
|
||||||
candidates = [o['slug'] for o in d.get('orgs', [])
|
|
||||||
if any(o.get('slug','').startswith(p) for p in prefixes)
|
|
||||||
and o.get('status') not in ('purged',)]
|
|
||||||
print('\n'.join(candidates))
|
|
||||||
" 2>/dev/null)
|
|
||||||
# Per-slug DELETE with HTTP-code verification. The previous
|
|
||||||
# `... >/dev/null || true` swallowed every failure, so a 5xx
|
|
||||||
# or timeout from CP looked identical to "successfully cleaned
|
|
||||||
# up" and the tenant kept eating ~2 vCPU until the hourly
|
|
||||||
# stale sweep caught it (up to 2h later). Now we capture the
|
|
||||||
# response code and surface non-2xx as a workflow warning, so
|
|
||||||
# the run page shows which slug leaked. We still don't `exit 1`
|
|
||||||
# on cleanup failure — a single-smoke cleanup miss shouldn't
|
|
||||||
# fail-flag the smoke itself when the actual smoke check
|
|
||||||
# passed. The sweep-stale-e2e-orgs cron (now every 15 min,
|
|
||||||
# 30-min threshold) is the safety net for whatever slips past.
|
|
||||||
# See molecule-controlplane#420.
|
|
||||||
leaks=()
|
|
||||||
for slug in $orgs; do
|
|
||||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
|
||||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
|
||||||
set +e
|
|
||||||
curl -sS -o /tmp/smoke-cleanup.out -w "%{http_code}" \
|
|
||||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d "{\"confirm\":\"$slug\"}" >/tmp/smoke-cleanup.code
|
|
||||||
set -e
|
|
||||||
code=$(cat /tmp/smoke-cleanup.code 2>/dev/null || echo "000")
|
|
||||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
|
||||||
echo "[teardown] deleted $slug (HTTP $code)"
|
|
||||||
else
|
|
||||||
echo "::warning::smoke teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/smoke-cleanup.out 2>/dev/null)"
|
|
||||||
leaks+=("$slug")
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
if [ ${#leaks[@]} -gt 0 ]; then
|
|
||||||
echo "::warning::smoke teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
|
||||||
fi
|
|
||||||
exit 0
|
|
||||||
|
|
||||||
- name: Notify on smoke failure
|
|
||||||
# Fail-loud companion to dropping `continue-on-error: true`.
|
|
||||||
# The Open-issue-on-failure step above handles the human-facing
|
|
||||||
# alert; this step emits a clearly-tagged ::error:: line that
|
|
||||||
# log-tail consumers (Loki SOPRefireRule, orchestrator triage
|
|
||||||
# loop) can grep on. Mirrors PR#461's sweep-stale-e2e-orgs
|
|
||||||
# pattern. Runs AFTER the teardown safety net (which is
|
|
||||||
# if: always()) so failures don't suppress cleanup.
|
|
||||||
if: failure()
|
|
||||||
run: |
|
|
||||||
echo "::error::staging-smoke FAILED — staging SaaS canary is red. See prior step logs + the auto-filed alert issue. Common causes: (a) CP_STAGING_ADMIN_API_TOKEN secret missing/rotated, (b) staging-api.moleculesai.app 5xx, (c) MiniMax/Anthropic LLM key dead, (d) AMI/CF/WorkOS drift. The 30-min cron will retry, but a chronic red here indicates the staging SaaS stack is broken end-to-end."
|
|
||||||
exit 1
|
|
||||||
@ -1,289 +0,0 @@
|
|||||||
name: Staging verify
|
|
||||||
|
|
||||||
# Renamed from canary-verify.yml on 2026-05-11 per Hongming directive
|
|
||||||
# ("canary naming changed to staging for all"). Originally ported from
|
|
||||||
# .github/workflows/canary-verify.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
# - ~~**Gitea workflow_run trigger limitation**~~ FIXED: replaced with
|
|
||||||
# push+paths filter per this PR. Gitea 1.22.6 does not support
|
|
||||||
# `workflow_run` (task #81). The push trigger fires on every
|
|
||||||
# commit to publish-workspace-server-image.yml. Removed the
|
|
||||||
# `workflow_run.conclusion==success` job if since the push trigger
|
|
||||||
# doesn't carry completion state — the smoke test is the safety net
|
|
||||||
# (it will detect and abort on a bad image regardless). Added
|
|
||||||
# workflow_dispatch for manual runs.
|
|
||||||
#
|
|
||||||
|
|
||||||
# Runs the canary smoke suite against the staging canary tenant fleet
|
|
||||||
# after a new :staging-<sha> image lands in ECR. On green, calls the
|
|
||||||
# CP redeploy-fleet endpoint to promote :staging-<sha> → :latest so
|
|
||||||
# the prod tenant fleet's 5-minute auto-updater picks up the verified
|
|
||||||
# digest. On red, :latest stays on the prior known-good digest and
|
|
||||||
# prod is untouched.
|
|
||||||
#
|
|
||||||
# Terminology note (2026-05-11): The deployment STRATEGY here is still
|
|
||||||
# called "canary release" (a small subset of tenants gets the new image
|
|
||||||
# first, the rest follow on green). The "canary" word stays for the
|
|
||||||
# pre-fan-out cohort concept (see docs/architecture/canary-release.md
|
|
||||||
# and CANARY_SLUG in redeploy-tenants-on-*.yml). What changed is the
|
|
||||||
# FILE NAME and the SECRETS feeding this workflow — both are renamed
|
|
||||||
# to drop the redundant "canary-" prefix that conflated workflow
|
|
||||||
# identity with deployment strategy.
|
|
||||||
#
|
|
||||||
# Registry note (2026-05-10): This workflow previously used GHCR
|
|
||||||
# (ghcr.io/molecule-ai/platform-tenant) — that registry was retired
|
|
||||||
# during the 2026-05-06 Gitea suspension migration when publish-
|
|
||||||
# workspace-server-image.yml switched to the operator's ECR org
|
|
||||||
# (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/
|
|
||||||
# platform-tenant). The GHCR → ECR migration was never applied to
|
|
||||||
# this file, so this workflow was silently smoke-testing the stale
|
|
||||||
# GHCR image while the actual staging/prod tenants ran the ECR image.
|
|
||||||
# Result: smoke tests could not catch a broken ECR build. Fix:
|
|
||||||
# - Wait step: reads SHA from running canary /health (tenant-
|
|
||||||
# agnostic, works regardless of registry).
|
|
||||||
# - Promote step: calls CP redeploy-fleet endpoint with target_tag=
|
|
||||||
# staging-<sha>, same mechanism as redeploy-tenants-on-main.yml.
|
|
||||||
# No longer attempts GHCR crane ops.
|
|
||||||
#
|
|
||||||
# Dependencies:
|
|
||||||
# - publish-workspace-server-image.yml publishes :staging-<sha>
|
|
||||||
# to ECR on staging and main merges.
|
|
||||||
# - Canary tenants are configured to pull :staging-<sha> from ECR
|
|
||||||
# (TENANT_IMAGE env set to the ECR :staging-<sha> tag).
|
|
||||||
# - Repo secrets MOLECULE_STAGING_TENANT_URLS /
|
|
||||||
# MOLECULE_STAGING_ADMIN_TOKENS / MOLECULE_STAGING_CP_SHARED_SECRET
|
|
||||||
# are populated.
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [staging]
|
|
||||||
paths:
|
|
||||||
- '.gitea/workflows/publish-workspace-server-image.yml'
|
|
||||||
workflow_dispatch:
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
packages: write
|
|
||||||
actions: read
|
|
||||||
|
|
||||||
env:
|
|
||||||
# ECR registry (post-2026-05-06 SSOT for tenant images).
|
|
||||||
# publish-workspace-server-image.yml pushes here.
|
|
||||||
IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
|
|
||||||
TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
|
|
||||||
# CP endpoint for redeploy-fleet (used in promote step below).
|
|
||||||
CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
staging-smoke:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
outputs:
|
|
||||||
sha: ${{ steps.compute.outputs.sha }}
|
|
||||||
smoke_ran: ${{ steps.smoke.outputs.ran }}
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Compute sha
|
|
||||||
id: compute
|
|
||||||
run: echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Wait for canary tenants to pick up :staging-<sha>
|
|
||||||
# Poll canary health endpoints every 30s for up to 7 min instead
|
|
||||||
# of a fixed 6-min sleep. Exits as soon as ALL canaries report
|
|
||||||
# the new SHA (~2-3 min typical vs 6 min fixed). Falls back to
|
|
||||||
# proceeding after 7 min even if not all canaries responded —
|
|
||||||
# the smoke suite will catch any that didn't update.
|
|
||||||
#
|
|
||||||
# NOTE: The SHA is read from the running tenant's /health response,
|
|
||||||
# NOT from a registry lookup. This is registry-agnostic and works
|
|
||||||
# regardless of whether the tenant pulls from ECR, GHCR, or any
|
|
||||||
# other registry — the canary is telling us what it's actually
|
|
||||||
# running, which is the ground truth for smoke testing.
|
|
||||||
env:
|
|
||||||
MOLECULE_STAGING_TENANT_URLS: ${{ secrets.MOLECULE_STAGING_TENANT_URLS }}
|
|
||||||
EXPECTED_SHA: ${{ steps.compute.outputs.sha }}
|
|
||||||
run: |
|
|
||||||
if [ -z "$MOLECULE_STAGING_TENANT_URLS" ]; then
|
|
||||||
echo "No canary URLs configured — falling back to 60s wait"
|
|
||||||
sleep 60
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
IFS=',' read -ra URLS <<< "$MOLECULE_STAGING_TENANT_URLS"
|
|
||||||
MAX_WAIT=420 # 7 minutes
|
|
||||||
INTERVAL=30
|
|
||||||
ELAPSED=0
|
|
||||||
while [ $ELAPSED -lt $MAX_WAIT ]; do
|
|
||||||
ALL_READY=true
|
|
||||||
for url in "${URLS[@]}"; do
|
|
||||||
HEALTH=$(curl -s --max-time 5 "${url}/health" 2>/dev/null || echo "{}")
|
|
||||||
SHA=$(echo "$HEALTH" | grep -o "\"sha\":\"[^\"]*\"" | head -1 | cut -d'"' -f4)
|
|
||||||
if [ "$SHA" != "$EXPECTED_SHA" ]; then
|
|
||||||
ALL_READY=false
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
if $ALL_READY; then
|
|
||||||
echo "All canaries running staging-${EXPECTED_SHA} after ${ELAPSED}s"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
echo "Waiting for canaries... (${ELAPSED}s / ${MAX_WAIT}s)"
|
|
||||||
sleep $INTERVAL
|
|
||||||
ELAPSED=$((ELAPSED + INTERVAL))
|
|
||||||
done
|
|
||||||
echo "Timeout after ${MAX_WAIT}s — proceeding anyway (smoke suite will validate)"
|
|
||||||
|
|
||||||
- name: Run staging smoke suite
|
|
||||||
id: smoke
|
|
||||||
# Graceful-skip when no canary fleet is configured (Phase 2 not yet
|
|
||||||
# stood up — see molecule-controlplane/docs/canary-tenants.md).
|
|
||||||
# Sets `ran=false` on skip so promote-to-latest stays off (we don't
|
|
||||||
# want every main merge auto-promoting without gating). Manual
|
|
||||||
# promote-latest.yml is the release gate while canary is absent.
|
|
||||||
# Once the fleet is real: delete the early-exit branch.
|
|
||||||
env:
|
|
||||||
MOLECULE_STAGING_TENANT_URLS: ${{ secrets.MOLECULE_STAGING_TENANT_URLS }}
|
|
||||||
MOLECULE_STAGING_ADMIN_TOKENS: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKENS }}
|
|
||||||
MOLECULE_STAGING_CP_BASE_URL: https://staging-api.moleculesai.app
|
|
||||||
MOLECULE_STAGING_CP_SHARED_SECRET: ${{ secrets.MOLECULE_STAGING_CP_SHARED_SECRET }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
if [ -z "${MOLECULE_STAGING_TENANT_URLS:-}" ] \
|
|
||||||
|| [ -z "${MOLECULE_STAGING_ADMIN_TOKENS:-}" ] \
|
|
||||||
|| [ -z "${MOLECULE_STAGING_CP_SHARED_SECRET:-}" ]; then
|
|
||||||
{
|
|
||||||
echo "## ⚠️ staging-verify skipped"
|
|
||||||
echo
|
|
||||||
echo "One or more canary secrets are unset (\`MOLECULE_STAGING_TENANT_URLS\`, \`MOLECULE_STAGING_ADMIN_TOKENS\`, \`MOLECULE_STAGING_CP_SHARED_SECRET\`)."
|
|
||||||
echo "Phase 2 canary fleet has not been stood up yet —"
|
|
||||||
echo "see [canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)."
|
|
||||||
echo
|
|
||||||
echo "**Skipped — promote-to-latest will NOT auto-fire.** Dispatch \`promote-latest.yml\` manually when ready."
|
|
||||||
} >> "$GITHUB_STEP_SUMMARY"
|
|
||||||
echo "ran=false" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "::notice::staging-verify: skipped — no canary fleet configured"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
bash scripts/staging-smoke.sh
|
|
||||||
echo "ran=true" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Summary on failure
|
|
||||||
if: ${{ failure() }}
|
|
||||||
run: |
|
|
||||||
{
|
|
||||||
echo "## Canary smoke FAILED"
|
|
||||||
echo
|
|
||||||
echo "Canary tenants rejected image \`staging-${{ steps.compute.outputs.sha }}\`."
|
|
||||||
echo ":latest stays pinned to the prior good digest — prod is untouched."
|
|
||||||
echo
|
|
||||||
echo "Fix forward and merge again, or investigate the specific failed"
|
|
||||||
echo "assertions in the staging-smoke step log above."
|
|
||||||
} >> "$GITHUB_STEP_SUMMARY"
|
|
||||||
|
|
||||||
promote-to-latest:
|
|
||||||
# On green, calls the CP redeploy-fleet endpoint with target_tag=
|
|
||||||
# staging-<sha> to promote the verified ECR image. This is the same
|
|
||||||
# mechanism as redeploy-tenants-on-main.yml — no GHCR crane ops.
|
|
||||||
#
|
|
||||||
# Pre-fix history: the old GHCR promote step used `crane tag` against
|
|
||||||
# ghcr.io/molecule-ai/platform-tenant, but publish-workspace-server-
|
|
||||||
# image.yml had already migrated to ECR on 2026-05-07 (commit
|
|
||||||
# 10e510f5). The GHCR tags were never updated, so this step was
|
|
||||||
# silently promoting a stale GHCR image while actual prod tenants
|
|
||||||
# pulled from ECR. Canary smoke tests were GHCR-targeted and could
|
|
||||||
# not catch a broken ECR build.
|
|
||||||
needs: staging-smoke
|
|
||||||
if: ${{ needs.staging-smoke.result == 'success' && needs.staging-smoke.outputs.smoke_ran == 'true' }}
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
env:
|
|
||||||
SHA: ${{ needs.staging-smoke.outputs.sha }}
|
|
||||||
CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
|
|
||||||
# CP_ADMIN_API_TOKEN gates write access to the redeploy endpoint.
|
|
||||||
# Stored at the repo level so all workflows pick it up automatically.
|
|
||||||
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
|
|
||||||
# canary_slug pin: deploy the verified :staging-<sha> to the canary
|
|
||||||
# first (soak 120s), then fan out to the rest of the fleet.
|
|
||||||
CANARY_SLUG: ${{ vars.CANARY_PROMOTE_SLUG || '' }}
|
|
||||||
SOAK_SECONDS: ${{ vars.CANARY_PROMOTE_SOAK || '120' }}
|
|
||||||
BATCH_SIZE: ${{ vars.CANARY_PROMOTE_BATCH || '3' }}
|
|
||||||
steps:
|
|
||||||
- name: Check CP credentials
|
|
||||||
run: |
|
|
||||||
if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
|
|
||||||
echo "::error::CP_ADMIN_API_TOKEN secret is not set — promote step cannot call redeploy-fleet."
|
|
||||||
echo "::error::Set it at: repo Settings → Actions → Variables and Secrets → New Secret."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Promote verified ECR image to :latest
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
TARGET_TAG="staging-${SHA}"
|
|
||||||
BODY=$(jq -nc \
|
|
||||||
--arg tag "$TARGET_TAG" \
|
|
||||||
--argjson soak "${SOAK_SECONDS:-120}" \
|
|
||||||
--argjson batch "${BATCH_SIZE:-3}" \
|
|
||||||
--argjson dry false \
|
|
||||||
'{
|
|
||||||
target_tag: $tag,
|
|
||||||
soak_seconds: $soak,
|
|
||||||
batch_size: $batch,
|
|
||||||
dry_run: $dry
|
|
||||||
}')
|
|
||||||
|
|
||||||
if [ -n "${CANARY_SLUG:-}" ]; then
|
|
||||||
BODY=$(jq '. * {canary_slug: $slug}' --arg slug "$CANARY_SLUG" <<<"$BODY")
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Calling: POST $CP_URL/cp/admin/tenants/redeploy-fleet"
|
|
||||||
echo " target_tag: $TARGET_TAG"
|
|
||||||
echo " body: $BODY"
|
|
||||||
|
|
||||||
HTTP_RESPONSE=$(mktemp)
|
|
||||||
HTTP_CODE_FILE=$(mktemp)
|
|
||||||
set +e
|
|
||||||
curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
|
|
||||||
-m 1200 \
|
|
||||||
-H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
|
|
||||||
-d "$BODY" >"$HTTP_CODE_FILE"
|
|
||||||
CURL_EXIT=$?
|
|
||||||
set -e
|
|
||||||
|
|
||||||
HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
|
|
||||||
[ -z "$HTTP_CODE" ] && HTTP_CODE="000"
|
|
||||||
|
|
||||||
echo "HTTP $HTTP_CODE (curl exit $CURL_EXIT)"
|
|
||||||
cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
|
|
||||||
|
|
||||||
if [ "$HTTP_CODE" -ge 400 ]; then
|
|
||||||
echo "::error::CP redeploy-fleet returned HTTP $HTTP_CODE — refusing to proceed."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Summary
|
|
||||||
run: |
|
|
||||||
{
|
|
||||||
echo "## Staging verified — :latest promoted via CP redeploy-fleet"
|
|
||||||
echo ""
|
|
||||||
echo "- **Target tag:** \`staging-${{ needs.staging-smoke.outputs.sha }}\`"
|
|
||||||
echo "- **Registry:** ECR (\`${TENANT_IMAGE_NAME}\`)"
|
|
||||||
echo "- **Canary slug:** \`${CANARY_SLUG:-<none>}\` (soak ${SOAK_SECONDS}s)"
|
|
||||||
echo "- **Batch size:** ${BATCH_SIZE:-3}"
|
|
||||||
echo ""
|
|
||||||
echo "CP redeploy-fleet is rolling out the verified image across the prod fleet."
|
|
||||||
echo "The fleet's 5-minute health-check loop will pick up the update automatically."
|
|
||||||
} >> "$GITHUB_STEP_SUMMARY"
|
|
||||||
@ -1,121 +0,0 @@
|
|||||||
# status-reaper — Option B (compensating-status POST) for Gitea 1.22.6's
|
|
||||||
# hardcoded `(push)` suffix on default-branch commit statuses.
|
|
||||||
#
|
|
||||||
# Tracking: molecule-core#? (this PR), internal#327 (sibling publish-runtime-bot),
|
|
||||||
# internal#328 (sibling mc-drift-bot), internal#80 (upstream RFC). Sister
|
|
||||||
# bots already deployed under the same per-persona-identity contract
|
|
||||||
# (`feedback_per_agent_gitea_identity_default`).
|
|
||||||
#
|
|
||||||
# Root cause:
|
|
||||||
# Gitea 1.22.6 emits commit-status context as
|
|
||||||
# `<workflow_name> / <job_name> (push)`
|
|
||||||
# for ANY workflow run on the default branch's HEAD commit, REGARDLESS
|
|
||||||
# of the trigger event. Schedule- and workflow_dispatch-triggered runs
|
|
||||||
# on `main` therefore appear as `(push)` failures on the latest main
|
|
||||||
# commit, painting main red via a fake-push status. Verified on runs
|
|
||||||
# 14525 + 14526 via Phase 1 evidence (3 sub-agents). No upstream fix
|
|
||||||
# in 1.23-1.26.1 (sibling a6f20db1 research).
|
|
||||||
#
|
|
||||||
# Why a cron-driven reaper, not workflow_run:
|
|
||||||
# Gitea 1.22.6 does NOT support `on: workflow_run` (verified via
|
|
||||||
# modules/actions/workflows.go enumeration; sister a6f20db1). The
|
|
||||||
# only event-shaped option that fires is cron. 5min is chosen to
|
|
||||||
# sit BETWEEN ci-required-drift (`:17` hourly) and main-red-watchdog
|
|
||||||
# (`:05` hourly) so the reaper sweeps red before the watchdog files
|
|
||||||
# a `[main-red]` issue (would-be false-positive).
|
|
||||||
#
|
|
||||||
# What the reaper does each tick:
|
|
||||||
# 1. Parse `.gitea/workflows/*.yml`, classify each by whether `on:`
|
|
||||||
# contains a `push:` trigger (see script for workflow_id resolution
|
|
||||||
# including `name:` collision and `/`-in-name fail-loud lints).
|
|
||||||
# 2. GET combined status for main HEAD.
|
|
||||||
# 3. For each `failure` status whose context ends ` (push)`:
|
|
||||||
# - if workflow has push trigger: PRESERVE (real defect signal).
|
|
||||||
# - if workflow has no push trigger: POST a compensating
|
|
||||||
# `state=success` with the same context and a description that
|
|
||||||
# documents the workaround.
|
|
||||||
#
|
|
||||||
# What it does NOT do:
|
|
||||||
# - Mutate non-`(push)`-suffix statuses (e.g. `(pull_request)` from
|
|
||||||
# branch_protections required-checks — verified safe 2026-05-11).
|
|
||||||
# - Auto-revert. Same reasoning as main-red-watchdog.
|
|
||||||
# - Cancel runs. The runs themselves stay visible in Actions UI; the
|
|
||||||
# fix is at the commit-status surface only.
|
|
||||||
#
|
|
||||||
# Removal path: drop this workflow when Gitea ≥ 1.24 ships with a
|
|
||||||
# real fix for the hardcoded-suffix bug. Audit issue (filed post-merge)
|
|
||||||
# tracks the deletion as a follow-up sweep.
|
|
||||||
|
|
||||||
name: status-reaper
|
|
||||||
|
|
||||||
# IMPORTANT — Gitea 1.22.6 parser quirk per
|
|
||||||
# `feedback_gitea_workflow_dispatch_inputs_unsupported`: do NOT add an
|
|
||||||
# `inputs:` block here. Gitea 1.22.6 rejects the whole workflow as
|
|
||||||
# "unknown on type" when `workflow_dispatch.inputs.X` is present.
|
|
||||||
on:
|
|
||||||
# SCHEDULE RE-ENABLED 2026-05-12 rev3 — interim disable (mc#645) reverted now that
|
|
||||||
# rev3 widens DEFAULT_SWEEP_LIMIT 10 → 30 (covers retroactive-failure timing window).
|
|
||||||
# Sibling watchdog re-enabled in the same PR with timeout-minutes raised 5 → 15.
|
|
||||||
schedule:
|
|
||||||
# Every 5 minutes. Off-zero alignment with sibling cron workflows:
|
|
||||||
# ci-required-drift (`:17`), main-red-watchdog (`:05`),
|
|
||||||
# railway-pin-audit (`:23`). 5-min cadence gives a tight enough
|
|
||||||
# close on schedule-triggered false-reds that main-red-watchdog
|
|
||||||
# (hourly :05) almost never files an issue on the false case.
|
|
||||||
# rev3 keeps `*/5` unchanged per hongming-pc2 03:25Z review:
|
|
||||||
# "trades window-width-cheap for cadence-loady" — N=30 widens
|
|
||||||
# the lookback cheaply without doubling runner load via `*/2`.
|
|
||||||
- cron: '*/5 * * * *'
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
# Compensating-status POST needs write on repo statuses; no other
|
|
||||||
# write surface is touched. checkout still needs `contents: read`.
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
# NOTE: NO `concurrency:` block is intentional.
|
|
||||||
# Gitea 1.22.6 doesn't honor `cancel-in-progress: false`: queued ticks
|
|
||||||
# of the same group get cancelled-with-started=0 instead of waiting
|
|
||||||
# (DB-verified 2026-05-12, runs 16053/16085 of status-reaper.yml).
|
|
||||||
# The reaper's POST /statuses/{sha} is idempotent — Gitea de-dups by
|
|
||||||
# context — so concurrent ticks are safe; accept them rather than
|
|
||||||
# serialise via the broken mechanism.
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
reap:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
timeout-minutes: 3
|
|
||||||
steps:
|
|
||||||
- name: Check out repo at default-branch HEAD
|
|
||||||
# BASE checkout per `feedback_pull_request_target_workflow_from_base`.
|
|
||||||
# The script reads .gitea/workflows/*.yml from the working tree to
|
|
||||||
# classify trigger sets; we must read main's CURRENT state, not
|
|
||||||
# the SHA a stale schedule fired against.
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
ref: ${{ github.event.repository.default_branch }}
|
|
||||||
|
|
||||||
- name: Set up Python (PyYAML for workflow `on:` parse)
|
|
||||||
# Pinned to 3.12 to match sibling watchdog / ci-required-drift.
|
|
||||||
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
||||||
with:
|
|
||||||
python-version: '3.12'
|
|
||||||
|
|
||||||
- name: Install PyYAML
|
|
||||||
# PyYAML is needed because shell-grep on `on:` misses list/string
|
|
||||||
# forms and nested `push: { paths: ... }`. Same install pattern
|
|
||||||
# as ci-required-drift.yml (sub-2s install, no wheel cache).
|
|
||||||
run: python -m pip install --quiet 'PyYAML==6.0.2'
|
|
||||||
|
|
||||||
- name: Compensate operational push-suffix failures on main
|
|
||||||
env:
|
|
||||||
# claude-status-reaper persona token; provisioned by sibling
|
|
||||||
# aefaac1b 2026-05-11. Owns write:repository scope to POST
|
|
||||||
# /statuses/{sha} but NOTHING ELSE
|
|
||||||
# (`feedback_per_agent_gitea_identity_default`).
|
|
||||||
GITEA_TOKEN: ${{ secrets.STATUS_REAPER_TOKEN }}
|
|
||||||
GITEA_HOST: git.moleculesai.app
|
|
||||||
REPO: ${{ github.repository }}
|
|
||||||
WATCH_BRANCH: ${{ github.event.repository.default_branch }}
|
|
||||||
WORKFLOWS_DIR: .gitea/workflows
|
|
||||||
run: python3 .gitea/scripts/status-reaper.py
|
|
||||||
@ -1,129 +0,0 @@
|
|||||||
name: Sweep stale AWS Secrets Manager secrets
|
|
||||||
|
|
||||||
# Ported from .github/workflows/sweep-aws-secrets.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
|
|
||||||
# Janitor for per-tenant AWS Secrets Manager secrets
|
|
||||||
# (`molecule/tenant/<org_id>/bootstrap`) whose backing tenant no
|
|
||||||
# longer exists. Parallel-shape to sweep-cf-tunnels.yml and
|
|
||||||
# sweep-cf-orphans.yml — different cloud, same justification.
|
|
||||||
#
|
|
||||||
# Why this exists separately from a long-term reconciler integration:
|
|
||||||
# - molecule-controlplane's tenant_resources audit table (mig 024)
|
|
||||||
# currently tracks four resource kinds: CloudflareTunnel,
|
|
||||||
# CloudflareDNS, EC2Instance, SecurityGroup. SecretsManager is
|
|
||||||
# not in the list, so the existing reconciler doesn't catch
|
|
||||||
# orphan secrets.
|
|
||||||
# - At ~$0.40/secret/month the cost grew to ~$19/month before this
|
|
||||||
# sweeper was written, indicating ~45+ orphan secrets from
|
|
||||||
# crashed provisions and incomplete deprovision flows.
|
|
||||||
# - The proper fix (KindSecretsManagerSecret + recorder hook +
|
|
||||||
# reconciler enumerator) is filed as a separate controlplane
|
|
||||||
# issue. This sweeper is the immediate cost-relief stopgap.
|
|
||||||
#
|
|
||||||
# AWS credentials: the confirmed Gitea secrets are AWS_ACCESS_KEY_ID /
|
|
||||||
# AWS_SECRET_ACCESS_KEY (the molecule-cp IAM user). These are the same
|
|
||||||
# credentials used by the rest of the platform. The dedicated
|
|
||||||
# AWS_JANITOR_* naming (which the original GitHub workflow used) was
|
|
||||||
# never populated in Gitea — the existing secrets are AWS_ACCESS_KEY_ID /
|
|
||||||
# AWS_SECRET_ACCESS_KEY (per issue #425 §425 audit). These DO have
|
|
||||||
# secretsmanager:ListSecrets (the production molecule-cp principal);
|
|
||||||
# if ListSecrets is revoked in future, a dedicated janitor principal
|
|
||||||
# would need to be created and the Gitea secret names updated here.
|
|
||||||
#
|
|
||||||
# Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring
|
|
||||||
# sweep-cf-orphans.yml — tenant secrets are durable by design, unlike
|
|
||||||
# the mostly-orphan tunnels) refuses to nuke past the threshold.
|
|
||||||
|
|
||||||
on:
|
|
||||||
schedule:
|
|
||||||
# Hourly at :30 — offsets from sweep-cf-orphans (:15) and
|
|
||||||
# sweep-cf-tunnels (:45) so the three janitors don't burst the
|
|
||||||
# CP admin endpoints at the same minute.
|
|
||||||
- cron: '30 * * * *'
|
|
||||||
# Don't let two sweeps race the same AWS account.
|
|
||||||
concurrency:
|
|
||||||
group: sweep-aws-secrets
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
sweep:
|
|
||||||
name: Sweep AWS Secrets Manager
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
# 30 min cap, mirroring the other janitors. AWS DeleteSecret is
|
|
||||||
# fast (~0.3s/call) so even a 100+ backlog drains in seconds
|
|
||||||
# under the 8-way xargs parallelism, but the cap is set generously
|
|
||||||
# to leave headroom for any actual API hang.
|
|
||||||
timeout-minutes: 30
|
|
||||||
env:
|
|
||||||
AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }}
|
|
||||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
|
||||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
|
||||||
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
|
|
||||||
CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
|
|
||||||
GRACE_HOURS: ${{ github.event.inputs.grace_hours || '24' }}
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Verify required secrets present
|
|
||||||
id: verify
|
|
||||||
# Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
|
|
||||||
# and sweep-cf-tunnels (hardened 2026-04-28). Same principle:
|
|
||||||
# - schedule → exit 1 on missing secrets (red CI surfaces it)
|
|
||||||
# - workflow_dispatch → exit 0 with warning (operator-driven,
|
|
||||||
# they already accepted the repo state)
|
|
||||||
run: |
|
|
||||||
missing=()
|
|
||||||
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY CP_ADMIN_API_TOKEN CP_STAGING_ADMIN_API_TOKEN; do
|
|
||||||
if [ -z "${!var:-}" ]; then
|
|
||||||
missing+=("$var")
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
if [ ${#missing[@]} -gt 0 ]; then
|
|
||||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
|
||||||
echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
|
|
||||||
echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
|
|
||||||
echo "skip=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
|
|
||||||
echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "All required secrets present ✓"
|
|
||||||
echo "skip=false" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Run sweep
|
|
||||||
if: steps.verify.outputs.skip != 'true'
|
|
||||||
# Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-tunnels:
|
|
||||||
# - Scheduled: input empty → "false" → --execute (the whole
|
|
||||||
# point of an hourly janitor).
|
|
||||||
# - Manual workflow_dispatch: input default true → dry-run;
|
|
||||||
# operator must flip it to actually delete.
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
|
|
||||||
echo "Running in dry-run mode — no deletions"
|
|
||||||
bash scripts/ops/sweep-aws-secrets.sh
|
|
||||||
else
|
|
||||||
echo "Running with --execute — will delete identified orphans"
|
|
||||||
bash scripts/ops/sweep-aws-secrets.sh --execute
|
|
||||||
fi
|
|
||||||
@ -1,156 +0,0 @@
|
|||||||
name: Sweep stale Cloudflare DNS records
|
|
||||||
|
|
||||||
# Ported from .github/workflows/sweep-cf-orphans.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
|
|
||||||
# Janitor for Cloudflare DNS records whose backing tenant/workspace no
|
|
||||||
# longer exists. Without this loop, every short-lived E2E or canary
|
|
||||||
# leaves a CF record on the moleculesai.app zone — the zone has a
|
|
||||||
# 200-record quota (controlplane#239 hit it 2026-04-23+) and provisions
|
|
||||||
# start failing with code 81045 once exhausted.
|
|
||||||
#
|
|
||||||
# Why a separate workflow vs sweep-stale-e2e-orgs.yml:
|
|
||||||
# - That workflow operates at the CP layer (DELETE /cp/admin/tenants/:slug
|
|
||||||
# drives the cascade). It assumes CP has the org row to drive the
|
|
||||||
# deprovision from. It doesn't catch records left behind when CP
|
|
||||||
# itself never knew about the tenant (canary scratch, manual ops
|
|
||||||
# experiments) or when the cascade's CF-delete branch failed.
|
|
||||||
# - sweep-cf-orphans.sh enumerates the CF zone directly and matches
|
|
||||||
# each record against live CP slugs + AWS EC2 names. It catches
|
|
||||||
# leaks the CP-driven sweep can't.
|
|
||||||
#
|
|
||||||
# Safety: the script's own MAX_DELETE_PCT gate refuses to nuke more
|
|
||||||
# than 50% of records in a single run. If something has gone weird
|
|
||||||
# (CP admin endpoint returns no orgs → every tenant looks orphan) the
|
|
||||||
# gate halts before damage. Decision-function unit tests in
|
|
||||||
# scripts/ops/test_sweep_cf_decide.py (#2027) cover the rule
|
|
||||||
# classifier.
|
|
||||||
#
|
|
||||||
# Secrets: CF_API_TOKEN, CF_ZONE_ID, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
|
|
||||||
# are confirmed existing per issue #425 §425 audit. CP_ADMIN_API_TOKEN and
|
|
||||||
# CP_STAGING_ADMIN_API_TOKEN are unconfirmed — if missing, the verify step
|
|
||||||
# (schedule → hard-fail, dispatch → soft-skip) surfaces it clearly.
|
|
||||||
|
|
||||||
on:
|
|
||||||
schedule:
|
|
||||||
# Hourly. Mirrors sweep-stale-e2e-orgs cadence so the two janitors
|
|
||||||
# converge on the same tick. CF API rate budget is generous (1200
|
|
||||||
# req/5min); a single sweep makes ~1 list + N deletes (N<=quota/2).
|
|
||||||
- cron: '15 * * * *' # offset from sweep-stale-e2e-orgs (top of hour)
|
|
||||||
# No `merge_group:` trigger on purpose. This is a janitor — it doesn't
|
|
||||||
# need to gate merges, and including it as written before #2088 fired
|
|
||||||
# the full sweep job (or its secret-check) on every PR going through
|
|
||||||
# the merge queue, generating one red CI run per merge-queue eval. If
|
|
||||||
# this workflow is ever wired up as a required check, re-add
|
|
||||||
# merge_group: { types: [checks_requested] }
|
|
||||||
# AND gate the sweep step with `if: github.event_name != 'merge_group'`
|
|
||||||
# so merge-queue evals report success without actually running.
|
|
||||||
|
|
||||||
# Don't let two sweeps race the same zone. workflow_dispatch during a
|
|
||||||
# scheduled run would otherwise issue duplicate DELETE calls.
|
|
||||||
concurrency:
|
|
||||||
group: sweep-cf-orphans
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
sweep:
|
|
||||||
name: Sweep CF orphans
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
# 3 min surfaces hangs (CF API stall, AWS describe-instances stuck)
|
|
||||||
# within one cron interval instead of burning a full tick. Realistic
|
|
||||||
# worst case is ~2 min: 4 sequential curls + 1 aws + N×CF-DELETE
|
|
||||||
# each individually capped at 10s by the script's curl -m flag.
|
|
||||||
timeout-minutes: 3
|
|
||||||
env:
|
|
||||||
CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
|
|
||||||
CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }}
|
|
||||||
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
|
|
||||||
CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
|
||||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
|
||||||
AWS_DEFAULT_REGION: us-east-2
|
|
||||||
MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Verify required secrets present
|
|
||||||
id: verify
|
|
||||||
# Schedule-vs-dispatch behaviour split (hardened 2026-04-28
|
|
||||||
# after the silent-no-op incident below):
|
|
||||||
#
|
|
||||||
# The earlier soft-skip-on-schedule policy hid a real leak. All
|
|
||||||
# six secrets were unset on this repo for an unknown duration;
|
|
||||||
# every hourly run printed a yellow ::warning:: and exited 0,
|
|
||||||
# so the workflow registered as "passing" while doing nothing.
|
|
||||||
# CF orphans accumulated to 152/200 (~76% of the zone quota
|
|
||||||
# gone) before a manual `dig`-driven audit caught it. Anything
|
|
||||||
# that runs as a janitor and reports green while idle is
|
|
||||||
# indistinguishable from "the janitor is healthy" — so we now
|
|
||||||
# treat schedule (and any future workflow_run/push triggers)
|
|
||||||
# as a hard-fail when secrets are missing.
|
|
||||||
#
|
|
||||||
# - schedule / workflow_run / push → exit 1 (red CI run
|
|
||||||
# surfaces the misconfiguration the next tick)
|
|
||||||
# - workflow_dispatch → exit 0 with a warning
|
|
||||||
# (an operator ran this ad-hoc; they already accepted the
|
|
||||||
# state of the repo and want the workflow to short-circuit
|
|
||||||
# so they can rerun after fixing the secret)
|
|
||||||
run: |
|
|
||||||
missing=()
|
|
||||||
for var in CF_API_TOKEN CF_ZONE_ID CP_ADMIN_API_TOKEN CP_STAGING_ADMIN_API_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
|
|
||||||
if [ -z "${!var:-}" ]; then
|
|
||||||
missing+=("$var")
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
if [ ${#missing[@]} -gt 0 ]; then
|
|
||||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
|
||||||
echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
|
|
||||||
echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
|
|
||||||
echo "skip=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
|
|
||||||
echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
|
|
||||||
echo "::error::a silent skip masked an active CF DNS leak (152/200 zone records) caught only by a manual audit on 2026-04-28; this gate exists to make the gap visible."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "All required secrets present ✓"
|
|
||||||
echo "skip=false" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Run sweep
|
|
||||||
if: steps.verify.outputs.skip != 'true'
|
|
||||||
# Schedule-vs-dispatch dry-run asymmetry (intentional):
|
|
||||||
# - Scheduled runs: github.event.inputs.dry_run is empty →
|
|
||||||
# defaults to "false" below → script runs with --execute
|
|
||||||
# (the whole point of an hourly janitor).
|
|
||||||
# - Manual workflow_dispatch: input default is true (line 38)
|
|
||||||
# so an ad-hoc operator-triggered run is dry-run by default;
|
|
||||||
# they have to flip the toggle to actually delete.
|
|
||||||
# The script's MAX_DELETE_PCT gate (default 50%) is the second
|
|
||||||
# line of defense regardless of mode.
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
|
|
||||||
echo "Running in dry-run mode — no deletions"
|
|
||||||
bash scripts/ops/sweep-cf-orphans.sh
|
|
||||||
else
|
|
||||||
echo "Running with --execute — will delete identified orphans"
|
|
||||||
bash scripts/ops/sweep-cf-orphans.sh --execute
|
|
||||||
fi
|
|
||||||
@ -1,133 +0,0 @@
|
|||||||
name: Sweep stale Cloudflare Tunnels
|
|
||||||
|
|
||||||
# Ported from .github/workflows/sweep-cf-tunnels.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
|
|
||||||
# Janitor for Cloudflare Tunnels whose backing tenant no longer
|
|
||||||
# exists. Parallel-shape to sweep-cf-orphans.yml (which sweeps DNS
|
|
||||||
# records); same justification, different CF resource.
|
|
||||||
#
|
|
||||||
# Why this exists separately from sweep-cf-orphans:
|
|
||||||
# - DNS records live on the zone (`/zones/<id>/dns_records`).
|
|
||||||
# - Tunnels live on the account (`/accounts/<id>/cfd_tunnel`).
|
|
||||||
# - Different CF API surface, different scopes; the existing CF
|
|
||||||
# token might not have `account:cloudflare_tunnel:edit`. Splitting
|
|
||||||
# the workflows keeps each one's secret-presence gate independent
|
|
||||||
# so neither silent-skips when the other's secret is missing.
|
|
||||||
# - Cleaner blast radius — operators can disable one without the
|
|
||||||
# other if a regression surfaces.
|
|
||||||
#
|
|
||||||
# Safety: the script's MAX_DELETE_PCT gate (default 90% — higher than
|
|
||||||
# the DNS sweep's 50% because tenant-shaped tunnels are mostly
|
|
||||||
# orphans by design) refuses to nuke past the threshold.
|
|
||||||
#
|
|
||||||
# Secrets: CF_API_TOKEN, CF_ACCOUNT_ID are confirmed existing per
|
|
||||||
# issue #425 §425 audit. CP_ADMIN_API_TOKEN and CP_STAGING_ADMIN_API_TOKEN
|
|
||||||
# are unconfirmed — if missing, the verify step (schedule → hard-fail,
|
|
||||||
# dispatch → soft-skip) surfaces it clearly.
|
|
||||||
|
|
||||||
on:
|
|
||||||
schedule:
|
|
||||||
# Hourly at :45 — offset from sweep-cf-orphans (:15) so the two
|
|
||||||
# janitors don't issue parallel CF API bursts at the same minute.
|
|
||||||
- cron: '45 * * * *'
|
|
||||||
# Don't let two sweeps race the same account.
|
|
||||||
concurrency:
|
|
||||||
group: sweep-cf-tunnels
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
sweep:
|
|
||||||
name: Sweep CF tunnels
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
# 30 min cap. Was 5 min on the theory that the only thing that
|
|
||||||
# could take >5min is a CF-API hang — but on 2026-05-02 a backlog
|
|
||||||
# of 672 stale tunnels accumulated (large staging E2E run + delayed
|
|
||||||
# sweep) and the serial `curl -X DELETE` loop (~0.7s/tunnel) needed
|
|
||||||
# ~7-8min to drain. The 5-min cap killed the run mid-sweep
|
|
||||||
# (cancelled at 424/672, see run 25248788312); a manual rerun
|
|
||||||
# finished the remainder fine.
|
|
||||||
#
|
|
||||||
# The fix is two-part: parallelize the delete loop (8-way xargs in
|
|
||||||
# the script — see scripts/ops/sweep-cf-tunnels.sh), AND raise the
|
|
||||||
# cap so a one-off backlog doesn't trip a hangs-detector that
|
|
||||||
# turned out to be a real-job-too-slow detector. With 8-way
|
|
||||||
# parallelism, 600+ tunnels drains in ~60s; 30 min is generous
|
|
||||||
# headroom for actual hangs to still surface (and is in line with
|
|
||||||
# the sweep-cf-orphans companion job).
|
|
||||||
timeout-minutes: 30
|
|
||||||
env:
|
|
||||||
CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
|
|
||||||
CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
|
|
||||||
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
|
|
||||||
CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '90' }}
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
|
|
||||||
- name: Verify required secrets present
|
|
||||||
id: verify
|
|
||||||
# Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
|
|
||||||
# (hardened 2026-04-28 after the silent-no-op incident: the
|
|
||||||
# janitor reported green while doing nothing because secrets
|
|
||||||
# were unset, masking a 152/200 zone-record leak). Same
|
|
||||||
# principle applies here:
|
|
||||||
# - schedule → exit 1 on missing secrets (red CI surfaces it)
|
|
||||||
# - workflow_dispatch → exit 0 with warning (operator-driven,
|
|
||||||
# they already accepted the repo state)
|
|
||||||
run: |
|
|
||||||
missing=()
|
|
||||||
for var in CF_API_TOKEN CF_ACCOUNT_ID CP_ADMIN_API_TOKEN CP_STAGING_ADMIN_API_TOKEN; do
|
|
||||||
if [ -z "${!var:-}" ]; then
|
|
||||||
missing+=("$var")
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
if [ ${#missing[@]} -gt 0 ]; then
|
|
||||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
|
||||||
echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
|
|
||||||
echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
|
|
||||||
echo "::warning::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope (separate from the zone:dns:edit scope used by sweep-cf-orphans)."
|
|
||||||
echo "skip=true" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
|
|
||||||
echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
|
|
||||||
echo "::error::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "All required secrets present ✓"
|
|
||||||
echo "skip=false" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Run sweep
|
|
||||||
if: steps.verify.outputs.skip != 'true'
|
|
||||||
# Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-orphans:
|
|
||||||
# - Scheduled: input empty → "false" → --execute (the whole
|
|
||||||
# point of an hourly janitor).
|
|
||||||
# - Manual workflow_dispatch: input default true → dry-run;
|
|
||||||
# operator must flip it to actually delete.
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
|
|
||||||
echo "Running in dry-run mode — no deletions"
|
|
||||||
bash scripts/ops/sweep-cf-tunnels.sh
|
|
||||||
else
|
|
||||||
echo "Running with --execute — will delete identified orphans"
|
|
||||||
bash scripts/ops/sweep-cf-tunnels.sh --execute
|
|
||||||
fi
|
|
||||||
@ -1,267 +0,0 @@
|
|||||||
name: Sweep stale e2e-* orgs (staging)
|
|
||||||
|
|
||||||
# Ported from .github/workflows/sweep-stale-e2e-orgs.yml on 2026-05-11 per RFC
|
|
||||||
# internal#219 §1 sweep. Differences from the GitHub version:
|
|
||||||
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
|
|
||||||
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
|
|
||||||
# - Dropped `merge_group:` (no Gitea merge queue).
|
|
||||||
# - Dropped `environment:` blocks (Gitea has no environments).
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
|
||||||
# feedback_act_runner_github_server_url.
|
|
||||||
# - `continue-on-error: true` on each job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
|
|
||||||
# Janitor for staging tenants left behind when E2E cleanup didn't run:
|
|
||||||
# CI cancellations, runner crashes, transient AWS errors mid-cascade,
|
|
||||||
# bash trap missed (signal 9), etc. Without this loop, every failed
|
|
||||||
# teardown leaks an EC2 + DNS + DB row until manual ops cleanup —
|
|
||||||
# 2026-04-23 staging hit the 64 vCPU AWS quota from ~27 such orphans.
|
|
||||||
#
|
|
||||||
# Why not rely on per-test-run teardown:
|
|
||||||
# - Per-run teardown is best-effort by definition. Any process death
|
|
||||||
# after the test starts but before the trap fires leaves debris.
|
|
||||||
# - GH Actions cancellation kills the runner without grace period.
|
|
||||||
# The workflow's `if: always()` step usually catches this, but it
|
|
||||||
# too can fail (CP transient 5xx, runner network issue at the
|
|
||||||
# wrong moment).
|
|
||||||
# - Even when teardown runs, the CP cascade is best-effort in places
|
|
||||||
# (cascadeTerminateWorkspaces logs+continues; DNS deletion same).
|
|
||||||
# - This sweep is the catch-all that converges staging back to clean
|
|
||||||
# regardless of which specific path leaked.
|
|
||||||
#
|
|
||||||
# The PROPER fix is making CP cleanup transactional + verify-after-
|
|
||||||
# terminate (filed separately as cleanup-correctness work). This
|
|
||||||
# workflow is the safety net that catches everything else AND any
|
|
||||||
# future leak source we haven't yet identified.
|
|
||||||
|
|
||||||
on:
|
|
||||||
schedule:
|
|
||||||
# Every 15 min. E2E orgs are short-lived (~8-25 min wall clock from
|
|
||||||
# create to teardown — canary is ~8 min, full SaaS ~25 min). The
|
|
||||||
# previous hourly + 120-min stale threshold meant a leaked tenant
|
|
||||||
# could keep an EC2 alive for up to 2 hours, eating ~2 vCPU per
|
|
||||||
# leak. Tightening the cadence + threshold reduces the worst-case
|
|
||||||
# leak window from 120 min to ~45 min (15-min sweep cadence + 30-min
|
|
||||||
# threshold) without risk of catching in-progress runs (the longest
|
|
||||||
# e2e run is the 25-min canary, well under the 30-min threshold).
|
|
||||||
# See molecule-controlplane#420 for the leak-class accounting that
|
|
||||||
# motivated this tightening.
|
|
||||||
- cron: '*/15 * * * *'
|
|
||||||
# Don't let two sweeps fight. Cron + workflow_dispatch could overlap
|
|
||||||
# on a manual trigger; queue rather than parallel-delete.
|
|
||||||
concurrency:
|
|
||||||
group: sweep-stale-e2e-orgs
|
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
sweep:
|
|
||||||
name: Sweep e2e orgs
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed
|
|
||||||
# 2026-05-11. The "surface broken workflows without blocking"
|
|
||||||
# rationale was correctly applied to advisory/lint workflows but
|
|
||||||
# wrong for this janitor — silent failure here masks real-money
|
|
||||||
# tenant leaks. Hongming observed 15 leaked EC2 in molecule-canary
|
|
||||||
# (004947743811) us-east-2 at 11:05Z 2026-05-11 because the sweep
|
|
||||||
# had been exiting 2 every tick and the failure was swallowed.
|
|
||||||
# See `feedback_strict_root_only_after_class_a` — critical janitors
|
|
||||||
# must fail loud. A follow-up `notify-failure` step below also
|
|
||||||
# surfaces breakage to ops even if branch-protection wiring is
|
|
||||||
# adjusted to keep this off the required-checks list.
|
|
||||||
timeout-minutes: 15
|
|
||||||
env:
|
|
||||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
|
||||||
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
|
||||||
MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '30' }}
|
|
||||||
DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
|
|
||||||
# Refuse to delete more than this many orgs in one tick. If the
|
|
||||||
# CP DB is briefly empty (or the admin endpoint goes weird and
|
|
||||||
# returns no created_at), every e2e- org would look stale.
|
|
||||||
# Bailing protects against runaway nukes.
|
|
||||||
SAFETY_CAP: 50
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Verify admin token present
|
|
||||||
run: |
|
|
||||||
if [ -z "$ADMIN_TOKEN" ]; then
|
|
||||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
|
|
||||||
exit 2
|
|
||||||
fi
|
|
||||||
echo "Admin token present ✓"
|
|
||||||
|
|
||||||
- name: Identify stale e2e orgs
|
|
||||||
id: identify
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
# Fetch into a file so the python step reads it via stdin —
|
|
||||||
# cleaner than embedding $(curl ...) into a heredoc.
|
|
||||||
curl -sS --fail-with-body --max-time 30 \
|
|
||||||
"$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \
|
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
|
||||||
> orgs.json
|
|
||||||
|
|
||||||
# Filter:
|
|
||||||
# 1. slug starts with one of the ephemeral test prefixes:
|
|
||||||
# - 'e2e-' — covers e2e-smoke- (formerly e2e-canary-),
|
|
||||||
# e2e-canvas-*, etc.
|
|
||||||
# - 'rt-e2e-' — runtime-test harness fixtures (RFC #2251);
|
|
||||||
# missing this prefix left two such tenants
|
|
||||||
# orphaned 8h on staging (2026-05-03), then
|
|
||||||
# hard-failed redeploy-tenants-on-staging
|
|
||||||
# and broke the staging→main auto-promote
|
|
||||||
# chain. Kept in sync with the EPHEMERAL_PREFIX_RE
|
|
||||||
# regex in redeploy-tenants-on-staging.yml.
|
|
||||||
# 2. created_at is older than MAX_AGE_MINUTES ago
|
|
||||||
# Output one slug per line to a file the next step reads.
|
|
||||||
python3 > stale_slugs.txt <<'PY'
|
|
||||||
import json, os
|
|
||||||
from datetime import datetime, timezone, timedelta
|
|
||||||
# SSOT for this list lives in the controlplane Go code:
|
|
||||||
# molecule-controlplane/internal/slugs/ephemeral.go
|
|
||||||
# (var EphemeralPrefixes). The redeploy-fleet auto-rollout
|
|
||||||
# also reads from there to SKIP these slugs — without that
|
|
||||||
# filter, fleet redeploy SSM-failed in-flight E2E tenants
|
|
||||||
# whose containers were still booting, breaking the test
|
|
||||||
# that just spun them up (molecule-controlplane#493).
|
|
||||||
# Update both files together.
|
|
||||||
EPHEMERAL_PREFIXES = ("e2e-", "rt-e2e-")
|
|
||||||
with open("orgs.json") as f:
|
|
||||||
data = json.load(f)
|
|
||||||
max_age = int(os.environ["MAX_AGE_MINUTES"])
|
|
||||||
cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age)
|
|
||||||
for o in data.get("orgs", []):
|
|
||||||
slug = o.get("slug", "")
|
|
||||||
if not slug.startswith(EPHEMERAL_PREFIXES):
|
|
||||||
continue
|
|
||||||
created = o.get("created_at")
|
|
||||||
if not created:
|
|
||||||
# Defensively skip rows without created_at — better
|
|
||||||
# to leave one orphan than nuke a brand-new row
|
|
||||||
# whose timestamp didn't render.
|
|
||||||
continue
|
|
||||||
# Python 3.11+ handles RFC3339 with Z directly via
|
|
||||||
# fromisoformat; older runners need the trailing Z swap.
|
|
||||||
created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
|
|
||||||
if created_dt < cutoff:
|
|
||||||
print(slug)
|
|
||||||
PY
|
|
||||||
|
|
||||||
count=$(wc -l < stale_slugs.txt | tr -d ' ')
|
|
||||||
echo "Found $count stale e2e org(s) older than ${MAX_AGE_MINUTES}m"
|
|
||||||
if [ "$count" -gt 0 ]; then
|
|
||||||
echo "First 20:"
|
|
||||||
head -20 stale_slugs.txt | sed 's/^/ /'
|
|
||||||
fi
|
|
||||||
echo "count=$count" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Safety gate
|
|
||||||
if: steps.identify.outputs.count != '0'
|
|
||||||
run: |
|
|
||||||
count="${{ steps.identify.outputs.count }}"
|
|
||||||
if [ "$count" -gt "$SAFETY_CAP" ]; then
|
|
||||||
echo "::error::Refusing to delete $count orgs in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means the CP admin API returned no created_at or returned a degraded result. Re-run with workflow_dispatch + max_age_minutes if intentional."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Within safety cap ($count ≤ $SAFETY_CAP) ✓"
|
|
||||||
|
|
||||||
- name: Delete stale orgs
|
|
||||||
if: steps.identify.outputs.count != '0' && env.DRY_RUN != 'true'
|
|
||||||
run: |
|
|
||||||
set -uo pipefail
|
|
||||||
deleted=0
|
|
||||||
failed=0
|
|
||||||
while IFS= read -r slug; do
|
|
||||||
[ -z "$slug" ] && continue
|
|
||||||
# The DELETE handler requires {"confirm": "<slug>"} matching
|
|
||||||
# the URL slug — fat-finger guard. Idempotent: re-issuing
|
|
||||||
# picks up via org_purges.last_step.
|
|
||||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
|
||||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
|
||||||
set +e
|
|
||||||
curl -sS -o /tmp/del_resp -w "%{http_code}" \
|
|
||||||
--max-time 60 \
|
|
||||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d "{\"confirm\":\"$slug\"}" >/tmp/del_code
|
|
||||||
set -e
|
|
||||||
# Stderr from curl (-sS shows dial errors etc.) goes to runner log.
|
|
||||||
http_code=$(cat /tmp/del_code 2>/dev/null || echo "000")
|
|
||||||
if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
|
|
||||||
deleted=$((deleted+1))
|
|
||||||
echo " deleted: $slug"
|
|
||||||
else
|
|
||||||
failed=$((failed+1))
|
|
||||||
echo " FAILED ($http_code): $slug — $(cat /tmp/del_resp 2>/dev/null | head -c 200)"
|
|
||||||
fi
|
|
||||||
done < stale_slugs.txt
|
|
||||||
echo ""
|
|
||||||
echo "Sweep summary: deleted=$deleted failed=$failed"
|
|
||||||
# Don't fail the workflow on per-org delete errors — the
|
|
||||||
# sweeper is best-effort. Next hourly tick re-attempts. We
|
|
||||||
# only fail loud at the safety-cap gate above.
|
|
||||||
|
|
||||||
- name: Sweep orphan tunnels
|
|
||||||
# Stale-org cleanup deletes the org (which cascades to tunnel
|
|
||||||
# delete inside the CP). But when that cascade fails partway —
|
|
||||||
# CP transient 5xx after the org row is deleted but before the
|
|
||||||
# CF tunnel delete completes — the tunnel persists with no
|
|
||||||
# matching org row. The reconciler in internal/sweep flags this
|
|
||||||
# as `cf_tunnel kind=orphan`, but nothing automatically reaps it.
|
|
||||||
#
|
|
||||||
# `/cp/admin/orphan-tunnels/cleanup` is the operator-triggered
|
|
||||||
# reaper. Calling it here at the end of every sweep tick
|
|
||||||
# converges the staging CF account to clean even when CP
|
|
||||||
# cascades half-fail.
|
|
||||||
#
|
|
||||||
# PR #492 made the underlying DeleteTunnel actually check
|
|
||||||
# status — pre-fix it silent-succeeded on CF code 1022
|
|
||||||
# ("active connections"), so this step would have been a no-op
|
|
||||||
# against stuck connectors. Post-fix the cleanup invokes
|
|
||||||
# CleanupTunnelConnections + retry, which actually clears the
|
|
||||||
# 1022 case. (#2987)
|
|
||||||
#
|
|
||||||
# Best-effort. Failure here doesn't fail the workflow — next
|
|
||||||
# tick re-attempts. Errors flow to step output for ops review.
|
|
||||||
if: env.DRY_RUN != 'true'
|
|
||||||
run: |
|
|
||||||
set +e
|
|
||||||
curl -sS -o /tmp/cleanup_resp -w "%{http_code}" \
|
|
||||||
--max-time 60 \
|
|
||||||
-X POST "$MOLECULE_CP_URL/cp/admin/orphan-tunnels/cleanup" \
|
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" >/tmp/cleanup_code
|
|
||||||
set -e
|
|
||||||
http_code=$(cat /tmp/cleanup_code 2>/dev/null || echo "000")
|
|
||||||
body=$(cat /tmp/cleanup_resp 2>/dev/null | head -c 500)
|
|
||||||
if [ "$http_code" = "200" ]; then
|
|
||||||
count=$(echo "$body" | python3 -c "import sys,json; d=json.loads(sys.stdin.read() or '{}'); print(d.get('deleted_count', 0))" 2>/dev/null || echo "0")
|
|
||||||
failed_n=$(echo "$body" | python3 -c "import sys,json; d=json.loads(sys.stdin.read() or '{}'); print(len(d.get('failed') or {}))" 2>/dev/null || echo "0")
|
|
||||||
echo "Orphan-tunnel sweep: deleted=$count failed=$failed_n"
|
|
||||||
else
|
|
||||||
echo "::warning::orphan-tunnels cleanup returned HTTP $http_code — body: $body"
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Dry-run summary
|
|
||||||
if: env.DRY_RUN == 'true'
|
|
||||||
run: |
|
|
||||||
echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s) AND triggered orphan-tunnels cleanup. Re-run with dry_run=false to actually delete."
|
|
||||||
|
|
||||||
- name: Notify on sweep failure
|
|
||||||
# Fail-loud companion to dropping `continue-on-error: true`.
|
|
||||||
# If any prior step failed (missing token, CP 5xx, safety-cap
|
|
||||||
# tripped, etc.) emit a clearly-tagged ::error:: line so the
|
|
||||||
# Gitea runs UI + any log-tail consumer (Loki SOPRefireRule)
|
|
||||||
# flags this. Without this step, an early `exit 2` shows as a
|
|
||||||
# red run but the message can scroll past in busy log windows;
|
|
||||||
# the explicit tag here is greppable from the orchestrator
|
|
||||||
# triage loop.
|
|
||||||
if: failure()
|
|
||||||
run: |
|
|
||||||
echo "::error::sweep-stale-e2e-orgs FAILED — staging tenants are LEAKING. See prior step logs. Common causes: (a) CP_STAGING_ADMIN_API_TOKEN secret missing/rotated, (b) staging-api.moleculesai.app 5xx, (c) safety-cap tripped (CP admin API returning malformed orgs). Manual cleanup of leaked EC2 + DNS may be required while this is broken."
|
|
||||||
exit 1
|
|
||||||
@ -1,65 +0,0 @@
|
|||||||
name: Ops Scripts Tests
|
|
||||||
|
|
||||||
# Ported from .github/workflows/test-ops-scripts.yml on 2026-05-11 per
|
|
||||||
# RFC internal#219 §1 sweep.
|
|
||||||
#
|
|
||||||
# Differences from the GitHub version:
|
|
||||||
# - Dropped `merge_group:` trigger (no Gitea merge queue).
|
|
||||||
# - on.paths references .gitea/workflows/test-ops-scripts.yml (this
|
|
||||||
# file) instead of the .github/ one.
|
|
||||||
# - Workflow-level env.GITHUB_SERVER_URL set.
|
|
||||||
# - `continue-on-error: true` on the job (RFC §1 contract).
|
|
||||||
#
|
|
||||||
# Runs the unittest suite for scripts/ on every PR + push that touches
|
|
||||||
# anything under scripts/. Kept separate from the main CI so a script-only
|
|
||||||
# change doesn't trigger the heavier Go/Canvas/Python pipelines.
|
|
||||||
#
|
|
||||||
# Discovery layout: tests sit alongside the code they test (see
|
|
||||||
# scripts/ops/test_sweep_cf_decide.py for the pattern; scripts/
|
|
||||||
# test_build_runtime_package.py for the rewriter coverage). The job
|
|
||||||
# below runs `unittest discover` TWICE — once from `scripts/`, once
|
|
||||||
# from `scripts/ops/` — because neither dir has an `__init__.py`, so
|
|
||||||
# a single discover from `scripts/` doesn't recurse into the ops
|
|
||||||
# subdir. Two passes is simpler than retrofitting namespace packages.
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main, staging]
|
|
||||||
paths:
|
|
||||||
- 'scripts/**'
|
|
||||||
- '.gitea/workflows/test-ops-scripts.yml'
|
|
||||||
pull_request:
|
|
||||||
branches: [main, staging]
|
|
||||||
paths:
|
|
||||||
- 'scripts/**'
|
|
||||||
- '.gitea/workflows/test-ops-scripts.yml'
|
|
||||||
|
|
||||||
env:
|
|
||||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: ${{ github.workflow }}-${{ github.ref }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
test:
|
|
||||||
name: Ops scripts (unittest)
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
|
||||||
continue-on-error: true
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
|
||||||
with:
|
|
||||||
python-version: '3.11'
|
|
||||||
- name: Run scripts/ unittests (build_runtime_package, ...)
|
|
||||||
# Top-level scripts/ tests live alongside their target file
|
|
||||||
# (e.g. scripts/test_build_runtime_package.py exercises
|
|
||||||
# scripts/build_runtime_package.py). discover from scripts/
|
|
||||||
# picks up only top-level test_*.py because scripts/ops/ has
|
|
||||||
# no __init__.py — that's intentional, so we run two passes.
|
|
||||||
working-directory: scripts
|
|
||||||
run: python -m unittest discover -t . -p 'test_*.py' -v
|
|
||||||
- name: Run scripts/ops/ unittests (sweep_cf_decide, ...)
|
|
||||||
working-directory: scripts/ops
|
|
||||||
run: python -m unittest discover -p 'test_*.py' -v
|
|
||||||
@ -1,120 +0,0 @@
|
|||||||
name: Weekly Platform-Go Surface
|
|
||||||
|
|
||||||
# Surface latent vet/test errors on main by running the full Platform-Go
|
|
||||||
# suite on a weekly cron regardless of whether the last push touched
|
|
||||||
# workspace-server/.
|
|
||||||
#
|
|
||||||
# Background: ci.yml's `platform-build` job gates real work on
|
|
||||||
# `if: needs.changes.outputs.platform == 'true'`. When no push touches
|
|
||||||
# workspace-server/, the skip fires and the suite never executes on main.
|
|
||||||
# Latent vet errors and test flakes can sit for weeks undetected.
|
|
||||||
#
|
|
||||||
# This workflow runs the full suite (build, vet, golangci-lint, tests with
|
|
||||||
# coverage) every Monday at 04:17 UTC. Results are posted as commit statuses
|
|
||||||
# but continue-on-error: true means they never block anything — they're
|
|
||||||
# purely a noise-reduction signal for when the next workspace-server push
|
|
||||||
# lands and would otherwise trigger the first real suite run.
|
|
||||||
#
|
|
||||||
# Why 04:17 UTC on Monday: off-peak, before the weekly sprint cycle starts.
|
|
||||||
|
|
||||||
on:
|
|
||||||
schedule:
|
|
||||||
- cron: '17 4 * * 1' # Mondays at 04:17 UTC
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
statuses: write
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
weekly-platform-go:
|
|
||||||
name: Weekly Platform-Go Surface
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# continue-on-error: surface only, never block
|
|
||||||
continue-on-error: true
|
|
||||||
defaults:
|
|
||||||
run:
|
|
||||||
working-directory: workspace-server
|
|
||||||
steps:
|
|
||||||
- name: Checkout main
|
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
||||||
with:
|
|
||||||
ref: main
|
|
||||||
fetch-depth: 1
|
|
||||||
|
|
||||||
- name: Set up Go
|
|
||||||
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
|
||||||
with:
|
|
||||||
go-version: stable
|
|
||||||
|
|
||||||
- name: Go mod download
|
|
||||||
run: go mod download
|
|
||||||
|
|
||||||
- name: Build
|
|
||||||
run: go build ./cmd/server
|
|
||||||
|
|
||||||
# `go vet` is NOT `|| true`-guarded: surfacing latent vet errors on main is
|
|
||||||
# the whole point of this workflow (issue #567 — the motivating case was a
|
|
||||||
# `go vet` error in org_external.go that sat undetected on main for weeks).
|
|
||||||
# A vet error here fails the step → fails the job → shows red on the weekly
|
|
||||||
# commit. Per Gitea quirk #10 (job-level continue-on-error is ignored), that
|
|
||||||
# red surfaces on main — which is the intended signal, not a regression.
|
|
||||||
- name: go vet
|
|
||||||
run: go vet ./...
|
|
||||||
|
|
||||||
# golangci-lint stays `|| true`-guarded: lint is noisier (more false-
|
|
||||||
# positives than vet) and golangci-lint may not be pre-installed on every
|
|
||||||
# runner image — a `|| true` here keeps a missing-binary or lint-noise case
|
|
||||||
# from masking the vet/test signal above. Tighten to match ci.yml's lint
|
|
||||||
# gate if/when ci.yml's lint step becomes hard-failing.
|
|
||||||
- name: golangci-lint
|
|
||||||
run: golangci-lint run --timeout 3m ./... || true
|
|
||||||
|
|
||||||
- name: Tests with race detection + coverage
|
|
||||||
run: go test -race -coverprofile=coverage.out ./...
|
|
||||||
|
|
||||||
- name: Check coverage thresholds
|
|
||||||
run: |
|
|
||||||
set -e
|
|
||||||
TOTAL_FLOOR=25
|
|
||||||
CRITICAL_PATHS=(
|
|
||||||
"internal/handlers/tokens"
|
|
||||||
"internal/handlers/workspace_provision"
|
|
||||||
"internal/handlers/a2a_proxy"
|
|
||||||
"internal/handlers/registry"
|
|
||||||
"internal/handlers/secrets"
|
|
||||||
"internal/middleware/wsauth"
|
|
||||||
"internal/crypto"
|
|
||||||
)
|
|
||||||
|
|
||||||
TOTAL=$(go tool cover -func=coverage.out | grep '^total:' | awk '{print $3}' | sed 's/%//')
|
|
||||||
echo "Total coverage: ${TOTAL}%"
|
|
||||||
if awk "BEGIN{exit !(\$TOTAL < \$TOTAL_FLOOR)}"; then
|
|
||||||
echo "::error::Total coverage \${TOTAL}% is below the \${TOTAL_FLOOR}% floor."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
ALLOWLIST=""
|
|
||||||
if [ -f ../.coverage-allowlist.txt ]; then
|
|
||||||
ALLOWLIST=$(grep -vE '^(#|[[:space:]]*$)' ../.coverage-allowlist.txt || true)
|
|
||||||
fi
|
|
||||||
|
|
||||||
FAILED=0
|
|
||||||
for path in "\${CRITICAL_PATHS[@]}"; do
|
|
||||||
while read -r file pct; do
|
|
||||||
[[ "$file" == *_test.go ]] && continue
|
|
||||||
[[ "$file" == *"$path"* ]] || continue
|
|
||||||
awk "BEGIN{exit !(\$pct < 10)}" || continue
|
|
||||||
rel=$(echo "$file" | sed 's|^github.com/molecule-ai/molecule-monorepo/platform/workspace-server/||; s|^github.com/molecule-ai/molecule-monorepo/platform/||')
|
|
||||||
if echo "$ALLOWLIST" | grep -qxF "$rel"; then
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
echo "::error::Low coverage \${pct}% on \${rel} (below 10% in critical path \${path})"
|
|
||||||
FAILED=$((FAILED + 1))
|
|
||||||
done < <(go tool cover -func=coverage.out | grep -v '^total:' | awk '{file=$1; sub(/:[0-9][0-9.]*:.*/, "", file); pct=$NF; gsub(/%/,"",pct); s[file]+=pct; c[file]++} END {for (f in s) printf "%s %.1f\n", f, s[f]/c[f]}' | sort)
|
|
||||||
done
|
|
||||||
if [ "$FAILED" -gt 0 ]; then
|
|
||||||
echo "::error::\${FAILED} critical paths below 10% coverage — see above."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Coverage thresholds: OK"
|
|
||||||
4
.github/scripts/lint_secret_pattern_drift.py
vendored
4
.github/scripts/lint_secret_pattern_drift.py
vendored
@ -28,7 +28,7 @@ import sys
|
|||||||
import urllib.request
|
import urllib.request
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
CANONICAL_FILE = Path(".gitea/workflows/secret-scan.yml")
|
CANONICAL_FILE = Path(".github/workflows/secret-scan.yml")
|
||||||
|
|
||||||
# Public consumer mirrors. Each entry is (label, raw_url) — raw_url
|
# Public consumer mirrors. Each entry is (label, raw_url) — raw_url
|
||||||
# points at the file's RAW content on the consumer's default branch
|
# points at the file's RAW content on the consumer's default branch
|
||||||
@ -37,7 +37,7 @@ CANONICAL_FILE = Path(".gitea/workflows/secret-scan.yml")
|
|||||||
CONSUMERS: list[tuple[str, str]] = [
|
CONSUMERS: list[tuple[str, str]] = [
|
||||||
(
|
(
|
||||||
"molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh",
|
"molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh",
|
||||||
"https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-runtime/raw/branch/main/molecule_runtime/scripts/pre-commit-checks.sh",
|
"https://raw.githubusercontent.com/Molecule-AI/molecule-ai-workspace-runtime/main/molecule_runtime/scripts/pre-commit-checks.sh",
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
429
.github/workflows/auto-promote-on-e2e.yml
vendored
Normal file
429
.github/workflows/auto-promote-on-e2e.yml
vendored
Normal file
@ -0,0 +1,429 @@
|
|||||||
|
name: Auto-promote :latest after main image build
|
||||||
|
|
||||||
|
# Retags `ghcr.io/molecule-ai/{platform,platform-tenant}:staging-<sha>`
|
||||||
|
# → `:latest` after either the image build or E2E completes on a `main`
|
||||||
|
# push, gated on E2E Staging SaaS not being red for that SHA.
|
||||||
|
#
|
||||||
|
# Why two triggers:
|
||||||
|
#
|
||||||
|
# `publish-workspace-server-image` and `e2e-staging-saas` are both
|
||||||
|
# paths-filtered, but with DIFFERENT path sets:
|
||||||
|
#
|
||||||
|
# publish-workspace-server-image:
|
||||||
|
# workspace-server/**, canvas/**, manifest.json
|
||||||
|
#
|
||||||
|
# e2e-staging-saas (full lifecycle):
|
||||||
|
# workspace-server/internal/handlers/{registry,workspace_provision,
|
||||||
|
# a2a_proxy}.go, workspace-server/internal/middleware/**,
|
||||||
|
# workspace-server/internal/provisioner/**, tests/e2e/test_staging_full_saas.sh
|
||||||
|
#
|
||||||
|
# The E2E set is a strict SUBSET of the publish set. So:
|
||||||
|
# - canvas/** changes → publish fires, E2E does not
|
||||||
|
# - workspace-server/cmd/** changes → publish fires, E2E does not
|
||||||
|
# - workspace-server/internal/sweep/** → publish fires, E2E does not
|
||||||
|
#
|
||||||
|
# The previous version triggered ONLY on E2E completion, which meant
|
||||||
|
# non-E2E-path changes (canvas, cmd, sweep, etc.) rebuilt the image
|
||||||
|
# but never advanced `:latest`. Result: as of 2026-04-28 this workflow
|
||||||
|
# had run zero times since merge despite eight main pushes — `:latest`
|
||||||
|
# was ~7 hours / 9 PRs behind main with no human realising. See
|
||||||
|
# `molecule-core` Slack discussion 2026-04-28.
|
||||||
|
#
|
||||||
|
# Adding `publish-workspace-server-image` as a second trigger closes
|
||||||
|
# the gap: any image rebuild on main eligibly advances `:latest`.
|
||||||
|
#
|
||||||
|
# Why E2E remains a kill-switch (not the trigger):
|
||||||
|
#
|
||||||
|
# When E2E DID run for this SHA and ended red, we abort — `:latest`
|
||||||
|
# stays on the prior known-good digest. When E2E didn't run (paths
|
||||||
|
# filtered out), we proceed: pre-merge gates already validated this
|
||||||
|
# SHA on staging via auto-promote-staging requiring CI + E2E Canvas +
|
||||||
|
# E2E API + CodeQL all green. Image content for non-E2E-paths
|
||||||
|
# (canvas, cmd, sweep) is exercised by those staging gates.
|
||||||
|
#
|
||||||
|
# Why `main` only:
|
||||||
|
#
|
||||||
|
# `:latest` is what prod tenants pull. We only want SHAs that have
|
||||||
|
# reached main (via auto-promote-staging) to advance `:latest`.
|
||||||
|
# Triggering on staging would let a staging-only revert advance
|
||||||
|
# `:latest` to a SHA that never reaches main, breaking the "production
|
||||||
|
# runs what's on main" invariant.
|
||||||
|
#
|
||||||
|
# Idempotency:
|
||||||
|
#
|
||||||
|
# When a SHA touches paths that match BOTH publish and E2E, both
|
||||||
|
# workflows fire and complete. Both trigger this workflow on
|
||||||
|
# completion → two runs race. Both retag `:staging-<sha>` →
|
||||||
|
# `:latest`. crane tag is idempotent (re-tagging the same digest is a
|
||||||
|
# no-op), so the second run is harmless. concurrency group serializes
|
||||||
|
# them anyway.
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_run:
|
||||||
|
workflows:
|
||||||
|
- 'E2E Staging SaaS (full lifecycle)'
|
||||||
|
- 'publish-workspace-server-image'
|
||||||
|
types: [completed]
|
||||||
|
branches: [main]
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
sha:
|
||||||
|
description: 'Short sha to promote (override; defaults to upstream workflow_run head_sha)'
|
||||||
|
required: false
|
||||||
|
type: string
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
packages: write
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
# Serialize promotes per-SHA so the publish+E2E both-fired race lands
|
||||||
|
# cleanly. Different SHAs can promote in parallel.
|
||||||
|
group: auto-promote-latest-${{ github.event.workflow_run.head_sha || github.event.inputs.sha || github.sha }}
|
||||||
|
cancel-in-progress: false
|
||||||
|
|
||||||
|
env:
|
||||||
|
IMAGE_NAME: ghcr.io/molecule-ai/platform
|
||||||
|
TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
promote:
|
||||||
|
# Proceed if upstream succeeded OR manual dispatch. Upstream-failure
|
||||||
|
# paths are filtered here; the E2E-was-red kill-switch lives in the
|
||||||
|
# gate-check step below (covers the case where upstream is publish
|
||||||
|
# success but E2E for the same SHA failed).
|
||||||
|
if: |
|
||||||
|
github.event_name == 'workflow_dispatch' ||
|
||||||
|
(github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Compute short sha
|
||||||
|
id: sha
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
if [ -n "${{ github.event.inputs.sha }}" ]; then
|
||||||
|
FULL="${{ github.event.inputs.sha }}"
|
||||||
|
else
|
||||||
|
FULL="${{ github.event.workflow_run.head_sha }}"
|
||||||
|
fi
|
||||||
|
echo "short=${FULL:0:7}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "full=${FULL}" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- name: Gate — E2E Staging SaaS state for this SHA
|
||||||
|
# When upstream IS E2E success, we know it's green (filtered by
|
||||||
|
# the job-level `if` already). When upstream is publish, look up
|
||||||
|
# E2E state for the same SHA. Four buckets:
|
||||||
|
#
|
||||||
|
# - completed/success: E2E confirmed safe → proceed
|
||||||
|
# - completed/failure|cancelled|timed_out: E2E found a
|
||||||
|
# regression → ABORT (exit 1), `:latest` stays put
|
||||||
|
# - in_progress|queued|requested: E2E is RACING with publish
|
||||||
|
# for a runtime-touching SHA. publish typically completes
|
||||||
|
# ~5-10min before E2E (~10-15min). If we promote on the
|
||||||
|
# publish signal here, a later E2E failure can't roll back
|
||||||
|
# `:latest` — it'd already be wrongly advanced. So we DEFER:
|
||||||
|
# skip subsequent steps (proceed=false) and let E2E's own
|
||||||
|
# completion event re-fire this workflow, which then takes
|
||||||
|
# the upstream-is-E2E path. exit 0 so the run shows as
|
||||||
|
# success rather than a noisy fake-failure.
|
||||||
|
# - none/none: E2E was paths-filtered out for this SHA (the
|
||||||
|
# change touched canvas/cmd/sweep/etc. — paths covered by
|
||||||
|
# publish but not by E2E). pre-merge gates on staging
|
||||||
|
# already validated this SHA → proceed.
|
||||||
|
#
|
||||||
|
# Manual dispatch skips this check — operator override.
|
||||||
|
id: gate
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
REPO: ${{ github.repository }}
|
||||||
|
SHA: ${{ steps.sha.outputs.full }}
|
||||||
|
UPSTREAM_NAME: ${{ github.event.workflow_run.name }}
|
||||||
|
EVENT_NAME: ${{ github.event_name }}
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
|
||||||
|
echo "proceed=true" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "::notice::Manual dispatch — skipping E2E gate (operator override)"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$UPSTREAM_NAME" = "E2E Staging SaaS (full lifecycle)" ]; then
|
||||||
|
echo "proceed=true" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "::notice::Upstream is E2E itself (success per job-level if) — gate trivially satisfied"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Upstream is publish-workspace-server-image. Check E2E state.
|
||||||
|
# The jq filter must defend against TWO empty cases that gh
|
||||||
|
# CLI emits indistinguishably:
|
||||||
|
# 1. gh exits non-zero (network blip, auth issue) → handled
|
||||||
|
# by the `|| echo "none/none"` fallback below.
|
||||||
|
# 2. gh exits zero but returns `[]` (no E2E run on this
|
||||||
|
# main SHA — the common case for canvas-only / cmd-only
|
||||||
|
# / sweep-only changes whose paths don't trigger E2E).
|
||||||
|
# Without `(.[0] // {})`, jq sees `null` and emits
|
||||||
|
# "null/none" — which the case statement below has no
|
||||||
|
# branch for, so it falls into *) → exit 1.
|
||||||
|
# Surfaced 2026-04-30 the first time the App-token chain
|
||||||
|
# (#2389) actually fired auto-promote-on-e2e from a publish
|
||||||
|
# upstream — every prior run was E2E-upstream which
|
||||||
|
# short-circuits before this gate.
|
||||||
|
RESULT=$(gh run list \
|
||||||
|
--repo "$REPO" \
|
||||||
|
--workflow e2e-staging-saas.yml \
|
||||||
|
--branch main \
|
||||||
|
--commit "$SHA" \
|
||||||
|
--limit 1 \
|
||||||
|
--json status,conclusion \
|
||||||
|
--jq '(.[0] // {}) | "\(.status // "none")/\(.conclusion // "none")"' \
|
||||||
|
2>/dev/null || echo "none/none")
|
||||||
|
|
||||||
|
echo "E2E Staging SaaS for ${SHA:0:7}: $RESULT"
|
||||||
|
|
||||||
|
case "$RESULT" in
|
||||||
|
completed/success)
|
||||||
|
echo "proceed=true" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "::notice::E2E green for this SHA — proceeding with promote"
|
||||||
|
;;
|
||||||
|
completed/failure|completed/timed_out)
|
||||||
|
echo "proceed=false" >> "$GITHUB_OUTPUT"
|
||||||
|
{
|
||||||
|
echo "## ❌ Auto-promote aborted — E2E Staging SaaS failed"
|
||||||
|
echo
|
||||||
|
echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`"
|
||||||
|
echo "\`:latest\` stays on the prior known-good digest."
|
||||||
|
echo
|
||||||
|
echo "If the failure was a flake, manually dispatch this workflow with the same sha to override."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
completed/cancelled)
|
||||||
|
# cancelled ≠ failure. Per-SHA concurrency cancels older E2E
|
||||||
|
# runs when a newer push lands (memory:
|
||||||
|
# feedback_concurrency_group_per_sha) — the newer SHA will
|
||||||
|
# have its own E2E + promote chain. Treat the same as
|
||||||
|
# in_progress: defer without aborting, let the next E2E run
|
||||||
|
# promote when it lands.
|
||||||
|
#
|
||||||
|
# Caught 2026-05-05 02:03 on sha 31f9a5e — auto-promote
|
||||||
|
# blocked the whole chain because this case fell through to
|
||||||
|
# exit 1 instead of clean defer.
|
||||||
|
echo "proceed=false" >> "$GITHUB_OUTPUT"
|
||||||
|
{
|
||||||
|
echo "## ⏭ Auto-promote deferred — E2E Staging SaaS was cancelled"
|
||||||
|
echo
|
||||||
|
echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`"
|
||||||
|
echo "Likely per-SHA concurrency (newer push superseded this E2E run)."
|
||||||
|
echo "The newer SHA's E2E will fire its own promote when it lands."
|
||||||
|
echo "If you need this specific SHA promoted, manually dispatch."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
;;
|
||||||
|
in_progress/*|queued/*|requested/*|waiting/*|pending/*)
|
||||||
|
echo "proceed=false" >> "$GITHUB_OUTPUT"
|
||||||
|
{
|
||||||
|
echo "## ⏳ Auto-promote deferred — E2E Staging SaaS still running"
|
||||||
|
echo
|
||||||
|
echo "Publish completed before E2E for \`${SHA:0:7}\` (state: \`$RESULT\`)."
|
||||||
|
echo "Skipping retag here — E2E's own completion event will re-fire this workflow."
|
||||||
|
echo "If E2E ends green, that run promotes \`:latest\`. If red, it aborts."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
;;
|
||||||
|
none/none)
|
||||||
|
echo "proceed=true" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "::notice::E2E paths-filtered out for this SHA — pre-merge staging gates carry"
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "proceed=false" >> "$GITHUB_OUTPUT"
|
||||||
|
{
|
||||||
|
echo "## ❓ Auto-promote aborted — unexpected E2E state"
|
||||||
|
echo
|
||||||
|
echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\` (unhandled)"
|
||||||
|
echo "Manual investigation needed; re-dispatch with the same sha once resolved."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
- if: steps.gate.outputs.proceed == 'true'
|
||||||
|
uses: imjasonh/setup-crane@6da1ae018866400525525ce74ff892880c099987 # v0.5
|
||||||
|
|
||||||
|
- name: GHCR login
|
||||||
|
if: steps.gate.outputs.proceed == 'true'
|
||||||
|
run: |
|
||||||
|
echo "${{ secrets.GITHUB_TOKEN }}" | \
|
||||||
|
crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin
|
||||||
|
|
||||||
|
- name: Verify :staging-<sha> exists for both images
|
||||||
|
# Better to fail fast with a clear message than to half-tag
|
||||||
|
# (platform retagged but platform-tenant missing → tenants pull
|
||||||
|
# a stale image).
|
||||||
|
if: steps.gate.outputs.proceed == 'true'
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
for img in "${IMAGE_NAME}" "${TENANT_IMAGE_NAME}"; do
|
||||||
|
tag="${img}:staging-${{ steps.sha.outputs.short }}"
|
||||||
|
if ! crane manifest "$tag" >/dev/null 2>&1; then
|
||||||
|
echo "::error::Missing tag: $tag"
|
||||||
|
echo "::error::publish-workspace-server-image must complete on this SHA before auto-promote can retag :latest."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo " ok: $tag exists"
|
||||||
|
done
|
||||||
|
|
||||||
|
- name: Ancestry check — refuse to promote :latest backwards
|
||||||
|
# #2244: workflow_run completions arrive in arbitrary order. If
|
||||||
|
# SHA-A and SHA-B both reach main within ~10 min and SHA-B's E2E
|
||||||
|
# completes before SHA-A's, this workflow can fire for SHA-A
|
||||||
|
# AFTER it already promoted SHA-B → :latest goes backwards. The
|
||||||
|
# orphan-reconciler "next run corrects it" doesn't apply: there's
|
||||||
|
# no auto-corrective re-promote, :latest stays wrong until the
|
||||||
|
# next main push lands.
|
||||||
|
#
|
||||||
|
# Detection: read current :latest's `org.opencontainers.image.revision`
|
||||||
|
# label (set by publish-workspace-server-image.yml at build time)
|
||||||
|
# and ask the GitHub compare API whether the candidate SHA is
|
||||||
|
# ahead-of / identical-to / behind / diverged-from current.
|
||||||
|
# Hard-fail on `behind` and `diverged` per the approved design —
|
||||||
|
# silent-bypass is the class we're moving away from. Workflow
|
||||||
|
# goes red, oncall sees it, operator decides how to recover
|
||||||
|
# (manual dispatch with the right SHA, force-promote, etc.).
|
||||||
|
#
|
||||||
|
# Manual dispatch skips this check — operator override semantics
|
||||||
|
# match the gate-check step above.
|
||||||
|
#
|
||||||
|
# Backward-compat: when current :latest carries no revision
|
||||||
|
# label (legacy image pre-publish-with-label), skip-with-warning.
|
||||||
|
# All :latest images on main are post-label as of 2026-04-29, so
|
||||||
|
# this branch will be dead within 90 days; remove then.
|
||||||
|
if: steps.gate.outputs.proceed == 'true' && github.event_name != 'workflow_dispatch'
|
||||||
|
id: ancestry
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
REPO: ${{ github.repository }}
|
||||||
|
TARGET_SHA: ${{ steps.sha.outputs.full }}
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Read the current :latest config and pull the revision label.
|
||||||
|
# `crane config` returns the OCI image config blob (not the manifest);
|
||||||
|
# labels live under `.config.Labels`. `// empty` makes jq return ""
|
||||||
|
# rather than the literal "null" so the test below works.
|
||||||
|
CURRENT_REVISION=$(crane config "${IMAGE_NAME}:latest" 2>/dev/null \
|
||||||
|
| jq -r '.config.Labels["org.opencontainers.image.revision"] // empty' \
|
||||||
|
|| true)
|
||||||
|
|
||||||
|
if [ -z "$CURRENT_REVISION" ]; then
|
||||||
|
echo "decision=skip-no-label" >> "$GITHUB_OUTPUT"
|
||||||
|
{
|
||||||
|
echo "## ⚠ Ancestry check skipped — current :latest has no revision label"
|
||||||
|
echo
|
||||||
|
echo "Likely a legacy image built before \`org.opencontainers.image.revision\` was set."
|
||||||
|
echo "Falling through to retag. After all \`:latest\` images are post-label (TODO 90 days), this branch is dead and should be removed."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
echo "::warning::Current :latest carries no revision label — skipping ancestry check (legacy image)"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$CURRENT_REVISION" = "$TARGET_SHA" ]; then
|
||||||
|
echo "decision=identical" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "::notice:::latest already at ${TARGET_SHA:0:7} — retag will be a no-op"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Ask GitHub which side of the merge graph TARGET_SHA sits on
|
||||||
|
# relative to CURRENT_REVISION. Returns one of: ahead | identical
|
||||||
|
# | behind | diverged. Network or auth errors collapse to "error"
|
||||||
|
# via the explicit fallback so the case below always matches.
|
||||||
|
STATUS=$(gh api \
|
||||||
|
"repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}" \
|
||||||
|
--jq '.status' 2>/dev/null || echo "error")
|
||||||
|
|
||||||
|
echo "ancestry compare ${CURRENT_REVISION:0:7} → ${TARGET_SHA:0:7}: $STATUS"
|
||||||
|
|
||||||
|
case "$STATUS" in
|
||||||
|
ahead)
|
||||||
|
echo "decision=ahead" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "::notice::Target ${TARGET_SHA:0:7} is ahead of current :latest (${CURRENT_REVISION:0:7}) — proceeding with retag"
|
||||||
|
;;
|
||||||
|
identical)
|
||||||
|
echo "decision=identical" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "::notice::Target identical to :latest — retag will be a no-op"
|
||||||
|
;;
|
||||||
|
behind)
|
||||||
|
echo "decision=behind" >> "$GITHUB_OUTPUT"
|
||||||
|
{
|
||||||
|
echo "## ❌ Auto-promote refused — target is BEHIND current :latest"
|
||||||
|
echo
|
||||||
|
echo "| Field | Value |"
|
||||||
|
echo "|---|---|"
|
||||||
|
echo "| Target SHA | \`$TARGET_SHA\` |"
|
||||||
|
echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
|
||||||
|
echo "| GitHub compare status | \`behind\` |"
|
||||||
|
echo
|
||||||
|
echo "This guard catches the workflow_run-completion-order race (#2244):"
|
||||||
|
echo "two rapid main pushes whose E2Es complete out-of-order can otherwise"
|
||||||
|
echo "promote \`:latest\` backwards. \`:latest\` stays on \`${CURRENT_REVISION:0:7}\`."
|
||||||
|
echo
|
||||||
|
echo "**Recovery:** if this is a legitimate revert that should land on \`:latest\`,"
|
||||||
|
echo "manually dispatch this workflow with the target sha as input — the manual-dispatch"
|
||||||
|
echo "path skips the ancestry check (operator override)."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
diverged)
|
||||||
|
echo "decision=diverged" >> "$GITHUB_OUTPUT"
|
||||||
|
{
|
||||||
|
echo "## ❓ Auto-promote refused — history diverged"
|
||||||
|
echo
|
||||||
|
echo "| Field | Value |"
|
||||||
|
echo "|---|---|"
|
||||||
|
echo "| Target SHA | \`$TARGET_SHA\` |"
|
||||||
|
echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
|
||||||
|
echo "| GitHub compare status | \`diverged\` |"
|
||||||
|
echo
|
||||||
|
echo "Likely cause: force-push rewrote main's history, leaving the previous"
|
||||||
|
echo "\`:latest\` revision orphaned. Needs human review before \`:latest\` advances."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
error|*)
|
||||||
|
echo "decision=error" >> "$GITHUB_OUTPUT"
|
||||||
|
{
|
||||||
|
echo "## ❌ Auto-promote aborted — ancestry-check API error"
|
||||||
|
echo
|
||||||
|
echo "\`gh api repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}\` returned unexpected status: \`$STATUS\`"
|
||||||
|
echo
|
||||||
|
echo "Manual dispatch with the target sha bypasses this check."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
- name: Retag platform :staging-<sha> → :latest
|
||||||
|
if: steps.gate.outputs.proceed == 'true'
|
||||||
|
run: |
|
||||||
|
crane tag "${IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
|
||||||
|
|
||||||
|
- name: Retag tenant :staging-<sha> → :latest
|
||||||
|
if: steps.gate.outputs.proceed == 'true'
|
||||||
|
run: |
|
||||||
|
crane tag "${TENANT_IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
|
||||||
|
|
||||||
|
- name: Summary
|
||||||
|
if: steps.gate.outputs.proceed == 'true'
|
||||||
|
run: |
|
||||||
|
{
|
||||||
|
echo "## :latest promoted to ${{ steps.sha.outputs.short }}"
|
||||||
|
echo
|
||||||
|
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||||
|
echo "- Trigger: manual dispatch"
|
||||||
|
else
|
||||||
|
echo "- Upstream: \`${{ github.event.workflow_run.name }}\` ([run](${{ github.event.workflow_run.html_url }}))"
|
||||||
|
fi
|
||||||
|
echo "- platform:staging-${{ steps.sha.outputs.short }} → :latest"
|
||||||
|
echo "- platform-tenant:staging-${{ steps.sha.outputs.short }} → :latest"
|
||||||
|
echo
|
||||||
|
echo "Tenant fleet auto-pulls within 5 min via IMAGE_AUTO_REFRESH=true."
|
||||||
|
echo "Force immediate fanout: dispatch redeploy-tenants-on-main.yml."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
434
.github/workflows/auto-promote-staging.yml
vendored
Normal file
434
.github/workflows/auto-promote-staging.yml
vendored
Normal file
@ -0,0 +1,434 @@
|
|||||||
|
name: Auto-promote staging → main
|
||||||
|
|
||||||
|
# Fires after any of the staging-branch quality gates complete. When ALL
|
||||||
|
# required gates are green on the same staging SHA, opens (or re-uses)
|
||||||
|
# a PR `staging → main` and enables auto-merge so the merge queue lands
|
||||||
|
# it. Closes the gap that historically let features sit on staging for
|
||||||
|
# weeks waiting for a bulk promotion PR (see molecule-core#1496 for the
|
||||||
|
# 1172-commit example).
|
||||||
|
#
|
||||||
|
# 2026-04-28 rewrite (PR #142): the previous version did a direct
|
||||||
|
# `git merge --ff-only origin staging && git push origin main`. That
|
||||||
|
# breaks against main's branch-protection ruleset, which requires
|
||||||
|
# status checks "set by the expected GitHub apps" — direct pushes
|
||||||
|
# can't satisfy that condition (only PR merges through the queue can).
|
||||||
|
# The workflow was failing every tick with:
|
||||||
|
# remote: error: GH006: Protected branch update failed for refs/heads/main.
|
||||||
|
# remote: - Required status checks ... were not set by the expected GitHub apps.
|
||||||
|
# Fix: mirror the PR-based pattern from auto-sync-main-to-staging.yml
|
||||||
|
# (the reverse-direction sync, fixed in #2234 for the same reason).
|
||||||
|
# Both directions now use the same merge-queue path that humans use,
|
||||||
|
# no special-case bypass.
|
||||||
|
#
|
||||||
|
# Safety model:
|
||||||
|
# - Runs ONLY on workflow_run events for the staging branch.
|
||||||
|
# - Requires EVERY named gate workflow to have the same head_sha and
|
||||||
|
# all be `conclusion == success`. If any of them is red, skipped,
|
||||||
|
# cancelled, or pending, we abort (stay on the current main).
|
||||||
|
# - The PR base=main head=staging path lets GitHub itself enforce
|
||||||
|
# branch protection. If main has diverged from staging or required
|
||||||
|
# checks aren't satisfied, the merge queue declines the PR — no
|
||||||
|
# need for a manual ff-only ancestry check here.
|
||||||
|
# - Loop safety: the auto-sync-main-to-staging workflow fires when
|
||||||
|
# main lands the auto-promote PR, but its merge into staging is by
|
||||||
|
# GITHUB_TOKEN which doesn't trigger downstream workflow_run events
|
||||||
|
# (GitHub Actions safety). So this workflow doesn't re-fire from
|
||||||
|
# its own promote landing.
|
||||||
|
#
|
||||||
|
# Toggle via repo variable AUTO_PROMOTE_ENABLED (true/unset). When
|
||||||
|
# unset, the workflow logs what it would have done but doesn't open
|
||||||
|
# the PR — useful for dry-running the gate logic without surfacing
|
||||||
|
# a noisy PR while staging CI is still flaky.
|
||||||
|
#
|
||||||
|
# **One-time repo setting (load-bearing):** this workflow opens the
|
||||||
|
# staging→main PR via `gh pr create` using the default GITHUB_TOKEN.
|
||||||
|
# Since GitHub's 2022 default change, that token cannot create or
|
||||||
|
# approve PRs unless the repo opts in. The toggle is at:
|
||||||
|
#
|
||||||
|
# Settings → Actions → General → Workflow permissions
|
||||||
|
# → ✅ Allow GitHub Actions to create and approve pull requests
|
||||||
|
#
|
||||||
|
# Without it, every workflow_run fails with:
|
||||||
|
#
|
||||||
|
# pull request create failed: GraphQL: GitHub Actions is not
|
||||||
|
# permitted to create or approve pull requests (createPullRequest)
|
||||||
|
#
|
||||||
|
# Observed 2026-04-29 01:43 UTC blocking promotion of fcd87b9 (PRs
|
||||||
|
# #2248 + #2249); manually bridged via PR #2252. Re-check this
|
||||||
|
# setting if auto-promote starts failing with createPullRequest
|
||||||
|
# errors after a repo or org admin change.
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_run:
|
||||||
|
workflows:
|
||||||
|
- CI
|
||||||
|
- E2E Staging Canvas (Playwright)
|
||||||
|
- E2E API Smoke Test
|
||||||
|
- CodeQL
|
||||||
|
types: [completed]
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
force:
|
||||||
|
description: "Force promote even when AUTO_PROMOTE_ENABLED is unset (manual override)"
|
||||||
|
required: false
|
||||||
|
default: "false"
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: write
|
||||||
|
pull-requests: write
|
||||||
|
# actions: write is needed by the post-merge dispatch tail step
|
||||||
|
# (#2358 / #2357) — `gh workflow run publish-workspace-server-image.yml`
|
||||||
|
# POSTs to /actions/workflows/.../dispatches which requires this scope.
|
||||||
|
# Without it the call 403s and the publish/canary/redeploy chain still
|
||||||
|
# doesn't run on staging→main promotions, undoing #2358.
|
||||||
|
actions: write
|
||||||
|
|
||||||
|
# Serialize auto-promote runs. Multiple staging gate completions can land
|
||||||
|
# in quick succession (CI + E2E + CodeQL all finish within seconds of
|
||||||
|
# each other on a green PR) — without this, two parallel runs both:
|
||||||
|
# 1. Open / re-use the same promote PR.
|
||||||
|
# 2. Both call `gh pr merge --auto` (idempotent — fine).
|
||||||
|
# 3. Both poll for the same mergedAt and both `gh workflow run` publish
|
||||||
|
# → 2× redundant publish builds racing for the same `:staging-latest`
|
||||||
|
# retag, and 2× canary-verify chains.
|
||||||
|
# cancel-in-progress: false because we don't want a brand-new run to kill
|
||||||
|
# a polling-tail that's about to dispatch — the polling tail's 30 min cap
|
||||||
|
# is the right backstop, not workflow-level cancel.
|
||||||
|
concurrency:
|
||||||
|
group: auto-promote-staging
|
||||||
|
cancel-in-progress: false
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
check-all-gates-green:
|
||||||
|
# Only consider staging pushes. PRs into staging don't promote.
|
||||||
|
if: >
|
||||||
|
(github.event_name == 'workflow_run' &&
|
||||||
|
github.event.workflow_run.head_branch == 'staging' &&
|
||||||
|
github.event.workflow_run.event == 'push')
|
||||||
|
|| github.event_name == 'workflow_dispatch'
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
outputs:
|
||||||
|
all_green: ${{ steps.gates.outputs.all_green }}
|
||||||
|
head_sha: ${{ steps.gates.outputs.head_sha }}
|
||||||
|
steps:
|
||||||
|
# Skip empty-tree promotes (the perpetual auto-promote↔auto-sync cycle
|
||||||
|
# observed 2026-05-03). Sequence: auto-promote merges via the staging
|
||||||
|
# merge-queue's MERGE strategy, creating a merge commit on main that
|
||||||
|
# staging doesn't have. auto-sync then merges main back into staging
|
||||||
|
# via another merge commit (the queue's MERGE strategy applies on
|
||||||
|
# the staging side too, even when the workflow's local FF would
|
||||||
|
# have sufficed). Now staging has a new merge-commit SHA whose
|
||||||
|
# tree == main's tree — but auto-promote sees "staging ahead of
|
||||||
|
# main by 1" and opens YET another empty promote PR. Each round
|
||||||
|
# costs ~30-40 min wallclock, ~2 manual approvals, and burns a
|
||||||
|
# full CodeQL Go run (~15 min). Without this guard the cycle
|
||||||
|
# repeats indefinitely.
|
||||||
|
#
|
||||||
|
# Long-term fix is to switch the merge_queue ruleset's
|
||||||
|
# `merge_method` away from MERGE so FF-able PRs land cleanly,
|
||||||
|
# but that's a broader change affecting every staging PR's
|
||||||
|
# commit shape. This guard is the one-line surgical fix that
|
||||||
|
# breaks the cycle without touching merge-queue config.
|
||||||
|
#
|
||||||
|
# Fail-open: if `git diff` errors for any reason, fall through
|
||||||
|
# to the gate check (preserve existing behavior). Only skip
|
||||||
|
# when the diff is DEFINITIVELY empty.
|
||||||
|
- name: Checkout for tree-diff check
|
||||||
|
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
ref: staging
|
||||||
|
- name: Skip if staging tree == main tree (perpetual-cycle break)
|
||||||
|
id: tree-diff
|
||||||
|
env:
|
||||||
|
HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
|
||||||
|
run: |
|
||||||
|
set -eu
|
||||||
|
git fetch origin main --depth=50 || { echo "::warning::git fetch main failed — proceeding (fail-open)"; exit 0; }
|
||||||
|
# Compare staging tip's tree against main's tree. `git diff
|
||||||
|
# --quiet` exits 0 if no differences, 1 if there are.
|
||||||
|
if git diff --quiet origin/main "$HEAD_SHA" -- 2>/dev/null; then
|
||||||
|
{
|
||||||
|
echo "## ⏭ Skipped — no code to promote"
|
||||||
|
echo
|
||||||
|
echo "staging tip (\`${HEAD_SHA:0:8}\`) and \`main\` have identical trees."
|
||||||
|
echo "This is the auto-promote↔auto-sync merge-commit cycle: staging has a"
|
||||||
|
echo "new SHA (a sync-back merge commit) but the underlying file tree is"
|
||||||
|
echo "already on main, so there's no real code to ship."
|
||||||
|
echo
|
||||||
|
echo "Skipping to avoid opening an empty promote PR. Cycle terminates here."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
echo "::notice::auto-promote: staging tree == main tree — no code to promote, skipping"
|
||||||
|
echo "skip=true" >> "$GITHUB_OUTPUT"
|
||||||
|
else
|
||||||
|
echo "skip=false" >> "$GITHUB_OUTPUT"
|
||||||
|
fi
|
||||||
|
- name: Check all required gates on this SHA
|
||||||
|
if: steps.tree-diff.outputs.skip != 'true'
|
||||||
|
id: gates
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
|
||||||
|
REPO: ${{ github.repository }}
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Required gate workflow files. Use file paths (relative to
|
||||||
|
# .github/workflows/) rather than display names because:
|
||||||
|
#
|
||||||
|
# 1. `gh run list --workflow=<name>` is ambiguous when two
|
||||||
|
# workflows have the same `name:` — observed 2026-04-28
|
||||||
|
# with "CodeQL" matching both `codeql.yml` (explicit) and
|
||||||
|
# GitHub's UI-configured Code-quality default setup
|
||||||
|
# (internal "codeql"). gh CLI returns "could not resolve
|
||||||
|
# to a unique workflow" → empty result → gate evaluated
|
||||||
|
# as missing/none → auto-promote dead-locked despite all
|
||||||
|
# checks actually passing.
|
||||||
|
#
|
||||||
|
# 2. File paths are the unique identifier for workflows;
|
||||||
|
# `name:` is just a display string and can collide.
|
||||||
|
#
|
||||||
|
# When adding/removing a gate, update this list AND the
|
||||||
|
# branch-protection required-checks list (which uses check-run
|
||||||
|
# display names, not workflow names; the two are decoupled and
|
||||||
|
# should be kept in sync manually).
|
||||||
|
GATES=(
|
||||||
|
"ci.yml"
|
||||||
|
"e2e-staging-canvas.yml"
|
||||||
|
"e2e-api.yml"
|
||||||
|
"codeql.yml"
|
||||||
|
)
|
||||||
|
|
||||||
|
echo "head_sha=${HEAD_SHA}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "Checking gates on SHA ${HEAD_SHA}"
|
||||||
|
|
||||||
|
ALL_GREEN=true
|
||||||
|
for gate in "${GATES[@]}"; do
|
||||||
|
# Query the most recent run of this workflow on this SHA.
|
||||||
|
# event=push to avoid picking up PR runs. branch=staging to
|
||||||
|
# guard against someone dispatching the gate on a non-staging
|
||||||
|
# branch at the same SHA.
|
||||||
|
RESULT=$(gh run list \
|
||||||
|
--repo "$REPO" \
|
||||||
|
--workflow "$gate" \
|
||||||
|
--branch staging \
|
||||||
|
--event push \
|
||||||
|
--commit "$HEAD_SHA" \
|
||||||
|
--limit 1 \
|
||||||
|
--json status,conclusion \
|
||||||
|
--jq '.[0] | "\(.status)/\(.conclusion // "none")"' \
|
||||||
|
2>/dev/null || echo "missing/none")
|
||||||
|
|
||||||
|
echo " $gate → $RESULT"
|
||||||
|
|
||||||
|
# Only completed/success counts. completed/failure or
|
||||||
|
# in_progress/anything or no record at all = abort.
|
||||||
|
if [ "$RESULT" != "completed/success" ]; then
|
||||||
|
ALL_GREEN=false
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "all_green=${ALL_GREEN}" >> "$GITHUB_OUTPUT"
|
||||||
|
if [ "$ALL_GREEN" != "true" ]; then
|
||||||
|
echo "::notice::auto-promote: not all gates are green on ${HEAD_SHA} — staying on current main"
|
||||||
|
fi
|
||||||
|
|
||||||
|
promote:
|
||||||
|
needs: check-all-gates-green
|
||||||
|
if: needs.check-all-gates-green.outputs.all_green == 'true'
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Check rollout gate
|
||||||
|
env:
|
||||||
|
AUTO_PROMOTE_ENABLED: ${{ vars.AUTO_PROMOTE_ENABLED }}
|
||||||
|
FORCE_INPUT: ${{ github.event.inputs.force }}
|
||||||
|
run: |
|
||||||
|
set -eu
|
||||||
|
# Repo variable AUTO_PROMOTE_ENABLED=true flips this on. While
|
||||||
|
# it's unset, the workflow dry-runs (logs what it would have
|
||||||
|
# done) but doesn't open the promote PR. Set the variable in
|
||||||
|
# Settings → Secrets and variables → Actions → Variables.
|
||||||
|
if [ "${AUTO_PROMOTE_ENABLED:-}" != "true" ] && [ "${FORCE_INPUT:-false}" != "true" ]; then
|
||||||
|
{
|
||||||
|
echo "## ⏸ Auto-promote disabled"
|
||||||
|
echo
|
||||||
|
echo "Repo variable \`AUTO_PROMOTE_ENABLED\` is not set to \`true\`."
|
||||||
|
echo "All gates are green on staging; would have opened a promote PR to \`main\`."
|
||||||
|
echo
|
||||||
|
echo "To enable: Settings → Secrets and variables → Actions → Variables → \`AUTO_PROMOTE_ENABLED=true\`."
|
||||||
|
echo "To test once manually: workflow_dispatch with \`force=true\`."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
echo "::notice::auto-promote disabled — dry run only"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Mint the App token BEFORE the promote-PR step so the auto-merge
|
||||||
|
# call can use it. GITHUB_TOKEN-initiated merges suppress the
|
||||||
|
# downstream `push` event on main, breaking the
|
||||||
|
# publish-workspace-server-image → canary-verify → redeploy-tenants
|
||||||
|
# chain (issue #2357). Using the App token here means the
|
||||||
|
# merge-queue-landed merge IS able to fire the cascade naturally;
|
||||||
|
# the polling tail below stays as defense-in-depth.
|
||||||
|
- name: Mint App token for promote-PR + downstream dispatch
|
||||||
|
if: ${{ vars.AUTO_PROMOTE_ENABLED == 'true' || github.event.inputs.force == 'true' }}
|
||||||
|
id: app-token
|
||||||
|
uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v3.1.1
|
||||||
|
with:
|
||||||
|
app-id: ${{ secrets.MOLECULE_AI_APP_ID }}
|
||||||
|
private-key: ${{ secrets.MOLECULE_AI_APP_PRIVATE_KEY }}
|
||||||
|
|
||||||
|
- name: Open (or reuse) staging → main promote PR + enable auto-merge
|
||||||
|
if: ${{ vars.AUTO_PROMOTE_ENABLED == 'true' || github.event.inputs.force == 'true' }}
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ steps.app-token.outputs.token }}
|
||||||
|
REPO: ${{ github.repository }}
|
||||||
|
TARGET_SHA: ${{ needs.check-all-gates-green.outputs.head_sha }}
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Look for an existing open promote PR (idempotent on re-run
|
||||||
|
# of the workflow). The PR's head IS the staging branch — the
|
||||||
|
# whole point is "advance main to staging's tip", so we don't
|
||||||
|
# need a per-SHA branch like auto-sync-main-to-staging uses.
|
||||||
|
PR_NUM=$(gh pr list --repo "$REPO" \
|
||||||
|
--base main --head staging --state open \
|
||||||
|
--json number --jq '.[0].number // ""')
|
||||||
|
|
||||||
|
if [ -z "$PR_NUM" ]; then
|
||||||
|
TITLE="staging → main: auto-promote ${TARGET_SHA:0:7}"
|
||||||
|
BODY_FILE=$(mktemp)
|
||||||
|
cat > "$BODY_FILE" <<EOFBODY
|
||||||
|
Automated promotion of \`staging\` (\`${TARGET_SHA:0:8}\`) to \`main\`. All required staging gates green at this SHA: CI, E2E Staging Canvas, E2E API Smoke, CodeQL.
|
||||||
|
|
||||||
|
This PR is auto-generated by \`.github/workflows/auto-promote-staging.yml\` whenever every required gate completes green on the same staging SHA. It exists because main's branch protection requires status checks "set by the expected GitHub apps" — direct \`git push\` from a workflow can't satisfy that, only PR merges through the queue can.
|
||||||
|
|
||||||
|
Merge queue lands this; no human action needed unless gates fail. Reverse-direction sync (the merge commit on main → staging) is handled by \`auto-sync-main-to-staging.yml\`.
|
||||||
|
EOFBODY
|
||||||
|
PR_URL=$(gh pr create --repo "$REPO" \
|
||||||
|
--base main --head staging \
|
||||||
|
--title "$TITLE" \
|
||||||
|
--body-file "$BODY_FILE")
|
||||||
|
PR_NUM=$(echo "$PR_URL" | grep -oE '[0-9]+$' | tail -1)
|
||||||
|
rm -f "$BODY_FILE"
|
||||||
|
echo "::notice::Opened PR #${PR_NUM}"
|
||||||
|
else
|
||||||
|
echo "::notice::Re-using existing promote PR #${PR_NUM}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Enable auto-merge — the merge queue picks it up once
|
||||||
|
# required gates are green on the merge_group ref.
|
||||||
|
if ! gh pr merge "$PR_NUM" --repo "$REPO" --auto --merge 2>&1; then
|
||||||
|
echo "::warning::Failed to enable auto-merge on PR #${PR_NUM} — operator may need to merge manually."
|
||||||
|
fi
|
||||||
|
|
||||||
|
{
|
||||||
|
echo "## ✅ Auto-promote PR opened"
|
||||||
|
echo
|
||||||
|
echo "- Source: staging at \`${TARGET_SHA:0:8}\`"
|
||||||
|
echo "- PR: #${PR_NUM}"
|
||||||
|
echo
|
||||||
|
echo "Merge queue lands the PR once required gates are green; no human action needed unless gates fail."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
|
||||||
|
# Hand the PR number to the next step so we can dispatch the
|
||||||
|
# tenant-redeploy chain after the merge queue lands the merge.
|
||||||
|
echo "promote_pr_num=${PR_NUM}" >> "$GITHUB_OUTPUT"
|
||||||
|
id: promote_pr
|
||||||
|
|
||||||
|
# The App token minted above (before the promote-PR step) is
|
||||||
|
# also used by the polling tail below. Defense-in-depth: with
|
||||||
|
# the merge-queue-landed merge now using the App token, the
|
||||||
|
# main-branch push event SHOULD fire the publish/canary/redeploy
|
||||||
|
# cascade naturally — but if for any reason it doesn't (e.g. an
|
||||||
|
# unrelated event-suppression edge case), the explicit dispatches
|
||||||
|
# below still wake the chain.
|
||||||
|
- name: Wait for promote merge, then dispatch publish + redeploy (#2357)
|
||||||
|
# Defense-in-depth dispatch. With the auto-merge call above
|
||||||
|
# now using the App token (this commit), the merge-queue-landed
|
||||||
|
# merge SHOULD fire publish-workspace-server-image naturally
|
||||||
|
# via on:push:[main] — App-token-initiated pushes DO trigger
|
||||||
|
# workflow_run cascades, unlike GITHUB_TOKEN-initiated ones
|
||||||
|
# (the documented "no recursion" rule —
|
||||||
|
# https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow).
|
||||||
|
#
|
||||||
|
# This explicit dispatch stays as belt-and-suspenders for any
|
||||||
|
# edge case where the natural cascade misfires. If it never
|
||||||
|
# observably fires after this token swap (i.e. the publish
|
||||||
|
# workflow has already started by the time we get here), the
|
||||||
|
# second dispatch is a harmless no-op (publish-workspace-server-image
|
||||||
|
# has its own concurrency group that dedupes).
|
||||||
|
#
|
||||||
|
# See PR for #2357: pre-fix the merge action was via
|
||||||
|
# GITHUB_TOKEN, suppressing the cascade and forcing this tail
|
||||||
|
# to be the SOLE chain trigger. With the auto-merge token swap
|
||||||
|
# the tail becomes redundant in the happy path; keep until
|
||||||
|
# we've observed >=10 successful natural cascades, then drop.
|
||||||
|
if: steps.promote_pr.outputs.promote_pr_num != ''
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ steps.app-token.outputs.token }}
|
||||||
|
REPO: ${{ github.repository }}
|
||||||
|
PR_NUM: ${{ steps.promote_pr.outputs.promote_pr_num }}
|
||||||
|
run: |
|
||||||
|
# Poll for merge — max 30 min (60 × 30s). The merge queue
|
||||||
|
# typically lands within 5-10 min when gates are green. Break
|
||||||
|
# early if the PR is closed without merging (operator action,
|
||||||
|
# gates flipped red post-approval, branch-protection rejection)
|
||||||
|
# so we don't tie up a runner for the full 30 min on a dead PR.
|
||||||
|
MERGED=""
|
||||||
|
STATE=""
|
||||||
|
for _ in $(seq 1 60); do
|
||||||
|
VIEW=$(gh pr view "$PR_NUM" --repo "$REPO" --json mergedAt,state)
|
||||||
|
MERGED=$(echo "$VIEW" | jq -r '.mergedAt // ""')
|
||||||
|
STATE=$(echo "$VIEW" | jq -r '.state // ""')
|
||||||
|
if [ -n "$MERGED" ] && [ "$MERGED" != "null" ]; then
|
||||||
|
echo "::notice::Promote PR #${PR_NUM} merged at ${MERGED}"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
if [ "$STATE" = "CLOSED" ]; then
|
||||||
|
echo "::warning::Promote PR #${PR_NUM} was closed without merging — skipping deploy dispatch."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
sleep 30
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ -z "$MERGED" ] || [ "$MERGED" = "null" ]; then
|
||||||
|
echo "::warning::Promote PR #${PR_NUM} didn't merge within 30min — skipping deploy dispatch (manually run \`gh workflow run publish-workspace-server-image.yml --ref main\` once it lands)."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Dispatch publish on main using the App token. App-initiated
|
||||||
|
# workflow_dispatch DOES propagate the workflow_run cascade,
|
||||||
|
# unlike GITHUB_TOKEN-initiated dispatch.
|
||||||
|
# publish completes → canary-verify chains via workflow_run →
|
||||||
|
# redeploy-tenants-on-main chains via workflow_run + branches:[main].
|
||||||
|
if gh workflow run publish-workspace-server-image.yml \
|
||||||
|
--repo "$REPO" --ref main 2>&1; then
|
||||||
|
echo "::notice::Dispatched publish-workspace-server-image on ref=main as molecule-ai App — canary-verify and redeploy-tenants-on-main will chain via workflow_run."
|
||||||
|
{
|
||||||
|
echo "## 🚀 Tenant redeploy chain dispatched"
|
||||||
|
echo
|
||||||
|
echo "- publish-workspace-server-image (workflow_dispatch on \`main\`, actor: \`molecule-ai[bot]\`)"
|
||||||
|
echo "- canary-verify will chain on completion"
|
||||||
|
echo "- redeploy-tenants-on-main will chain on canary green"
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
else
|
||||||
|
echo "::error::Failed to dispatch publish-workspace-server-image. Run manually: gh workflow run publish-workspace-server-image.yml --ref main"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ALSO dispatch auto-sync-main-to-staging.yml. Same root cause as
|
||||||
|
# publish above (issue #2357): the merge-queue-initiated push to
|
||||||
|
# main is by GITHUB_TOKEN → no `on: push` triggers fire downstream.
|
||||||
|
# Without this dispatch, every staging→main promote leaves staging
|
||||||
|
# one merge commit BEHIND main, which silently dead-locks the NEXT
|
||||||
|
# promote PR as `mergeStateStatus: BEHIND` because main's
|
||||||
|
# branch-protection has `strict: true`. Verified empirically on
|
||||||
|
# 2026-05-02 against PR #2442 (Phase 2 promote): only the explicit
|
||||||
|
# publish-workspace-server-image dispatch fired on the previous
|
||||||
|
# promote SHA 76c604fb, while auto-sync silently no-op'd, leaving
|
||||||
|
# staging behind for ~24h until manually bridged.
|
||||||
|
if gh workflow run auto-sync-main-to-staging.yml \
|
||||||
|
--repo "$REPO" --ref main 2>&1; then
|
||||||
|
echo "::notice::Dispatched auto-sync-main-to-staging on ref=main as molecule-ai App — staging will absorb the new main merge commit via PR + merge queue."
|
||||||
|
else
|
||||||
|
echo "::error::Failed to dispatch auto-sync-main-to-staging. Run manually: gh workflow run auto-sync-main-to-staging.yml --ref main"
|
||||||
|
fi
|
||||||
83
.github/workflows/auto-promote-stale-alarm.yml
vendored
Normal file
83
.github/workflows/auto-promote-stale-alarm.yml
vendored
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
name: auto-promote-stale-alarm
|
||||||
|
|
||||||
|
# Hourly cron + on-demand alarm for the silent-block failure mode that
|
||||||
|
# motivated issue #2975:
|
||||||
|
# - The auto-promote-staging.yml workflow opened a PR + armed
|
||||||
|
# auto-merge, but main's branch protection requires a human review
|
||||||
|
# (reviewDecision=REVIEW_REQUIRED). The PR sat BLOCKED with no
|
||||||
|
# surface-up-the-stack for 12+ hours, holding 25 commits hostage
|
||||||
|
# including the Memory v2 redesign and a reno-stars data-loss fix.
|
||||||
|
#
|
||||||
|
# This workflow runs `scripts/check-stale-promote-pr.sh` against the
|
||||||
|
# repo's open auto-promote PRs (base=main head=staging). When a PR has
|
||||||
|
# been BLOCKED on REVIEW_REQUIRED for >4h, it:
|
||||||
|
# 1. Emits a workflow-level warning (visible in run summary + the
|
||||||
|
# Actions UI feed).
|
||||||
|
# 2. Posts a comment on the PR (idempotent — one alarm per PR).
|
||||||
|
#
|
||||||
|
# The detection logic lives in scripts/check-stale-promote-pr.sh so
|
||||||
|
# it's unit-testable with stubbed `gh` (see test-check-stale-promote-pr.sh).
|
||||||
|
# This file is the schedule + invocation surface only — SSOT for the
|
||||||
|
# detector itself.
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
# Hourly. Cheap (one `gh pr list` + jq), and 1h granularity is
|
||||||
|
# plenty for a 4h staleness threshold — operators see the alarm
|
||||||
|
# within at most 1h of crossing the threshold.
|
||||||
|
- cron: "27 * * * *" # at :27 to dodge the cron herd at :00
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
stale_hours:
|
||||||
|
description: "Hours after which a BLOCKED+REVIEW_REQUIRED PR is stale (default 4)"
|
||||||
|
required: false
|
||||||
|
default: "4"
|
||||||
|
post_comment:
|
||||||
|
description: "Post a comment on stale PRs (default true)"
|
||||||
|
required: false
|
||||||
|
default: "true"
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
pull-requests: write # post comments on stale PRs
|
||||||
|
|
||||||
|
# Serialize so the on-demand and scheduled runs don't double-comment
|
||||||
|
# the same PR. cancel-in-progress=false because the script is idempotent
|
||||||
|
# (existing comment marker prevents dupes), but a scheduled run firing
|
||||||
|
# while a manual one runs would just re-list the same PR set.
|
||||||
|
concurrency:
|
||||||
|
group: auto-promote-stale-alarm
|
||||||
|
cancel-in-progress: false
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
scan:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout (need scripts/ only)
|
||||||
|
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
with:
|
||||||
|
sparse-checkout: |
|
||||||
|
scripts/check-stale-promote-pr.sh
|
||||||
|
sparse-checkout-cone-mode: false
|
||||||
|
- name: Run stale-PR detector
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
GITHUB_REPOSITORY: ${{ github.repository }}
|
||||||
|
STALE_HOURS: ${{ inputs.stale_hours || '4' }}
|
||||||
|
POST_COMMENT: ${{ inputs.post_comment || 'true' }}
|
||||||
|
run: |
|
||||||
|
# The script's exit code reflects the count of stale PRs.
|
||||||
|
# We don't want a stale finding to fail the workflow run —
|
||||||
|
# the warning + comment are the signal, the green/red is
|
||||||
|
# noise. So convert any non-zero exit to a workflow notice
|
||||||
|
# and exit 0.
|
||||||
|
set +e
|
||||||
|
bash scripts/check-stale-promote-pr.sh
|
||||||
|
rc=$?
|
||||||
|
set -e
|
||||||
|
if [ "$rc" -ne 0 ]; then
|
||||||
|
echo "::notice::Stale PR detector found $rc PR(s) needing attention. See warnings above + comments on the PRs."
|
||||||
|
fi
|
||||||
|
# Always succeed — operator-facing surface is the warning,
|
||||||
|
# not the workflow status.
|
||||||
|
exit 0
|
||||||
237
.github/workflows/auto-sync-main-to-staging.yml
vendored
Normal file
237
.github/workflows/auto-sync-main-to-staging.yml
vendored
Normal file
@ -0,0 +1,237 @@
|
|||||||
|
name: Auto-sync main → staging
|
||||||
|
|
||||||
|
# Reflects every push to `main` back onto `staging` so the
|
||||||
|
# staging-as-superset-of-main invariant holds.
|
||||||
|
#
|
||||||
|
# Background:
|
||||||
|
#
|
||||||
|
# `auto-promote-staging.yml` advances main via `git merge --ff-only`
|
||||||
|
# + `git push origin main` — that's a clean fast-forward, no merge
|
||||||
|
# commit. But manual merges of `staging → main` PRs through the
|
||||||
|
# GitHub UI / API create a merge commit on main that staging
|
||||||
|
# doesn't have. The next `staging → main` PR then evaluates as
|
||||||
|
# "BEHIND" because staging is missing that merge commit, requiring
|
||||||
|
# a manual `gh pr update-branch` round-trip.
|
||||||
|
#
|
||||||
|
# This happened twice on 2026-04-28 (PRs #2202, #2205, both manual
|
||||||
|
# bridges). Each time the bridge needed update-branch + a re-CI
|
||||||
|
# round before merging. Operationally annoying and avoidable.
|
||||||
|
#
|
||||||
|
# Architecture:
|
||||||
|
#
|
||||||
|
# This repo's `staging` branch is protected by a `merge_queue`
|
||||||
|
# ruleset (id 15500102) that blocks ALL direct pushes — no bypass
|
||||||
|
# even for org admins or the GitHub Actions integration. Direct
|
||||||
|
# `git push origin staging` returns GH013. So instead of pushing
|
||||||
|
# directly, this workflow:
|
||||||
|
#
|
||||||
|
# 1. Checks if main is already in staging's ancestry → no-op.
|
||||||
|
# 2. Creates an `auto-sync/main-<sha>` branch from staging.
|
||||||
|
# 3. Tries `git merge --ff-only origin/main` → if staging hasn't
|
||||||
|
# diverged this is a clean ff.
|
||||||
|
# 4. Otherwise `git merge --no-ff origin/main` to absorb main's
|
||||||
|
# tip while keeping staging's history.
|
||||||
|
# 5. Pushes the auto-sync branch.
|
||||||
|
# 6. Opens a PR (base=staging, head=auto-sync/main-<sha>) and
|
||||||
|
# enables auto-merge so the merge queue lands it.
|
||||||
|
#
|
||||||
|
# This mirrors the path human PRs take through staging — same
|
||||||
|
# rules, same gates, no special-case bypass.
|
||||||
|
#
|
||||||
|
# Loop safety:
|
||||||
|
#
|
||||||
|
# `GITHUB_TOKEN`-authored merges (including the merge queue's land
|
||||||
|
# of the auto-sync PR) do NOT trigger downstream workflow runs
|
||||||
|
# (GitHub Actions safety). So when the auto-sync PR lands on
|
||||||
|
# staging, `auto-promote-staging.yml` is NOT triggered by that
|
||||||
|
# push. The next developer push to staging triggers auto-promote
|
||||||
|
# normally. No loop possible.
|
||||||
|
#
|
||||||
|
# Concurrency:
|
||||||
|
#
|
||||||
|
# Two pushes to main in quick succession (e.g., manual UI merge
|
||||||
|
# immediately followed by auto-promote-staging's ff-merge) could
|
||||||
|
# otherwise open two overlapping auto-sync PRs. The concurrency
|
||||||
|
# group serializes runs; the second waits for the first to exit.
|
||||||
|
# (The first run exits after opening + auto-merge-queueing the PR,
|
||||||
|
# not after the merge actually completes — so multiple PRs can be
|
||||||
|
# open simultaneously, but the merge queue handles them serially.)
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [main]
|
||||||
|
# workflow_dispatch lets:
|
||||||
|
# 1. Operators manually backfill a missed sync (e.g. after a manual
|
||||||
|
# UI merge that the runner missed).
|
||||||
|
# 2. auto-promote-staging.yml's polling tail explicitly invoke us
|
||||||
|
# after the promote PR lands. This is load-bearing: when the
|
||||||
|
# merge queue lands a promote-PR merge, the resulting push to
|
||||||
|
# `main` is "by GITHUB_TOKEN", and per GitHub's no-recursion
|
||||||
|
# rule (https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow)
|
||||||
|
# that push event does NOT fire any downstream workflows. The
|
||||||
|
# `on: push` trigger above is silently dead for the very pattern
|
||||||
|
# we exist to handle. Verified empirically 2026-05-02 against
|
||||||
|
# SHA 76c604fb (PR #2437 staging→main): only ONE workflow fired
|
||||||
|
# (publish-workspace-server-image, dispatched explicitly by
|
||||||
|
# auto-promote's polling tail with an App token). Every other
|
||||||
|
# `on: push: branches: [main]` workflow — including this one —
|
||||||
|
# was suppressed. Until the underlying merge call moves to an
|
||||||
|
# App token, an explicit dispatch is the only reliable path.
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: write
|
||||||
|
pull-requests: write
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: auto-sync-main-to-staging
|
||||||
|
cancel-in-progress: false
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
sync-staging:
|
||||||
|
# ubuntu-latest matches every other workflow in this repo. The
|
||||||
|
# earlier `[self-hosted, macos, arm64]` was a copy-paste artefact
|
||||||
|
# from the molecule-controlplane repo (which IS private and uses a
|
||||||
|
# Mac runner) — molecule-core has no Mac runner registered, so the
|
||||||
|
# job sat unassigned whenever the trigger fired. Verified 2026-05-02:
|
||||||
|
# this is the ONLY workflow in molecule-core/.github/workflows/ with
|
||||||
|
# a non-ubuntu runs-on.
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout staging
|
||||||
|
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
ref: staging
|
||||||
|
token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
|
- name: Configure git author
|
||||||
|
run: |
|
||||||
|
git config user.name "github-actions[bot]"
|
||||||
|
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
|
||||||
|
|
||||||
|
- name: Check if staging already contains main
|
||||||
|
id: check
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
git fetch origin main
|
||||||
|
if git merge-base --is-ancestor origin/main HEAD; then
|
||||||
|
echo "needs_sync=false" >> "$GITHUB_OUTPUT"
|
||||||
|
{
|
||||||
|
echo "## ✅ No-op"
|
||||||
|
echo
|
||||||
|
echo "staging already contains \`origin/main\` ($(git rev-parse --short=8 origin/main))."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
else
|
||||||
|
echo "needs_sync=true" >> "$GITHUB_OUTPUT"
|
||||||
|
MAIN_SHORT=$(git rev-parse --short=8 origin/main)
|
||||||
|
echo "main_short=${MAIN_SHORT}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "branch=auto-sync/main-${MAIN_SHORT}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "::notice::staging is missing main's tip (${MAIN_SHORT}) — opening sync PR"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Create auto-sync branch + merge main
|
||||||
|
if: steps.check.outputs.needs_sync == 'true'
|
||||||
|
id: prep
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
BRANCH="${{ steps.check.outputs.branch }}"
|
||||||
|
|
||||||
|
# If a previous auto-sync run already opened a branch for the
|
||||||
|
# same main sha, prefer reusing it (idempotent behavior on
|
||||||
|
# workflow restart). Force-update from latest staging anyway
|
||||||
|
# so it absorbs any staging-side commits that landed since.
|
||||||
|
git checkout -B "$BRANCH"
|
||||||
|
|
||||||
|
if git merge --ff-only origin/main; then
|
||||||
|
echo "did_ff=true" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "::notice::Fast-forwarded ${BRANCH} to origin/main"
|
||||||
|
else
|
||||||
|
echo "did_ff=false" >> "$GITHUB_OUTPUT"
|
||||||
|
if ! git merge --no-ff origin/main -m "chore: sync main → staging (auto)"; then
|
||||||
|
# Hygiene: leave the work tree clean before failing.
|
||||||
|
git merge --abort || true
|
||||||
|
{
|
||||||
|
echo "## ❌ Conflict"
|
||||||
|
echo
|
||||||
|
echo "Auto-merge \`main → staging\` failed with conflicts."
|
||||||
|
echo "A human needs to resolve manually."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Push auto-sync branch
|
||||||
|
if: steps.check.outputs.needs_sync == 'true'
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
# Force-with-lease so a concurrent auto-sync run can't
|
||||||
|
# silently clobber an in-flight branch we just updated. If a
|
||||||
|
# different writer touched the branch, we abort and the next
|
||||||
|
# run picks up the latest state.
|
||||||
|
git push --force-with-lease origin "${{ steps.check.outputs.branch }}"
|
||||||
|
|
||||||
|
- name: Open auto-sync PR + enable auto-merge
|
||||||
|
if: steps.check.outputs.needs_sync == 'true'
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
BRANCH: ${{ steps.check.outputs.branch }}
|
||||||
|
MAIN_SHORT: ${{ steps.check.outputs.main_short }}
|
||||||
|
DID_FF: ${{ steps.prep.outputs.did_ff }}
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Find existing PR for this branch (idempotent on workflow
|
||||||
|
# restart) before creating a new one.
|
||||||
|
PR_NUM=$(gh pr list --head "$BRANCH" --base staging --state open --json number --jq '.[0].number // ""')
|
||||||
|
|
||||||
|
if [ -z "$PR_NUM" ]; then
|
||||||
|
# Body lives in a temp file to keep the multi-line content
|
||||||
|
# out of the YAML block scalar (un-indented newlines inside
|
||||||
|
# an inline shell string break YAML parsing).
|
||||||
|
BODY_FILE=$(mktemp)
|
||||||
|
if [ "$DID_FF" = "true" ]; then
|
||||||
|
TITLE="chore: sync main → staging (auto, ff to ${MAIN_SHORT})"
|
||||||
|
cat > "$BODY_FILE" <<EOFBODY
|
||||||
|
Automated fast-forward of \`staging\` to \`origin/main\` (\`${MAIN_SHORT}\`). Staging has no in-flight commits that diverge from main. Merge queue lands this; no human action needed.
|
||||||
|
|
||||||
|
This PR is auto-generated by \`.github/workflows/auto-sync-main-to-staging.yml\` on every push to \`main\`. It exists because this repo's \`staging\` branch has a \`merge_queue\` ruleset that blocks direct pushes — even from the GitHub Actions integration.
|
||||||
|
EOFBODY
|
||||||
|
else
|
||||||
|
TITLE="chore: sync main → staging (auto, merge ${MAIN_SHORT})"
|
||||||
|
cat > "$BODY_FILE" <<EOFBODY
|
||||||
|
Automated merge of \`origin/main\` (\`${MAIN_SHORT}\`) into \`staging\`. Staging has commits main doesn't, so this is a non-ff merge that absorbs main's tip. Merge queue lands this.
|
||||||
|
|
||||||
|
This PR is auto-generated by \`.github/workflows/auto-sync-main-to-staging.yml\` on every push to \`main\`.
|
||||||
|
EOFBODY
|
||||||
|
fi
|
||||||
|
|
||||||
|
# gh pr create prints the URL on stdout; extract the PR number.
|
||||||
|
PR_URL=$(gh pr create \
|
||||||
|
--base staging \
|
||||||
|
--head "$BRANCH" \
|
||||||
|
--title "$TITLE" \
|
||||||
|
--body-file "$BODY_FILE")
|
||||||
|
PR_NUM=$(echo "$PR_URL" | grep -oE '[0-9]+$' | tail -1)
|
||||||
|
rm -f "$BODY_FILE"
|
||||||
|
echo "::notice::Opened PR #${PR_NUM}"
|
||||||
|
else
|
||||||
|
echo "::notice::Re-using existing PR #${PR_NUM} for ${BRANCH}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Enable auto-merge — the merge queue picks it up once
|
||||||
|
# required gates are green. Use --merge for merge commits
|
||||||
|
# (matches the rest of this repo's PR convention).
|
||||||
|
if ! gh pr merge "$PR_NUM" --auto --merge 2>&1; then
|
||||||
|
echo "::warning::Failed to enable auto-merge on PR #${PR_NUM} — operator may need to merge manually."
|
||||||
|
fi
|
||||||
|
|
||||||
|
{
|
||||||
|
echo "## ✅ Auto-sync PR opened"
|
||||||
|
echo
|
||||||
|
echo "- Branch: \`$BRANCH\`"
|
||||||
|
echo "- PR: #$PR_NUM"
|
||||||
|
echo "- Strategy: $([ "$DID_FF" = "true" ] && echo "ff" || echo "merge commit")"
|
||||||
|
echo
|
||||||
|
echo "Merge queue lands the PR once required gates are green; no human action needed unless gates fail."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
113
.github/workflows/auto-tag-runtime.yml
vendored
Normal file
113
.github/workflows/auto-tag-runtime.yml
vendored
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
name: auto-tag-runtime
|
||||||
|
|
||||||
|
# Auto-tag runtime releases on every merge to main that touches workspace/.
|
||||||
|
# This is the entry point of the runtime CD chain:
|
||||||
|
#
|
||||||
|
# merge PR → auto-tag-runtime (this) → publish-runtime → cascade → template
|
||||||
|
# image rebuilds → repull on hosts.
|
||||||
|
#
|
||||||
|
# Default bump is patch. Override via PR label `release:minor` or
|
||||||
|
# `release:major` BEFORE merging — the label is read off the merged PR
|
||||||
|
# associated with the push commit.
|
||||||
|
#
|
||||||
|
# Skips when:
|
||||||
|
# - The push isn't to main (other branches don't auto-release).
|
||||||
|
# - The merge commit message contains `[skip-release]` (escape hatch
|
||||||
|
# for cleanup PRs that touch workspace/ but shouldn't ship).
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [main]
|
||||||
|
paths:
|
||||||
|
- "workspace/**"
|
||||||
|
- "scripts/build_runtime_package.py"
|
||||||
|
- ".github/workflows/auto-tag-runtime.yml"
|
||||||
|
- ".github/workflows/publish-runtime.yml"
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: write # to push the new tag
|
||||||
|
pull-requests: read # to read labels off the merged PR
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
# Serialize tag bumps so two near-simultaneous merges can't both think
|
||||||
|
# they're 0.1.6 and race to push the same tag.
|
||||||
|
group: auto-tag-runtime
|
||||||
|
cancel-in-progress: false
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
tag:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
with:
|
||||||
|
fetch-depth: 0 # need full tag history for `git describe` / sort
|
||||||
|
|
||||||
|
- name: Skip when commit asks
|
||||||
|
id: skip
|
||||||
|
run: |
|
||||||
|
MSG=$(git log -1 --format=%B "${{ github.sha }}")
|
||||||
|
if echo "$MSG" | grep -qiE '\[skip-release\]|\[no-release\]'; then
|
||||||
|
echo "skip=true" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "Commit message contains [skip-release] — no tag will be created."
|
||||||
|
else
|
||||||
|
echo "skip=false" >> "$GITHUB_OUTPUT"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Determine bump kind from PR label
|
||||||
|
id: bump
|
||||||
|
if: steps.skip.outputs.skip != 'true'
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ github.token }}
|
||||||
|
run: |
|
||||||
|
# The merged PR for this push commit. `gh pr list --search` finds
|
||||||
|
# closed PRs whose merge commit matches; we take the first.
|
||||||
|
PR=$(gh pr list --state merged --search "${{ github.sha }}" --json number,labels --jq '.[0]' 2>/dev/null || echo "")
|
||||||
|
if [ -z "$PR" ] || [ "$PR" = "null" ]; then
|
||||||
|
echo "No merged PR found for ${{ github.sha }} — defaulting to patch bump."
|
||||||
|
echo "kind=patch" >> "$GITHUB_OUTPUT"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
LABELS=$(echo "$PR" | jq -r '.labels[].name')
|
||||||
|
if echo "$LABELS" | grep -qx 'release:major'; then
|
||||||
|
echo "kind=major" >> "$GITHUB_OUTPUT"
|
||||||
|
elif echo "$LABELS" | grep -qx 'release:minor'; then
|
||||||
|
echo "kind=minor" >> "$GITHUB_OUTPUT"
|
||||||
|
else
|
||||||
|
echo "kind=patch" >> "$GITHUB_OUTPUT"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Compute next version from latest runtime-v* tag
|
||||||
|
id: version
|
||||||
|
if: steps.skip.outputs.skip != 'true'
|
||||||
|
run: |
|
||||||
|
# Find the highest runtime-vX.Y.Z tag. `sort -V` handles semver
|
||||||
|
# ordering; `grep` filters to the right tag prefix.
|
||||||
|
LATEST=$(git tag --list 'runtime-v*' | sort -V | tail -1)
|
||||||
|
if [ -z "$LATEST" ]; then
|
||||||
|
# No prior tag — start the runtime line at 0.1.0.
|
||||||
|
CURRENT="0.0.0"
|
||||||
|
else
|
||||||
|
CURRENT="${LATEST#runtime-v}"
|
||||||
|
fi
|
||||||
|
MAJOR=$(echo "$CURRENT" | cut -d. -f1)
|
||||||
|
MINOR=$(echo "$CURRENT" | cut -d. -f2)
|
||||||
|
PATCH=$(echo "$CURRENT" | cut -d. -f3)
|
||||||
|
case "${{ steps.bump.outputs.kind }}" in
|
||||||
|
major) MAJOR=$((MAJOR+1)); MINOR=0; PATCH=0;;
|
||||||
|
minor) MINOR=$((MINOR+1)); PATCH=0;;
|
||||||
|
patch) PATCH=$((PATCH+1));;
|
||||||
|
esac
|
||||||
|
NEW="$MAJOR.$MINOR.$PATCH"
|
||||||
|
echo "current=$CURRENT" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "new=$NEW" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "Bumping runtime $CURRENT → $NEW (${{ steps.bump.outputs.kind }})"
|
||||||
|
|
||||||
|
- name: Push new tag
|
||||||
|
if: steps.skip.outputs.skip != 'true'
|
||||||
|
run: |
|
||||||
|
NEW_TAG="runtime-v${{ steps.version.outputs.new }}"
|
||||||
|
git config user.name "github-actions[bot]"
|
||||||
|
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
|
||||||
|
git tag -a "$NEW_TAG" -m "runtime $NEW_TAG (auto-bump from ${{ steps.bump.outputs.kind }})"
|
||||||
|
git push origin "$NEW_TAG"
|
||||||
|
echo "Pushed $NEW_TAG — publish-runtime workflow will fire on the tag."
|
||||||
81
.github/workflows/branch-protection-drift.yml
vendored
Normal file
81
.github/workflows/branch-protection-drift.yml
vendored
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
name: branch-protection drift check
|
||||||
|
|
||||||
|
# Catches out-of-band edits to branch protection (UI clicks, manual gh
|
||||||
|
# api PATCH from a one-off ops session) by comparing live state against
|
||||||
|
# tools/branch-protection/apply.sh's desired state every day. Fails the
|
||||||
|
# workflow when they drift; the failure is the signal.
|
||||||
|
#
|
||||||
|
# When it fails: re-run apply.sh to put the live state back to the
|
||||||
|
# script's intent, OR update apply.sh to encode the new intent and
|
||||||
|
# commit. Either way the script is the source of truth.
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
# 14:00 UTC daily. Off-hours for most teams; gives a fresh signal
|
||||||
|
# at the start of every working day.
|
||||||
|
- cron: '0 14 * * *'
|
||||||
|
workflow_dispatch:
|
||||||
|
pull_request:
|
||||||
|
branches: [staging, main]
|
||||||
|
paths:
|
||||||
|
- 'tools/branch-protection/**'
|
||||||
|
- '.github/workflows/branch-protection-drift.yml'
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
drift:
|
||||||
|
name: Branch protection drift
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
timeout-minutes: 5
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
|
||||||
|
# Token strategy by trigger:
|
||||||
|
#
|
||||||
|
# - schedule (daily canary): hard-fail when the admin token is
|
||||||
|
# missing. This is the *only* trigger where silent soft-skip is
|
||||||
|
# dangerous — a missing secret on the cron run means the drift
|
||||||
|
# gate has effectively disappeared with no human in the loop to
|
||||||
|
# notice. Per feedback_schedule_vs_dispatch_secrets_hardening.md
|
||||||
|
# the rule is "schedule/automated triggers must hard-fail".
|
||||||
|
#
|
||||||
|
# - pull_request (touching tools/branch-protection/**): soft-skip
|
||||||
|
# with a prominent warning. A PR cannot retroactively drift the
|
||||||
|
# live state — drift happens *between* PRs (UI clicks, manual
|
||||||
|
# gh api PATCH) and is the schedule's job to catch. The PR-time
|
||||||
|
# gate would only catch typos in apply.sh, which the apply.sh
|
||||||
|
# *_payload unit tests catch better. A human is reviewing the
|
||||||
|
# PR and will see the warning in the workflow log.
|
||||||
|
#
|
||||||
|
# - workflow_dispatch (operator one-off): soft-skip with warning,
|
||||||
|
# so an operator can run a diagnostic without configuring the
|
||||||
|
# secret first.
|
||||||
|
- name: Verify admin token present (hard-fail on schedule only)
|
||||||
|
env:
|
||||||
|
GH_TOKEN_FOR_ADMIN_API: ${{ secrets.GH_TOKEN_FOR_ADMIN_API }}
|
||||||
|
run: |
|
||||||
|
if [[ -n "$GH_TOKEN_FOR_ADMIN_API" ]]; then
|
||||||
|
echo "GH_TOKEN_FOR_ADMIN_API present — drift_check will run with admin scope."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
if [[ "${{ github.event_name }}" == "schedule" ]]; then
|
||||||
|
echo "::error::GH_TOKEN_FOR_ADMIN_API secret missing on the daily canary." >&2
|
||||||
|
echo "" >&2
|
||||||
|
echo "The schedule run is the SoT for branch-protection drift detection." >&2
|
||||||
|
echo "Without admin scope it silently passes, hiding any out-of-band edits." >&2
|
||||||
|
echo "Set GH_TOKEN_FOR_ADMIN_API at Settings → Secrets and variables → Actions." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "::warning::GH_TOKEN_FOR_ADMIN_API secret missing — drift_check will be SKIPPED."
|
||||||
|
echo "::warning::PR drift checks need repo-admin scope to read /branches/:b/protection."
|
||||||
|
echo "::warning::This is non-fatal: the daily schedule run is the canonical drift gate."
|
||||||
|
echo "SKIP_DRIFT_CHECK=1" >> "$GITHUB_ENV"
|
||||||
|
|
||||||
|
- name: Run drift check
|
||||||
|
if: env.SKIP_DRIFT_CHECK != '1'
|
||||||
|
env:
|
||||||
|
# Repo-admin scope, needed for /branches/:b/protection.
|
||||||
|
GH_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADMIN_API }}
|
||||||
|
run: bash tools/branch-protection/drift_check.sh
|
||||||
82
.github/workflows/canary-staging.yml
vendored
82
.github/workflows/canary-staging.yml
vendored
@ -20,19 +20,6 @@ on:
|
|||||||
# a few minutes under load — that's fine for a canary.
|
# a few minutes under load — that's fine for a canary.
|
||||||
- cron: '*/30 * * * *'
|
- cron: '*/30 * * * *'
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
inputs:
|
|
||||||
keep_on_failure:
|
|
||||||
description: >-
|
|
||||||
Skip teardown when the canary fails (debugging only). The
|
|
||||||
tenant org + EC2 + CF tunnel + DNS stay alive so an operator
|
|
||||||
can SSM into the workspace EC2 and capture docker logs of the
|
|
||||||
failing claude-code container. REMEMBER to manually delete
|
|
||||||
via DELETE /cp/admin/tenants/<slug> when done so the org
|
|
||||||
doesn't accumulate cost. Only honored on workflow_dispatch;
|
|
||||||
cron runs always tear down (we don't want unattended cron
|
|
||||||
to leak resources).
|
|
||||||
type: boolean
|
|
||||||
default: false
|
|
||||||
|
|
||||||
# Serialise with the full-SaaS workflow so they don't contend for the
|
# Serialise with the full-SaaS workflow so they don't contend for the
|
||||||
# same org-create quota on staging. Different group key from
|
# same org-create quota on staging. Different group key from
|
||||||
@ -93,14 +80,6 @@ jobs:
|
|||||||
# is "Token Plan only" but cheap-per-token and fast.
|
# is "Token Plan only" but cheap-per-token and fast.
|
||||||
E2E_MODEL_SLUG: MiniMax-M2.7-highspeed
|
E2E_MODEL_SLUG: MiniMax-M2.7-highspeed
|
||||||
E2E_RUN_ID: "canary-${{ github.run_id }}"
|
E2E_RUN_ID: "canary-${{ github.run_id }}"
|
||||||
# Debug-only: when an operator dispatches with keep_on_failure=true,
|
|
||||||
# the canary script's E2E_KEEP_ORG=1 path skips teardown so the
|
|
||||||
# tenant org + EC2 stay alive for SSM-based log capture. Cron runs
|
|
||||||
# never set this (the input only exists on workflow_dispatch) so
|
|
||||||
# unattended cron always tears down. See molecule-core#129
|
|
||||||
# failure mode #1 — capturing the actual exception requires
|
|
||||||
# docker logs from the live container.
|
|
||||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_on_failure == 'true' && '1' || '0' }}
|
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
@ -158,28 +137,27 @@ jobs:
|
|||||||
id: canary
|
id: canary
|
||||||
run: bash tests/e2e/test_staging_full_saas.sh
|
run: bash tests/e2e/test_staging_full_saas.sh
|
||||||
|
|
||||||
# Alerting: open a sticky issue on the FIRST failure; comment on
|
# Alerting: open an issue only after THREE consecutive failures so
|
||||||
# subsequent failures; auto-close on next green. Comment-on-existing
|
# transient flakes (Cloudflare DNS hiccup, AWS API blip) don't spam
|
||||||
# de-duplicates so a single open issue accumulates the streak —
|
# the issue list. If an issue is already open, we still comment on
|
||||||
# ops sees one issue with N comments rather than N issues.
|
# every failure so ops sees the streak. Auto-close on next green.
|
||||||
#
|
#
|
||||||
# Why no consecutive-failures threshold (e.g., wait 3 runs before
|
# Threshold rationale: canary fires every 30 min, so 3 failures =
|
||||||
# filing): the prior threshold check used
|
# ~90 min of consecutive red — well past any single-run flake but
|
||||||
# `github.rest.actions.listWorkflowRuns()` which Gitea 1.22.6 does
|
# still tight enough that a real outage gets surfaced before the
|
||||||
# not expose (returns 404). On Gitea Actions the threshold call
|
# next deploy window.
|
||||||
# ALWAYS failed, breaking the entire alerting step and going days
|
|
||||||
# silent on real regressions (38h+ chronic red on 2026-05-07/08
|
|
||||||
# before this fix; tracked in molecule-core#129). Filing on first
|
|
||||||
# failure is also better UX — we want to know about the first red,
|
|
||||||
# not wait 90 min for it to "count." Real flakes get one issue +
|
|
||||||
# a quick close-on-green; persistent reds accumulate comments.
|
|
||||||
- name: Open issue on failure
|
- name: Open issue on failure
|
||||||
if: failure()
|
if: failure()
|
||||||
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
|
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
|
||||||
|
env:
|
||||||
|
# Inject the workflow path explicitly — context.workflow is
|
||||||
|
# the *name*, not the file path the actions API needs.
|
||||||
|
WORKFLOW_PATH: '.github/workflows/canary-staging.yml'
|
||||||
|
CONSECUTIVE_THRESHOLD: '3'
|
||||||
with:
|
with:
|
||||||
script: |
|
script: |
|
||||||
const title = '🔴 Canary failing: staging SaaS smoke';
|
const title = '🔴 Canary failing: staging SaaS smoke';
|
||||||
const runURL = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
|
const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
|
||||||
|
|
||||||
// Find an existing open canary issue (stable title match).
|
// Find an existing open canary issue (stable title match).
|
||||||
// If one exists, this isn't a "first failure" — comment and exit.
|
// If one exists, this isn't a "first failure" — comment and exit.
|
||||||
@ -199,12 +177,32 @@ jobs:
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// No open issue yet — file one on this first failure. The
|
// No open issue yet — check the last N-1 runs' conclusions.
|
||||||
// comment-on-existing branch above means subsequent failures
|
// We open the issue only if the last (THRESHOLD-1) runs ALSO
|
||||||
// accumulate as comments on this same issue, so we don't
|
// failed (so this is the 3rd consecutive red).
|
||||||
// spam new issues per run.
|
const threshold = parseInt(process.env.CONSECUTIVE_THRESHOLD, 10);
|
||||||
|
const { data: runs } = await github.rest.actions.listWorkflowRuns({
|
||||||
|
owner: context.repo.owner, repo: context.repo.repo,
|
||||||
|
workflow_id: process.env.WORKFLOW_PATH,
|
||||||
|
status: 'completed',
|
||||||
|
per_page: threshold,
|
||||||
|
// Skip the current in-progress run; it isn't 'completed' yet.
|
||||||
|
});
|
||||||
|
// listWorkflowRuns returns recent first. We need (threshold-1)
|
||||||
|
// prior failures (current run is the threshold-th).
|
||||||
|
const priorFailures = (runs.workflow_runs || [])
|
||||||
|
.slice(0, threshold - 1)
|
||||||
|
.filter(r => r.id !== context.runId)
|
||||||
|
.filter(r => r.conclusion === 'failure')
|
||||||
|
.length;
|
||||||
|
if (priorFailures < threshold - 1) {
|
||||||
|
core.info(`Below threshold: ${priorFailures + 1}/${threshold} consecutive failures — not filing yet`);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
const body =
|
const body =
|
||||||
`Canary run failed at ${new Date().toISOString()}.\n\n` +
|
`Canary run failed at ${new Date().toISOString()}, ` +
|
||||||
|
`${threshold} consecutive runs red.\n\n` +
|
||||||
`Run: ${runURL}\n\n` +
|
`Run: ${runURL}\n\n` +
|
||||||
`This issue auto-closes on the next green canary run. ` +
|
`This issue auto-closes on the next green canary run. ` +
|
||||||
`Consecutive failures add a comment here rather than a new issue.`;
|
`Consecutive failures add a comment here rather than a new issue.`;
|
||||||
@ -213,7 +211,7 @@ jobs:
|
|||||||
title, body,
|
title, body,
|
||||||
labels: ['canary-staging', 'bug'],
|
labels: ['canary-staging', 'bug'],
|
||||||
});
|
});
|
||||||
core.info('Opened canary failure issue (first red)');
|
core.info(`Opened canary failure issue (${threshold} consecutive reds)`);
|
||||||
|
|
||||||
- name: Auto-close canary issue on success
|
- name: Auto-close canary issue on success
|
||||||
if: success()
|
if: success()
|
||||||
|
|||||||
157
.github/workflows/canary-verify.yml
vendored
157
.github/workflows/canary-verify.yml
vendored
@ -1,34 +1,19 @@
|
|||||||
name: canary-verify
|
name: canary-verify
|
||||||
|
|
||||||
# Runs the canary smoke suite against the staging canary tenant fleet
|
# Runs the canary smoke suite against the staging canary tenant fleet
|
||||||
# after a new :staging-<sha> image lands in ECR. On green, calls the
|
# after a new :staging-<sha> image lands in GHCR. On green, promotes
|
||||||
# CP redeploy-fleet endpoint to promote :staging-<sha> → :latest so
|
# :staging-<sha> → :latest so the prod tenant fleet's 5-minute
|
||||||
# the prod tenant fleet's 5-minute auto-updater picks up the verified
|
# auto-updater picks up the verified digest. On red, :latest stays
|
||||||
# digest. On red, :latest stays on the prior known-good digest and
|
# on the prior known-good digest and prod is untouched.
|
||||||
# prod is untouched.
|
|
||||||
#
|
|
||||||
# Registry note (2026-05-10): This workflow previously used GHCR
|
|
||||||
# (ghcr.io/molecule-ai/platform-tenant) — that registry was retired
|
|
||||||
# during the 2026-05-06 Gitea suspension migration when publish-
|
|
||||||
# workspace-server-image.yml switched to the operator's ECR org
|
|
||||||
# (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/
|
|
||||||
# platform-tenant). The GHCR → ECR migration was never applied to
|
|
||||||
# this file, so canary-verify was silently smoke-testing the stale
|
|
||||||
# GHCR image while the actual staging/prod tenants ran the ECR image.
|
|
||||||
# Result: smoke tests could not catch a broken ECR build. Fix:
|
|
||||||
# - Wait step: reads SHA from running canary /health (tenant-
|
|
||||||
# agnostic, works regardless of registry).
|
|
||||||
# - Promote step: calls CP redeploy-fleet endpoint with target_tag=
|
|
||||||
# staging-<sha>, same mechanism as redeploy-tenants-on-main.yml.
|
|
||||||
# No longer attempts GHCR crane ops.
|
|
||||||
#
|
#
|
||||||
# Dependencies:
|
# Dependencies:
|
||||||
# - publish-workspace-server-image.yml publishes :staging-<sha>
|
# - publish-workspace-server-image.yml publishes :staging-<sha>
|
||||||
# to ECR on staging and main merges.
|
# (NOT :latest) on main merge
|
||||||
# - Canary tenants are configured to pull :staging-<sha> from ECR
|
# - canary tenants are configured to pull :staging-<sha> as their
|
||||||
# (TENANT_IMAGE env set to the ECR :staging-<sha> tag).
|
# tenant image (set TENANT_IMAGE=ghcr.io/…:staging-<sha> on the
|
||||||
|
# canary provisioner code path OR rotate via an admin endpoint)
|
||||||
# - Repo secrets CANARY_TENANT_URLS / CANARY_ADMIN_TOKENS /
|
# - Repo secrets CANARY_TENANT_URLS / CANARY_ADMIN_TOKENS /
|
||||||
# CANARY_CP_SHARED_SECRET are populated.
|
# CANARY_CP_SHARED_SECRET are populated
|
||||||
|
|
||||||
on:
|
on:
|
||||||
workflow_run:
|
workflow_run:
|
||||||
@ -42,12 +27,8 @@ permissions:
|
|||||||
actions: read
|
actions: read
|
||||||
|
|
||||||
env:
|
env:
|
||||||
# ECR registry (post-2026-05-06 SSOT for tenant images).
|
IMAGE_NAME: ghcr.io/molecule-ai/platform
|
||||||
# publish-workspace-server-image.yml pushes here.
|
TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant
|
||||||
IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
|
|
||||||
TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
|
|
||||||
# CP endpoint for redeploy-fleet (used in promote step below).
|
|
||||||
CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
canary-smoke:
|
canary-smoke:
|
||||||
@ -71,12 +52,6 @@ jobs:
|
|||||||
# the new SHA (~2-3 min typical vs 6 min fixed). Falls back to
|
# the new SHA (~2-3 min typical vs 6 min fixed). Falls back to
|
||||||
# proceeding after 7 min even if not all canaries responded —
|
# proceeding after 7 min even if not all canaries responded —
|
||||||
# the smoke suite will catch any that didn't update.
|
# the smoke suite will catch any that didn't update.
|
||||||
#
|
|
||||||
# NOTE: The SHA is read from the running tenant's /health response,
|
|
||||||
# NOT from a registry lookup. This is registry-agnostic and works
|
|
||||||
# regardless of whether the tenant pulls from ECR, GHCR, or any
|
|
||||||
# other registry — the canary is telling us what it's actually
|
|
||||||
# running, which is the ground truth for smoke testing.
|
|
||||||
env:
|
env:
|
||||||
CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
|
CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
|
||||||
EXPECTED_SHA: ${{ steps.compute.outputs.sha }}
|
EXPECTED_SHA: ${{ steps.compute.outputs.sha }}
|
||||||
@ -133,7 +108,7 @@ jobs:
|
|||||||
echo
|
echo
|
||||||
echo "One or more canary secrets are unset (\`CANARY_TENANT_URLS\`, \`CANARY_ADMIN_TOKENS\`, \`CANARY_CP_SHARED_SECRET\`)."
|
echo "One or more canary secrets are unset (\`CANARY_TENANT_URLS\`, \`CANARY_ADMIN_TOKENS\`, \`CANARY_CP_SHARED_SECRET\`)."
|
||||||
echo "Phase 2 canary fleet has not been stood up yet —"
|
echo "Phase 2 canary fleet has not been stood up yet —"
|
||||||
echo "see [canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)."
|
echo "see [canary-tenants.md](https://github.com/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)."
|
||||||
echo
|
echo
|
||||||
echo "**Skipped — promote-to-latest will NOT auto-fire.** Dispatch \`promote-latest.yml\` manually when ready."
|
echo "**Skipped — promote-to-latest will NOT auto-fire.** Dispatch \`promote-latest.yml\` manually when ready."
|
||||||
} >> "$GITHUB_STEP_SUMMARY"
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
@ -158,98 +133,42 @@ jobs:
|
|||||||
} >> "$GITHUB_STEP_SUMMARY"
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
|
||||||
promote-to-latest:
|
promote-to-latest:
|
||||||
# On green, calls the CP redeploy-fleet endpoint with target_tag=
|
# On green, retag :staging-<sha> → :latest for BOTH images.
|
||||||
# staging-<sha> to promote the verified ECR image. This is the same
|
# crane is a lightweight registry client (no Docker daemon needed on
|
||||||
# mechanism as redeploy-tenants-on-main.yml — no GHCR crane ops.
|
# the runner) that can retag remotely with a single API call each.
|
||||||
#
|
# Gated on smoke_ran=true — without a real canary fleet the smoke
|
||||||
# Pre-fix history: the old GHCR promote step used `crane tag` against
|
# step no-ops with success, and we don't want that to silently
|
||||||
# ghcr.io/molecule-ai/platform-tenant, but publish-workspace-server-
|
# auto-promote every main merge.
|
||||||
# image.yml had already migrated to ECR on 2026-05-07 (commit
|
|
||||||
# 10e510f5). The GHCR tags were never updated, so this step was
|
|
||||||
# silently promoting a stale GHCR image while actual prod tenants
|
|
||||||
# pulled from ECR. Canary smoke tests were GHCR-targeted and could
|
|
||||||
# not catch a broken ECR build.
|
|
||||||
needs: canary-smoke
|
needs: canary-smoke
|
||||||
if: ${{ needs.canary-smoke.result == 'success' && needs.canary-smoke.outputs.smoke_ran == 'true' }}
|
if: ${{ needs.canary-smoke.result == 'success' && needs.canary-smoke.outputs.smoke_ran == 'true' }}
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
env:
|
|
||||||
SHA: ${{ needs.canary-smoke.outputs.sha }}
|
|
||||||
CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
|
|
||||||
# CP_ADMIN_API_TOKEN gates write access to the redeploy endpoint.
|
|
||||||
# Stored at the repo level so all workflows pick it up automatically.
|
|
||||||
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
|
|
||||||
# canary_slug pin: deploy the verified :staging-<sha> to the canary
|
|
||||||
# first (soak 120s), then fan out to the rest of the fleet.
|
|
||||||
CANARY_SLUG: ${{ vars.CANARY_PROMOTE_SLUG || '' }}
|
|
||||||
SOAK_SECONDS: ${{ vars.CANARY_PROMOTE_SOAK || '120' }}
|
|
||||||
BATCH_SIZE: ${{ vars.CANARY_PROMOTE_BATCH || '3' }}
|
|
||||||
steps:
|
steps:
|
||||||
- name: Check CP credentials
|
- uses: imjasonh/setup-crane@6da1ae018866400525525ce74ff892880c099987 # v0.5
|
||||||
|
|
||||||
|
- name: GHCR login
|
||||||
run: |
|
run: |
|
||||||
if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
|
echo "${{ secrets.GITHUB_TOKEN }}" | \
|
||||||
echo "::error::CP_ADMIN_API_TOKEN secret is not set — promote step cannot call redeploy-fleet."
|
crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin
|
||||||
echo "::error::Set it at: repo Settings → Actions → Variables and Secrets → New Secret."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Promote verified ECR image to :latest
|
- name: Retag platform :staging-<sha> → :latest
|
||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
crane tag \
|
||||||
|
"${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \
|
||||||
|
latest
|
||||||
|
|
||||||
TARGET_TAG="staging-${SHA}"
|
- name: Retag tenant :staging-<sha> → :latest
|
||||||
BODY=$(jq -nc \
|
run: |
|
||||||
--arg tag "$TARGET_TAG" \
|
crane tag \
|
||||||
--argjson soak "${SOAK_SECONDS:-120}" \
|
"${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \
|
||||||
--argjson batch "${BATCH_SIZE:-3}" \
|
latest
|
||||||
--argjson dry false \
|
|
||||||
'{
|
|
||||||
target_tag: $tag,
|
|
||||||
soak_seconds: $soak,
|
|
||||||
batch_size: $batch,
|
|
||||||
dry_run: $dry
|
|
||||||
}')
|
|
||||||
|
|
||||||
if [ -n "${CANARY_SLUG:-}" ]; then
|
|
||||||
BODY=$(jq '. * {canary_slug: $slug}' --arg slug "$CANARY_SLUG" <<<"$BODY")
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Calling: POST $CP_URL/cp/admin/tenants/redeploy-fleet"
|
|
||||||
echo " target_tag: $TARGET_TAG"
|
|
||||||
echo " body: $BODY"
|
|
||||||
|
|
||||||
HTTP_RESPONSE=$(mktemp)
|
|
||||||
HTTP_CODE_FILE=$(mktemp)
|
|
||||||
set +e
|
|
||||||
curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
|
|
||||||
-m 1200 \
|
|
||||||
-H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
|
|
||||||
-d "$BODY" >"$HTTP_CODE_FILE"
|
|
||||||
CURL_EXIT=$?
|
|
||||||
set -e
|
|
||||||
|
|
||||||
HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
|
|
||||||
[ -z "$HTTP_CODE" ] && HTTP_CODE="000"
|
|
||||||
|
|
||||||
echo "HTTP $HTTP_CODE (curl exit $CURL_EXIT)"
|
|
||||||
cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
|
|
||||||
|
|
||||||
if [ "$HTTP_CODE" -ge 400 ]; then
|
|
||||||
echo "::error::CP redeploy-fleet returned HTTP $HTTP_CODE — refusing to proceed."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Summary
|
- name: Summary
|
||||||
run: |
|
run: |
|
||||||
{
|
{
|
||||||
echo "## Canary verified — :latest promoted via CP redeploy-fleet"
|
echo "## Canary verified — :latest promoted"
|
||||||
echo ""
|
echo
|
||||||
echo "- **Target tag:** \`staging-${{ needs.canary-smoke.outputs.sha }}\`"
|
echo "- \`${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${IMAGE_NAME}:latest\`"
|
||||||
echo "- **Registry:** ECR (\`${TENANT_IMAGE_NAME}\`)"
|
echo "- \`${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${TENANT_IMAGE_NAME}:latest\`"
|
||||||
echo "- **Canary slug:** \`${CANARY_SLUG:-<none>}\` (soak ${SOAK_SECONDS}s)"
|
echo
|
||||||
echo "- **Batch size:** ${BATCH_SIZE:-3}"
|
echo "Prod tenant fleet will pick up the new digest on its next 5-min auto-update cycle."
|
||||||
echo ""
|
|
||||||
echo "CP redeploy-fleet is rolling out the verified image across the prod fleet."
|
|
||||||
echo "The fleet's 5-minute health-check loop will pick up the update automatically."
|
|
||||||
} >> "$GITHUB_STEP_SUMMARY"
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
|
|||||||
123
.github/workflows/check-merge-group-trigger.yml
vendored
Normal file
123
.github/workflows/check-merge-group-trigger.yml
vendored
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
name: Check merge_group trigger on required workflows
|
||||||
|
|
||||||
|
# Pre-merge guard against the deadlock pattern where a workflow whose
|
||||||
|
# check is in `required_status_checks` lacks a `merge_group:` trigger.
|
||||||
|
# Without it, GitHub merge queue stalls forever in AWAITING_CHECKS
|
||||||
|
# because the required check can't fire on `gh-readonly-queue/...` refs.
|
||||||
|
#
|
||||||
|
# This workflow:
|
||||||
|
# 1. Lists required status checks on the branch protection rule for `staging`
|
||||||
|
# 2. For each required check, finds the workflow that produces it (by job
|
||||||
|
# name match)
|
||||||
|
# 3. Fails if any such workflow lacks `merge_group:` in its triggers
|
||||||
|
#
|
||||||
|
# Reasoning for staging-only: main has its own CI gating model (PR review),
|
||||||
|
# but staging is what the merge queue runs on, so it's the trigger that
|
||||||
|
# matters.
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- '.github/workflows/**.yml'
|
||||||
|
- '.github/workflows/**.yaml'
|
||||||
|
push:
|
||||||
|
branches: [staging, main]
|
||||||
|
paths:
|
||||||
|
- '.github/workflows/**.yml'
|
||||||
|
- '.github/workflows/**.yaml'
|
||||||
|
# Self-listen on merge_group so the linter passes its own queue run.
|
||||||
|
merge_group:
|
||||||
|
types: [checks_requested]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
check:
|
||||||
|
name: Required workflows have merge_group trigger
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
- name: Verify merge_group trigger on required-check workflows
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
REPO: ${{ github.repository }}
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Branch we care about — the one merge queue runs on.
|
||||||
|
BRANCH=staging
|
||||||
|
|
||||||
|
# Pull the list of required status check contexts. If the branch
|
||||||
|
# has no protection or no required checks, exit clean — nothing
|
||||||
|
# to lint.
|
||||||
|
REQUIRED=$(gh api "repos/${REPO}/branches/${BRANCH}/protection/required_status_checks" \
|
||||||
|
--jq '.contexts[]' 2>/dev/null || true)
|
||||||
|
if [ -z "$REQUIRED" ]; then
|
||||||
|
echo "No required status checks on ${BRANCH} — nothing to verify."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Required checks on ${BRANCH}:"
|
||||||
|
echo "${REQUIRED}" | sed 's/^/ - /'
|
||||||
|
echo
|
||||||
|
|
||||||
|
# Build a map: workflow file -> set of job names declared in it.
|
||||||
|
# We use yq if available, otherwise grep the `name:` lines under
|
||||||
|
# `jobs:`. Stick with grep for portability — runner image always
|
||||||
|
# has it; yq isn't in the default image as of 2026-04.
|
||||||
|
declare -A workflow_jobs
|
||||||
|
shopt -s nullglob
|
||||||
|
for wf in .github/workflows/*.yml .github/workflows/*.yaml; do
|
||||||
|
[ -f "$wf" ] || continue
|
||||||
|
# Extract the workflow name (the `name:` at file root).
|
||||||
|
wf_name=$(awk '/^name:[[:space:]]/ {sub(/^name:[[:space:]]+/,""); gsub(/^"|"$/,""); print; exit}' "$wf")
|
||||||
|
# Extract job step names from the `jobs:` block. A job step is:
|
||||||
|
# - id under `jobs:` (key with 2-space indent followed by colon)
|
||||||
|
# - the `name:` field inside that job (4-space indent)
|
||||||
|
# We collect both because required_status_checks contexts can
|
||||||
|
# match either, depending on how the workflow was authored.
|
||||||
|
jobs_block=$(awk '/^jobs:/{flag=1; next} flag' "$wf")
|
||||||
|
job_names=$(echo "$jobs_block" | awk '/^[[:space:]]{4}name:[[:space:]]/ {sub(/^[[:space:]]+name:[[:space:]]+/,""); gsub(/^["'"'"']|["'"'"']$/,""); print}')
|
||||||
|
workflow_jobs["$wf"]="${wf_name}"$'\n'"${job_names}"
|
||||||
|
done
|
||||||
|
|
||||||
|
# For each required check, find the workflow that produces it.
|
||||||
|
# Then verify that workflow lists merge_group as a trigger.
|
||||||
|
FAILED=0
|
||||||
|
while IFS= read -r check; do
|
||||||
|
[ -z "$check" ] && continue
|
||||||
|
owning_wf=""
|
||||||
|
for wf in "${!workflow_jobs[@]}"; do
|
||||||
|
if echo "${workflow_jobs[$wf]}" | grep -Fxq "$check"; then
|
||||||
|
owning_wf="$wf"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ -z "$owning_wf" ]; then
|
||||||
|
echo "::warning::Required check '${check}' has no matching workflow in this repo. Skipping (may be from an external app)."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Does the workflow's trigger list include merge_group?
|
||||||
|
# Match either bare `merge_group:` line or merge_group with
|
||||||
|
# subsequent indented config (types: [checks_requested]).
|
||||||
|
if grep -qE '^[[:space:]]*merge_group:' "$owning_wf"; then
|
||||||
|
echo "OK: '${check}' (in $owning_wf) — has merge_group trigger"
|
||||||
|
else
|
||||||
|
echo "::error file=${owning_wf}::Required check '${check}' is produced by ${owning_wf}, but the workflow does not declare a 'merge_group:' trigger. With merge queue enabled on ${BRANCH}, this will deadlock the queue (every PR sits AWAITING_CHECKS forever). Add this to the workflow's 'on:' block:"
|
||||||
|
echo "::error file=${owning_wf}:: merge_group:"
|
||||||
|
echo "::error file=${owning_wf}:: types: [checks_requested]"
|
||||||
|
FAILED=1
|
||||||
|
fi
|
||||||
|
done <<< "$REQUIRED"
|
||||||
|
|
||||||
|
if [ "$FAILED" -ne 0 ]; then
|
||||||
|
echo
|
||||||
|
echo "::error::Block. See errors above. Reference: $(grep -l 'reference_merge_queue' /dev/null 2>/dev/null || echo 'memory: reference_merge_queue_enablement.md')."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "All required workflows on ${BRANCH} declare merge_group triggers."
|
||||||
27
.github/workflows/ci.yml
vendored
27
.github/workflows/ci.yml
vendored
@ -235,13 +235,7 @@ jobs:
|
|||||||
run: npx vitest run --coverage
|
run: npx vitest run --coverage
|
||||||
- name: Upload coverage summary as artifact
|
- name: Upload coverage summary as artifact
|
||||||
if: needs.changes.outputs.canvas == 'true' && always()
|
if: needs.changes.outputs.canvas == 'true' && always()
|
||||||
# Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
|
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||||
# the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
|
|
||||||
# implement, surfacing as `GHESNotSupportedError: @actions/artifact
|
|
||||||
# v2.0.0+, upload-artifact@v4+ and download-artifact@v4+ are not
|
|
||||||
# currently supported on GHES`. Drop this pin when Gitea ships
|
|
||||||
# the v4 protocol (tracked: post-Gitea-1.23 followup).
|
|
||||||
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
|
|
||||||
with:
|
with:
|
||||||
name: canvas-coverage-${{ github.run_id }}
|
name: canvas-coverage-${{ github.run_id }}
|
||||||
path: canvas/coverage/
|
path: canvas/coverage/
|
||||||
@ -304,9 +298,13 @@ jobs:
|
|||||||
needs: [changes, canvas-build]
|
needs: [changes, canvas-build]
|
||||||
# Only fires on direct pushes to main (i.e. after staging→main promotion).
|
# Only fires on direct pushes to main (i.e. after staging→main promotion).
|
||||||
if: needs.changes.outputs.canvas == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main'
|
if: needs.changes.outputs.canvas == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main'
|
||||||
|
permissions:
|
||||||
|
# Required to post commit comments via the GitHub API.
|
||||||
|
contents: write
|
||||||
steps:
|
steps:
|
||||||
- name: Write deploy reminder to step summary
|
- name: Post deploy reminder as commit comment
|
||||||
env:
|
env:
|
||||||
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
COMMIT_SHA: ${{ github.sha }}
|
COMMIT_SHA: ${{ github.sha }}
|
||||||
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||||
run: |
|
run: |
|
||||||
@ -333,13 +331,10 @@ jobs:
|
|||||||
printf '\n> Posted automatically by CI · commit `%s` · [build log](%s)\n' \
|
printf '\n> Posted automatically by CI · commit `%s` · [build log](%s)\n' \
|
||||||
"$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-reminder.md
|
"$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-reminder.md
|
||||||
|
|
||||||
# Gitea has no commit-comments API (no equivalent of
|
gh api \
|
||||||
# POST /repos/{owner}/{repo}/commits/{commit_sha}/comments).
|
--method POST \
|
||||||
# Write to GITHUB_STEP_SUMMARY instead — both GitHub Actions and
|
"repos/${{ github.repository }}/commits/${{ github.sha }}/comments" \
|
||||||
# Gitea Actions render this as the workflow run's summary page,
|
--field "body=@/tmp/deploy-reminder.md"
|
||||||
# which is where operators look for post-deploy action items.
|
|
||||||
# (#75 / PR-D)
|
|
||||||
cat /tmp/deploy-reminder.md >> "$GITHUB_STEP_SUMMARY"
|
|
||||||
|
|
||||||
# Python Lint & Test — required check, always runs. See platform-build
|
# Python Lint & Test — required check, always runs. See platform-build
|
||||||
# for the rationale.
|
# for the rationale.
|
||||||
@ -365,7 +360,7 @@ jobs:
|
|||||||
cache: pip
|
cache: pip
|
||||||
cache-dependency-path: workspace/requirements.txt
|
cache-dependency-path: workspace/requirements.txt
|
||||||
- if: needs.changes.outputs.python == 'true'
|
- if: needs.changes.outputs.python == 'true'
|
||||||
run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov sqlalchemy>=2.0.0
|
run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov
|
||||||
# Coverage flags + fail-under floor moved into workspace/pytest.ini
|
# Coverage flags + fail-under floor moved into workspace/pytest.ini
|
||||||
# (issue #1817) so local `pytest` and CI use identical config.
|
# (issue #1817) so local `pytest` and CI use identical config.
|
||||||
- if: needs.changes.outputs.python == 'true'
|
- if: needs.changes.outputs.python == 'true'
|
||||||
|
|||||||
122
.github/workflows/codeql.yml
vendored
Normal file
122
.github/workflows/codeql.yml
vendored
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
name: CodeQL
|
||||||
|
|
||||||
|
# Controls CodeQL scan triggers for this repo.
|
||||||
|
#
|
||||||
|
# GitHub's "Code quality" default setup (the UI-configured one) is
|
||||||
|
# hardcoded to only scan the default branch — on this repo that's
|
||||||
|
# `staging`, so PRs promoting staging→main would otherwise never be
|
||||||
|
# scanned. This workflow fills that gap by explicitly scanning both
|
||||||
|
# branches on push and PR.
|
||||||
|
#
|
||||||
|
# Runs on ubuntu-latest (GHA-hosted — public repo, free). GHAS is NOT
|
||||||
|
# enabled on this repo, so results are not uploaded to the Security
|
||||||
|
# tab — the scan fails the PR check on findings, and the SARIF is
|
||||||
|
# kept as a workflow artifact for triage.
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [main, staging]
|
||||||
|
pull_request:
|
||||||
|
branches: [main, staging]
|
||||||
|
# GitHub merge queue fires `merge_group` for the queue's pre-merge CI run.
|
||||||
|
# Required so CodeQL Analyze checks get a real result on the queued
|
||||||
|
# commit instead of a false-green. Event only fires once merge queue is
|
||||||
|
# enabled on the target branch — safe to add unconditionally.
|
||||||
|
merge_group:
|
||||||
|
types: [checks_requested]
|
||||||
|
schedule:
|
||||||
|
# Weekly run picks up findings in code that hasn't been touched.
|
||||||
|
- cron: '30 1 * * 0'
|
||||||
|
|
||||||
|
# Workflow-level concurrency: only one CodeQL run per branch/PR at a time.
|
||||||
|
# `cancel-in-progress: false` queues new runs so a quick follow-up push
|
||||||
|
# doesn't nuke a 45-min analysis mid-flight.
|
||||||
|
concurrency:
|
||||||
|
group: codeql-${{ github.ref }}
|
||||||
|
cancel-in-progress: false
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
actions: read
|
||||||
|
contents: read
|
||||||
|
# No security-events: write — we don't call the upload API.
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
analyze:
|
||||||
|
name: Analyze (${{ matrix.language }})
|
||||||
|
# CodeQL set to advisory (non-blocking) on Gitea Actions — Hongming dec'''n 2026-05-07 (#156).
|
||||||
|
# Findings still emit as SARIF artifacts; failing CodeQL run does not block PR merge.
|
||||||
|
continue-on-error: true
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
timeout-minutes: 45
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
language: [go, javascript-typescript, python]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
|
||||||
|
# github-app-auth sibling-checkout removed 2026-05-07 (#157):
|
||||||
|
# plugin was dropped + the Dockerfile no longer needs it.
|
||||||
|
# jq is pre-installed on ubuntu-latest — no setup step needed.
|
||||||
|
|
||||||
|
- name: Initialize CodeQL
|
||||||
|
uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2
|
||||||
|
with:
|
||||||
|
languages: ${{ matrix.language }}
|
||||||
|
# security-extended widens past the default to include the
|
||||||
|
# full security-query set for a public SaaS surface.
|
||||||
|
queries: security-extended
|
||||||
|
|
||||||
|
- name: Autobuild
|
||||||
|
uses: github/codeql-action/autobuild@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2
|
||||||
|
|
||||||
|
- name: Perform CodeQL Analysis
|
||||||
|
id: analyze
|
||||||
|
uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2
|
||||||
|
with:
|
||||||
|
category: "/language:${{ matrix.language }}"
|
||||||
|
# upload: never — GHAS isn't enabled on this repo, so the
|
||||||
|
# upload API 403s. Write SARIF locally instead.
|
||||||
|
upload: never
|
||||||
|
output: sarif-results/${{ matrix.language }}
|
||||||
|
|
||||||
|
- name: Parse SARIF + fail on findings
|
||||||
|
# The analyze step writes <database>.sarif into the output
|
||||||
|
# directory — database name is the short CodeQL lang id, not
|
||||||
|
# the matrix value (e.g. "javascript-typescript" →
|
||||||
|
# javascript.sarif), so glob rather than hardcode.
|
||||||
|
# Filter to error/warning severity: security-extended emits
|
||||||
|
# "note" rows for informational findings we don't want to fail
|
||||||
|
# the build over.
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
dir="sarif-results/${{ matrix.language }}"
|
||||||
|
sarif=$(ls "$dir"/*.sarif 2>/dev/null | head -1 || true)
|
||||||
|
if [ -z "$sarif" ] || [ ! -f "$sarif" ]; then
|
||||||
|
echo "::error::No SARIF file found under $dir"
|
||||||
|
ls -la "$dir" 2>/dev/null || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Parsing $sarif"
|
||||||
|
count=$(jq '[.runs[].results[] | select(.level == "error" or .level == "warning")] | length' "$sarif")
|
||||||
|
echo "CodeQL findings (error+warning) for ${{ matrix.language }}: $count"
|
||||||
|
if [ "$count" -gt 0 ]; then
|
||||||
|
echo "::error::CodeQL found $count issues. Details below; full SARIF in the artifact."
|
||||||
|
jq -r '.runs[].results[] | select(.level == "error" or .level == "warning") | " - [\(.level)] \(.ruleId // "?"): \(.message.text // "(no message)") @ \(.locations[0].physicalLocation.artifactLocation.uri // "?"):\(.locations[0].physicalLocation.region.startLine // "?")"' "$sarif"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Upload SARIF artifact
|
||||||
|
# Keep SARIF around on success + failure so triagers can diff.
|
||||||
|
# 14-day retention — longer than default 3, short enough not
|
||||||
|
# to bloat quota.
|
||||||
|
if: always()
|
||||||
|
uses: actions/upload-artifact@v3 # pinned to v3 for Gitea act_runner v0.6 compatibility (internal#46)
|
||||||
|
with:
|
||||||
|
name: codeql-sarif-${{ matrix.language }}
|
||||||
|
path: sarif-results/${{ matrix.language }}/
|
||||||
|
retention-days: 14
|
||||||
130
.github/workflows/e2e-api.yml
vendored
130
.github/workflows/e2e-api.yml
vendored
@ -12,59 +12,6 @@ name: E2E API Smoke Test
|
|||||||
# spending CI cycles. See the in-job comment on the `e2e-api` job for
|
# spending CI cycles. See the in-job comment on the `e2e-api` job for
|
||||||
# why this is one job (not two-jobs-sharing-name) and the 2026-04-29
|
# why this is one job (not two-jobs-sharing-name) and the 2026-04-29
|
||||||
# PR #2264 incident that drove the consolidation.
|
# PR #2264 incident that drove the consolidation.
|
||||||
#
|
|
||||||
# Parallel-safety (Class B Hongming-owned CICD red sweep, 2026-05-08)
|
|
||||||
# -------------------------------------------------------------------
|
|
||||||
# Same substrate hazard as PR #98 (handlers-postgres-integration). Our
|
|
||||||
# Gitea act_runner runs with `container.network: host` (operator host
|
|
||||||
# `/opt/molecule/runners/config.yaml`), which means:
|
|
||||||
#
|
|
||||||
# * Two concurrent runs both try to bind their `-p 15432:5432` /
|
|
||||||
# `-p 16379:6379` host ports — the second postgres/redis FATALs
|
|
||||||
# with `Address in use` and `docker run` returns exit 125 with
|
|
||||||
# `Conflict. The container name "/molecule-ci-postgres" is already
|
|
||||||
# in use by container ...`. Verified in run a7/2727 on 2026-05-07.
|
|
||||||
# * The fixed container names `molecule-ci-postgres` / `-redis` (the
|
|
||||||
# pre-fix shape) collide on name AS WELL AS port. The cleanup-with-
|
|
||||||
# `docker rm -f` at the start of the second job KILLS the first
|
|
||||||
# job's still-running postgres/redis.
|
|
||||||
#
|
|
||||||
# Fix shape (mirrors PR #98's bridge-net pattern, adapted because
|
|
||||||
# platform-server is a Go binary on the host, not a containerised
|
|
||||||
# step):
|
|
||||||
#
|
|
||||||
# 1. Unique container names per run:
|
|
||||||
# pg-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
|
|
||||||
# redis-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
|
|
||||||
# `${RUN_ID}-${RUN_ATTEMPT}` is unique even across reruns of the
|
|
||||||
# same run_id.
|
|
||||||
# 2. Ephemeral host port per run (`-p 0:5432`), then read the actual
|
|
||||||
# bound port via `docker port` and export DATABASE_URL/REDIS_URL
|
|
||||||
# pointing at it. No fixed host-port → no port collision.
|
|
||||||
# 3. `127.0.0.1` (NOT `localhost`) in URLs — IPv6 first-resolve was
|
|
||||||
# the original flake fixed in #92 and the script's still IPv6-
|
|
||||||
# enabled.
|
|
||||||
# 4. `if: always()` cleanup so containers don't leak when test steps
|
|
||||||
# fail.
|
|
||||||
#
|
|
||||||
# Issue #94 items #2 + #3 (also fixed here):
|
|
||||||
# * Pre-pull `alpine:latest` so the platform-server's provisioner
|
|
||||||
# (`internal/handlers/container_files.go`) can stand up its
|
|
||||||
# ephemeral token-write helper without a daemon.io round-trip.
|
|
||||||
# * Create `molecule-core-net` bridge network if missing so the
|
|
||||||
# provisioner's container.HostConfig {NetworkMode: ...} attach
|
|
||||||
# succeeds.
|
|
||||||
# Item #1 (timeouts) — evidence on recent runs (77/3191, ae/4270, 0e/
|
|
||||||
# 2318) shows Postgres ready in 3s, Redis in 1s, Platform in 1s when
|
|
||||||
# they DO come up. Timeouts are not the bottleneck; not bumped.
|
|
||||||
#
|
|
||||||
# Item explicitly NOT fixed here: failing test `Status back online`
|
|
||||||
# fails because the platform's langgraph workspace template image
|
|
||||||
# (ghcr.io/molecule-ai/workspace-template-langgraph:latest) returns
|
|
||||||
# 403 Forbidden post-2026-05-06 GitHub org suspension. That is a
|
|
||||||
# template-registry resolution issue (ADR-002 / local-build mode) and
|
|
||||||
# belongs in a separate change that touches workspace-server, not
|
|
||||||
# this workflow file.
|
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
@ -131,14 +78,11 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
timeout-minutes: 15
|
timeout-minutes: 15
|
||||||
env:
|
env:
|
||||||
# Unique per-run container names so concurrent runs on the host-
|
DATABASE_URL: postgres://dev:dev@localhost:15432/molecule?sslmode=disable
|
||||||
# network act_runner don't collide on name OR port.
|
REDIS_URL: redis://localhost:16379
|
||||||
# `${RUN_ID}-${RUN_ATTEMPT}` stays unique across reruns of the
|
|
||||||
# same run_id. PORT is set later (after docker port lookup) since
|
|
||||||
# we let Docker assign an ephemeral host port.
|
|
||||||
PG_CONTAINER: pg-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
|
|
||||||
REDIS_CONTAINER: redis-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
|
|
||||||
PORT: "8080"
|
PORT: "8080"
|
||||||
|
PG_CONTAINER: molecule-ci-postgres
|
||||||
|
REDIS_CONTAINER: molecule-ci-redis
|
||||||
steps:
|
steps:
|
||||||
- name: No-op pass (paths filter excluded this commit)
|
- name: No-op pass (paths filter excluded this commit)
|
||||||
if: needs.detect-changes.outputs.api != 'true'
|
if: needs.detect-changes.outputs.api != 'true'
|
||||||
@ -153,53 +97,11 @@ jobs:
|
|||||||
go-version: 'stable'
|
go-version: 'stable'
|
||||||
cache: true
|
cache: true
|
||||||
cache-dependency-path: workspace-server/go.sum
|
cache-dependency-path: workspace-server/go.sum
|
||||||
- name: Pre-pull alpine + ensure provisioner network (Issue #94 items #2 + #3)
|
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
|
||||||
run: |
|
|
||||||
# Provisioner uses alpine:latest for ephemeral token-write
|
|
||||||
# containers (workspace-server/internal/handlers/container_files.go).
|
|
||||||
# Pre-pull so the first provision in test_api.sh doesn't race
|
|
||||||
# the daemon's pull cache. Idempotent — `docker pull` is a no-op
|
|
||||||
# when the image is already present.
|
|
||||||
docker pull alpine:latest >/dev/null
|
|
||||||
# Provisioner attaches workspace containers to
|
|
||||||
# molecule-core-net (workspace-server/internal/provisioner/
|
|
||||||
# provisioner.go::DefaultNetwork). The bridge already exists on
|
|
||||||
# the operator host's docker daemon — `network create` is
|
|
||||||
# idempotent via `|| true`.
|
|
||||||
docker network create molecule-core-net >/dev/null 2>&1 || true
|
|
||||||
echo "alpine:latest pre-pulled; molecule-core-net ensured."
|
|
||||||
- name: Start Postgres (docker)
|
- name: Start Postgres (docker)
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
if: needs.detect-changes.outputs.api == 'true'
|
||||||
run: |
|
run: |
|
||||||
# Defensive cleanup — only matches THIS run's container name,
|
|
||||||
# so it cannot kill a sibling run's postgres. (Pre-fix the
|
|
||||||
# name was static and this rm hit other runs' containers.)
|
|
||||||
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
||||||
# `-p 0:5432` requests an ephemeral host port; we read it back
|
docker run -d --name "$PG_CONTAINER" -e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule -p 15432:5432 postgres:16
|
||||||
# below and export DATABASE_URL.
|
|
||||||
docker run -d --name "$PG_CONTAINER" \
|
|
||||||
-e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule \
|
|
||||||
-p 0:5432 postgres:16 >/dev/null
|
|
||||||
# Resolve the host-side port assignment. `docker port` prints
|
|
||||||
# `0.0.0.0:NNNN` (and on host-net runners may also print an
|
|
||||||
# IPv6 line — take the first IPv4 line).
|
|
||||||
PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
|
|
||||||
if [ -z "$PG_PORT" ]; then
|
|
||||||
# Fallback: any first line. Some Docker versions print only
|
|
||||||
# one line.
|
|
||||||
PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | head -1 | awk -F: '{print $NF}')
|
|
||||||
fi
|
|
||||||
if [ -z "$PG_PORT" ]; then
|
|
||||||
echo "::error::Could not resolve host port for $PG_CONTAINER"
|
|
||||||
docker port "$PG_CONTAINER" 5432/tcp || true
|
|
||||||
docker logs "$PG_CONTAINER" || true
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
# 127.0.0.1 (NOT localhost) — IPv6 first-resolve flake (#92).
|
|
||||||
echo "PG_PORT=${PG_PORT}" >> "$GITHUB_ENV"
|
|
||||||
echo "DATABASE_URL=postgres://dev:dev@127.0.0.1:${PG_PORT}/molecule?sslmode=disable" >> "$GITHUB_ENV"
|
|
||||||
echo "Postgres host port: ${PG_PORT}"
|
|
||||||
for i in $(seq 1 30); do
|
for i in $(seq 1 30); do
|
||||||
if docker exec "$PG_CONTAINER" pg_isready -U dev >/dev/null 2>&1; then
|
if docker exec "$PG_CONTAINER" pg_isready -U dev >/dev/null 2>&1; then
|
||||||
echo "Postgres ready after ${i}s"
|
echo "Postgres ready after ${i}s"
|
||||||
@ -214,20 +116,7 @@ jobs:
|
|||||||
if: needs.detect-changes.outputs.api == 'true'
|
if: needs.detect-changes.outputs.api == 'true'
|
||||||
run: |
|
run: |
|
||||||
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
|
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
|
||||||
docker run -d --name "$REDIS_CONTAINER" -p 0:6379 redis:7 >/dev/null
|
docker run -d --name "$REDIS_CONTAINER" -p 16379:6379 redis:7
|
||||||
REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
|
|
||||||
if [ -z "$REDIS_PORT" ]; then
|
|
||||||
REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | head -1 | awk -F: '{print $NF}')
|
|
||||||
fi
|
|
||||||
if [ -z "$REDIS_PORT" ]; then
|
|
||||||
echo "::error::Could not resolve host port for $REDIS_CONTAINER"
|
|
||||||
docker port "$REDIS_CONTAINER" 6379/tcp || true
|
|
||||||
docker logs "$REDIS_CONTAINER" || true
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "REDIS_PORT=${REDIS_PORT}" >> "$GITHUB_ENV"
|
|
||||||
echo "REDIS_URL=redis://127.0.0.1:${REDIS_PORT}" >> "$GITHUB_ENV"
|
|
||||||
echo "Redis host port: ${REDIS_PORT}"
|
|
||||||
for i in $(seq 1 15); do
|
for i in $(seq 1 15); do
|
||||||
if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then
|
if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then
|
||||||
echo "Redis ready after ${i}s"
|
echo "Redis ready after ${i}s"
|
||||||
@ -246,15 +135,13 @@ jobs:
|
|||||||
if: needs.detect-changes.outputs.api == 'true'
|
if: needs.detect-changes.outputs.api == 'true'
|
||||||
working-directory: workspace-server
|
working-directory: workspace-server
|
||||||
run: |
|
run: |
|
||||||
# DATABASE_URL + REDIS_URL exported by the start-postgres /
|
|
||||||
# start-redis steps point at this run's per-run host ports.
|
|
||||||
./platform-server > platform.log 2>&1 &
|
./platform-server > platform.log 2>&1 &
|
||||||
echo $! > platform.pid
|
echo $! > platform.pid
|
||||||
- name: Wait for /health
|
- name: Wait for /health
|
||||||
if: needs.detect-changes.outputs.api == 'true'
|
if: needs.detect-changes.outputs.api == 'true'
|
||||||
run: |
|
run: |
|
||||||
for i in $(seq 1 30); do
|
for i in $(seq 1 30); do
|
||||||
if curl -sf http://127.0.0.1:8080/health > /dev/null; then
|
if curl -sf http://localhost:8080/health > /dev/null; then
|
||||||
echo "Platform up after ${i}s"
|
echo "Platform up after ${i}s"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
@ -298,9 +185,6 @@ jobs:
|
|||||||
kill "$(cat workspace-server/platform.pid)" 2>/dev/null || true
|
kill "$(cat workspace-server/platform.pid)" 2>/dev/null || true
|
||||||
fi
|
fi
|
||||||
- name: Stop service containers
|
- name: Stop service containers
|
||||||
# always() so containers don't leak when test steps fail. The
|
|
||||||
# cleanup is best-effort: if the container is already gone
|
|
||||||
# (e.g. concurrent rerun race), don't fail the job.
|
|
||||||
if: always() && needs.detect-changes.outputs.api == 'true'
|
if: always() && needs.detect-changes.outputs.api == 'true'
|
||||||
run: |
|
run: |
|
||||||
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
||||||
|
|||||||
13
.github/workflows/e2e-staging-canvas.yml
vendored
13
.github/workflows/e2e-staging-canvas.yml
vendored
@ -22,9 +22,9 @@ on:
|
|||||||
# spending CI cycles. See e2e-api.yml for the rationale on why this
|
# spending CI cycles. See e2e-api.yml for the rationale on why this
|
||||||
# is a single job rather than two-jobs-sharing-name.
|
# is a single job rather than two-jobs-sharing-name.
|
||||||
push:
|
push:
|
||||||
branches: [main]
|
branches: [main, staging]
|
||||||
pull_request:
|
pull_request:
|
||||||
branches: [main]
|
branches: [main, staging]
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
schedule:
|
schedule:
|
||||||
# Weekly on Sunday 08:00 UTC — catches Chrome / Playwright / Next.js
|
# Weekly on Sunday 08:00 UTC — catches Chrome / Playwright / Next.js
|
||||||
@ -139,11 +139,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Upload Playwright report on failure
|
- name: Upload Playwright report on failure
|
||||||
if: failure() && needs.detect-changes.outputs.canvas == 'true'
|
if: failure() && needs.detect-changes.outputs.canvas == 'true'
|
||||||
# Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
|
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
|
||||||
# the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
|
|
||||||
# implement (see ci.yml upload step for the canonical error
|
|
||||||
# cite). Drop this pin when Gitea ships the v4 protocol.
|
|
||||||
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
|
|
||||||
with:
|
with:
|
||||||
name: playwright-report-staging
|
name: playwright-report-staging
|
||||||
path: canvas/playwright-report-staging/
|
path: canvas/playwright-report-staging/
|
||||||
@ -151,8 +147,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Upload screenshots on failure
|
- name: Upload screenshots on failure
|
||||||
if: failure() && needs.detect-changes.outputs.canvas == 'true'
|
if: failure() && needs.detect-changes.outputs.canvas == 'true'
|
||||||
# Pinned to v3 for Gitea act_runner v0.6 compatibility (see above).
|
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
|
||||||
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
|
|
||||||
with:
|
with:
|
||||||
name: playwright-screenshots
|
name: playwright-screenshots
|
||||||
path: canvas/test-results/
|
path: canvas/test-results/
|
||||||
|
|||||||
4
.github/workflows/e2e-staging-external.yml
vendored
4
.github/workflows/e2e-staging-external.yml
vendored
@ -32,7 +32,7 @@ name: E2E Staging External Runtime
|
|||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches: [main]
|
branches: [staging, main]
|
||||||
paths:
|
paths:
|
||||||
- 'workspace-server/internal/handlers/workspace.go'
|
- 'workspace-server/internal/handlers/workspace.go'
|
||||||
- 'workspace-server/internal/handlers/registry.go'
|
- 'workspace-server/internal/handlers/registry.go'
|
||||||
@ -44,7 +44,7 @@ on:
|
|||||||
- 'tests/e2e/test_staging_external_runtime.sh'
|
- 'tests/e2e/test_staging_external_runtime.sh'
|
||||||
- '.github/workflows/e2e-staging-external.yml'
|
- '.github/workflows/e2e-staging-external.yml'
|
||||||
pull_request:
|
pull_request:
|
||||||
branches: [main]
|
branches: [staging, main]
|
||||||
paths:
|
paths:
|
||||||
- 'workspace-server/internal/handlers/workspace.go'
|
- 'workspace-server/internal/handlers/workspace.go'
|
||||||
- 'workspace-server/internal/handlers/registry.go'
|
- 'workspace-server/internal/handlers/registry.go'
|
||||||
|
|||||||
13
.github/workflows/e2e-staging-saas.yml
vendored
13
.github/workflows/e2e-staging-saas.yml
vendored
@ -20,12 +20,13 @@ name: E2E Staging SaaS (full lifecycle)
|
|||||||
# via the same paths watcher that e2e-api.yml uses)
|
# via the same paths watcher that e2e-api.yml uses)
|
||||||
|
|
||||||
on:
|
on:
|
||||||
# Trunk-based (Phase 3 of internal#81): main is the only branch.
|
# Fire on staging push too — previously this only ran on main, which
|
||||||
# Previously this fired on staging push too because staging was a
|
# meant the most thorough end-to-end test caught regressions AFTER
|
||||||
# superset of main and ran the gate ahead of auto-promote; with no
|
# they shipped to staging (and then to the auto-promote PR). Running
|
||||||
# staging branch, main is where E2E gates the deploy.
|
# on staging push catches them BEFORE the staging→main promotion
|
||||||
|
# opens, so a green canary into auto-promote is more meaningful.
|
||||||
push:
|
push:
|
||||||
branches: [main]
|
branches: [staging, main]
|
||||||
paths:
|
paths:
|
||||||
- 'workspace-server/internal/handlers/registry.go'
|
- 'workspace-server/internal/handlers/registry.go'
|
||||||
- 'workspace-server/internal/handlers/workspace_provision.go'
|
- 'workspace-server/internal/handlers/workspace_provision.go'
|
||||||
@ -35,7 +36,7 @@ on:
|
|||||||
- 'tests/e2e/test_staging_full_saas.sh'
|
- 'tests/e2e/test_staging_full_saas.sh'
|
||||||
- '.github/workflows/e2e-staging-saas.yml'
|
- '.github/workflows/e2e-staging-saas.yml'
|
||||||
pull_request:
|
pull_request:
|
||||||
branches: [main]
|
branches: [staging, main]
|
||||||
paths:
|
paths:
|
||||||
- 'workspace-server/internal/handlers/registry.go'
|
- 'workspace-server/internal/handlers/registry.go'
|
||||||
- 'workspace-server/internal/handlers/workspace_provision.go'
|
- 'workspace-server/internal/handlers/workspace_provision.go'
|
||||||
|
|||||||
137
.github/workflows/handlers-postgres-integration.yml
vendored
137
.github/workflows/handlers-postgres-integration.yml
vendored
@ -14,42 +14,12 @@ name: Handlers Postgres Integration
|
|||||||
# self-review caught it took 2 minutes to set up and would have caught
|
# self-review caught it took 2 minutes to set up and would have caught
|
||||||
# the bug at PR-time.
|
# the bug at PR-time.
|
||||||
#
|
#
|
||||||
# Why this workflow does NOT use `services: postgres:` (Class B fix)
|
# This job spins a Postgres service container, applies the migration,
|
||||||
# ------------------------------------------------------------------
|
# and runs `go test -tags=integration` against a live DB. Required
|
||||||
# Our act_runner config has `container.network: host` (operator host
|
# check on staging branch protection — backend handler PRs cannot
|
||||||
# /opt/molecule/runners/config.yaml), which act_runner applies to BOTH
|
# merge without a real-DB regression gate.
|
||||||
# the job container AND every service container. With host-net, two
|
|
||||||
# concurrent runs of this workflow both try to bind 0.0.0.0:5432 — the
|
|
||||||
# second postgres FATALs with `could not create any TCP/IP sockets:
|
|
||||||
# Address in use`, and Docker auto-removes it (act_runner sets
|
|
||||||
# AutoRemove:true on service containers). By the time the migrations
|
|
||||||
# step runs `psql`, the postgres container is gone, hence
|
|
||||||
# `Connection refused` then `failed to remove container: No such
|
|
||||||
# container` at cleanup time.
|
|
||||||
#
|
#
|
||||||
# Per-job `container.network` override is silently ignored by
|
# Cost: ~30s job (postgres pull from GH cache + go build + 4 tests).
|
||||||
# act_runner — `--network and --net in the options will be ignored.`
|
|
||||||
# appears in the runner log. Documented constraint.
|
|
||||||
#
|
|
||||||
# So we sidestep `services:` entirely. The job container still uses
|
|
||||||
# host-net (inherited from runner config; required for cache server
|
|
||||||
# discovery on the bridge IP 172.18.0.17:42631). We launch a sibling
|
|
||||||
# postgres on the existing `molecule-core-net` bridge with a
|
|
||||||
# UNIQUE name per run — `pg-handlers-${RUN_ID}-${RUN_ATTEMPT}` — and
|
|
||||||
# read its bridge IP via `docker inspect`. A host-net job container
|
|
||||||
# can reach a bridge-net container directly via the bridge IP (verified
|
|
||||||
# manually on operator host 2026-05-08).
|
|
||||||
#
|
|
||||||
# Trade-offs vs. the original `services:` shape:
|
|
||||||
# + No host-port collision; N parallel runs share the bridge cleanly
|
|
||||||
# + `if: always()` cleanup runs even on test-step failure
|
|
||||||
# - One more step in the workflow (+~3 lines)
|
|
||||||
# - Requires `molecule-core-net` to exist on the operator host
|
|
||||||
# (it does; declared in docker-compose.yml + docker-compose.infra.yml)
|
|
||||||
#
|
|
||||||
# Class B Hongming-owned CICD red sweep, 2026-05-08.
|
|
||||||
#
|
|
||||||
# Cost: ~30s job (postgres pull from cache + go build + 4 tests).
|
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
@ -89,14 +59,20 @@ jobs:
|
|||||||
name: Handlers Postgres Integration
|
name: Handlers Postgres Integration
|
||||||
needs: detect-changes
|
needs: detect-changes
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
services:
|
||||||
|
postgres:
|
||||||
|
image: postgres:15-alpine
|
||||||
env:
|
env:
|
||||||
# Unique name per run so concurrent jobs don't collide on the
|
POSTGRES_PASSWORD: test
|
||||||
# bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
|
POSTGRES_DB: molecule
|
||||||
# workflow_dispatch reruns of the same run_id.
|
ports:
|
||||||
PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
|
- 5432:5432
|
||||||
# Bridge network already exists on the operator host (declared
|
# GHA spins this with --health-cmd built in for postgres images.
|
||||||
# in docker-compose.yml + docker-compose.infra.yml).
|
options: >-
|
||||||
PG_NETWORK: molecule-core-net
|
--health-cmd pg_isready
|
||||||
|
--health-interval 5s
|
||||||
|
--health-timeout 5s
|
||||||
|
--health-retries 10
|
||||||
defaults:
|
defaults:
|
||||||
run:
|
run:
|
||||||
working-directory: workspace-server
|
working-directory: workspace-server
|
||||||
@ -113,57 +89,16 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
go-version: 'stable'
|
go-version: 'stable'
|
||||||
|
|
||||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
|
||||||
name: Start sibling Postgres on bridge network
|
|
||||||
working-directory: .
|
|
||||||
run: |
|
|
||||||
# Sanity: the bridge network must exist on the operator host.
|
|
||||||
# Hard-fail loud if it doesn't — easier to spot than a silent
|
|
||||||
# auto-create that diverges from the rest of the stack.
|
|
||||||
if ! docker network inspect "${PG_NETWORK}" >/dev/null 2>&1; then
|
|
||||||
echo "::error::Bridge network '${PG_NETWORK}' missing on operator host. Re-run docker-compose.infra.yml or check ops handbook."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# If a stale container with the same name exists (rerun on
|
|
||||||
# the same run_id), wipe it first.
|
|
||||||
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
|
|
||||||
|
|
||||||
docker run -d \
|
|
||||||
--name "${PG_NAME}" \
|
|
||||||
--network "${PG_NETWORK}" \
|
|
||||||
--health-cmd "pg_isready -U postgres" \
|
|
||||||
--health-interval 5s \
|
|
||||||
--health-timeout 5s \
|
|
||||||
--health-retries 10 \
|
|
||||||
-e POSTGRES_PASSWORD=test \
|
|
||||||
-e POSTGRES_DB=molecule \
|
|
||||||
postgres:15-alpine >/dev/null
|
|
||||||
|
|
||||||
# Read back the bridge IP. Always present immediately after
|
|
||||||
# `docker run -d` for bridge networks.
|
|
||||||
PG_HOST=$(docker inspect "${PG_NAME}" \
|
|
||||||
--format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}")
|
|
||||||
if [ -z "${PG_HOST}" ]; then
|
|
||||||
echo "::error::Could not resolve PG_HOST for ${PG_NAME} on ${PG_NETWORK}"
|
|
||||||
docker logs "${PG_NAME}" || true
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV"
|
|
||||||
echo "INTEGRATION_DB_URL=postgres://postgres:test@${PG_HOST}:5432/molecule?sslmode=disable" >> "$GITHUB_ENV"
|
|
||||||
echo "Started ${PG_NAME} at ${PG_HOST}:5432"
|
|
||||||
|
|
||||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||||
name: Apply migrations to Postgres service
|
name: Apply migrations to Postgres service
|
||||||
env:
|
env:
|
||||||
PGPASSWORD: test
|
PGPASSWORD: test
|
||||||
run: |
|
run: |
|
||||||
# Wait for postgres to actually accept connections. Docker's
|
# Wait for postgres to actually accept connections (the
|
||||||
# health-cmd handles container-side readiness, but the wire
|
# GHA --health-cmd is best-effort but psql can still race).
|
||||||
# to the bridge IP is best-tested with pg_isready directly.
|
|
||||||
for i in {1..15}; do
|
for i in {1..15}; do
|
||||||
if pg_isready -h "${PG_HOST}" -p 5432 -U postgres -q; then break; fi
|
if pg_isready -h localhost -p 5432 -U postgres -q; then break; fi
|
||||||
echo "waiting for postgres at ${PG_HOST}:5432..."; sleep 2
|
echo "waiting for postgres..."; sleep 2
|
||||||
done
|
done
|
||||||
|
|
||||||
# Apply every .up.sql in lexicographic order with
|
# Apply every .up.sql in lexicographic order with
|
||||||
@ -196,7 +131,7 @@ jobs:
|
|||||||
# not fine once a cross-table atomicity test came in.
|
# not fine once a cross-table atomicity test came in.
|
||||||
set +e
|
set +e
|
||||||
for migration in $(ls migrations/*.sql 2>/dev/null | grep -v '\.down\.sql$' | sort); do
|
for migration in $(ls migrations/*.sql 2>/dev/null | grep -v '\.down\.sql$' | sort); do
|
||||||
if psql -h "${PG_HOST}" -U postgres -d molecule -v ON_ERROR_STOP=1 \
|
if psql -h localhost -U postgres -d molecule -v ON_ERROR_STOP=1 \
|
||||||
-f "$migration" >/dev/null 2>&1; then
|
-f "$migration" >/dev/null 2>&1; then
|
||||||
echo "✓ $(basename "$migration")"
|
echo "✓ $(basename "$migration")"
|
||||||
else
|
else
|
||||||
@ -210,7 +145,7 @@ jobs:
|
|||||||
# fail if any didn't land — that would be a real regression we
|
# fail if any didn't land — that would be a real regression we
|
||||||
# want loud.
|
# want loud.
|
||||||
for tbl in delegations workspaces activity_logs pending_uploads; do
|
for tbl in delegations workspaces activity_logs pending_uploads; do
|
||||||
if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
|
if ! psql -h localhost -U postgres -d molecule -tA \
|
||||||
-c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
|
-c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
|
||||||
| grep -q 1; then
|
| grep -q 1; then
|
||||||
echo "::error::$tbl table missing after migration replay — handler integration tests would be meaningless"
|
echo "::error::$tbl table missing after migration replay — handler integration tests would be meaningless"
|
||||||
@ -221,32 +156,16 @@ jobs:
|
|||||||
|
|
||||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||||
name: Run integration tests
|
name: Run integration tests
|
||||||
|
env:
|
||||||
|
INTEGRATION_DB_URL: postgres://postgres:test@localhost:5432/molecule?sslmode=disable
|
||||||
run: |
|
run: |
|
||||||
# INTEGRATION_DB_URL is exported by the start-postgres step;
|
|
||||||
# points at the per-run bridge IP, not 127.0.0.1, so concurrent
|
|
||||||
# workflow runs don't fight over a host-net 5432 port.
|
|
||||||
go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_"
|
go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_"
|
||||||
|
|
||||||
- if: failure() && needs.detect-changes.outputs.handlers == 'true'
|
- if: needs.detect-changes.outputs.handlers == 'true' && failure()
|
||||||
name: Diagnostic dump on failure
|
name: Diagnostic dump on failure
|
||||||
env:
|
env:
|
||||||
PGPASSWORD: test
|
PGPASSWORD: test
|
||||||
run: |
|
run: |
|
||||||
echo "::group::postgres container status"
|
|
||||||
docker ps -a --filter "name=${PG_NAME}" --format '{{.Status}} {{.Names}}' || true
|
|
||||||
docker logs "${PG_NAME}" 2>&1 | tail -50 || true
|
|
||||||
echo "::endgroup::"
|
|
||||||
echo "::group::delegations table state"
|
echo "::group::delegations table state"
|
||||||
psql -h "${PG_HOST}" -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
|
psql -h localhost -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
|
||||||
echo "::endgroup::"
|
echo "::endgroup::"
|
||||||
|
|
||||||
- if: always() && needs.detect-changes.outputs.handlers == 'true'
|
|
||||||
name: Stop sibling Postgres
|
|
||||||
working-directory: .
|
|
||||||
run: |
|
|
||||||
# always() so containers don't leak when migrations or tests
|
|
||||||
# fail. The cleanup is best-effort: if the container is
|
|
||||||
# already gone (e.g. concurrent rerun race), don't fail the job.
|
|
||||||
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
|
|
||||||
echo "Cleaned up ${PG_NAME}"
|
|
||||||
|
|
||||||
|
|||||||
106
.github/workflows/harness-replays.yml
vendored
106
.github/workflows/harness-replays.yml
vendored
@ -56,40 +56,21 @@ jobs:
|
|||||||
run: ${{ steps.decide.outputs.run }}
|
run: ${{ steps.decide.outputs.run }}
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
- uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
|
||||||
|
id: filter
|
||||||
|
with:
|
||||||
|
filters: |
|
||||||
|
run:
|
||||||
|
- 'workspace-server/**'
|
||||||
|
- 'canvas/**'
|
||||||
|
- 'tests/harness/**'
|
||||||
|
- '.github/workflows/harness-replays.yml'
|
||||||
- id: decide
|
- id: decide
|
||||||
run: |
|
run: |
|
||||||
# workflow_dispatch: always run (manual trigger)
|
|
||||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||||
echo "run=true" >> "$GITHUB_OUTPUT"
|
echo "run=true" >> "$GITHUB_OUTPUT"
|
||||||
echo "debug=manual-trigger" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Determine the base commit to diff against.
|
|
||||||
# For pull_request: use base.sha (the merge-base with main/staging).
|
|
||||||
# For push: use github.event.before (the previous tip of the branch).
|
|
||||||
# Fallback for new branches (all-zeros SHA): run everything.
|
|
||||||
if [ "${{ github.event_name }}" = "pull_request" ] && \
|
|
||||||
[ -n "${{ github.event.pull_request.base.sha }}" ]; then
|
|
||||||
BASE="${{ github.event.pull_request.base.sha }}"
|
|
||||||
elif [ -n "${{ github.event.before }}" ] && \
|
|
||||||
! echo "${{ github.event.before }}" | grep -qE '^0+$'; then
|
|
||||||
BASE="${{ github.event.before }}"
|
|
||||||
else
|
else
|
||||||
# New branch or github.event.before unavailable — run everything.
|
echo "run=${{ steps.filter.outputs.run }}" >> "$GITHUB_OUTPUT"
|
||||||
echo "run=true" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "debug=new-branch-fallback" >> "$GITHUB_OUTPUT"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# GitHub Actions and Gitea Actions both expose github.sha for HEAD.
|
|
||||||
DIFF=$(git diff --name-only "$BASE" "${{ github.sha }}" 2>/dev/null)
|
|
||||||
echo "debug=diff-base=$BASE diff-files=$DIFF" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
if echo "$DIFF" | grep -qE '^workspace-server/|^canvas/|^tests/harness/|^.github/workflows/harness-replays\.yml$'; then
|
|
||||||
echo "run=true" >> "$GITHUB_OUTPUT"
|
|
||||||
else
|
|
||||||
echo "run=false" >> "$GITHUB_OUTPUT"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# ONE job that always runs. Real work is gated per-step on
|
# ONE job that always runs. Real work is gated per-step on
|
||||||
@ -110,80 +91,13 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
|
echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
|
||||||
echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."
|
echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."
|
||||||
echo "::notice::Debug: ${{ needs.detect-changes.outputs.debug }}"
|
|
||||||
|
|
||||||
- if: needs.detect-changes.outputs.run == 'true'
|
- if: needs.detect-changes.outputs.run == 'true'
|
||||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
|
||||||
# Log what files were detected so future failures include the diff.
|
|
||||||
- name: Log detected changes
|
|
||||||
if: needs.detect-changes.outputs.run == 'true'
|
|
||||||
run: |
|
|
||||||
echo "::notice::detect-changes debug: ${{ needs.detect-changes.outputs.debug }}"
|
|
||||||
|
|
||||||
# github-app-auth sibling-checkout removed 2026-05-07 (#157):
|
# github-app-auth sibling-checkout removed 2026-05-07 (#157):
|
||||||
# the plugin was dropped + Dockerfile.tenant no longer COPYs it.
|
# the plugin was dropped + Dockerfile.tenant no longer COPYs it.
|
||||||
|
|
||||||
# Pre-clone manifest deps before docker compose builds the tenant
|
|
||||||
# image (Task #173 followup — same pattern as
|
|
||||||
# publish-workspace-server-image.yml's "Pre-clone manifest deps"
|
|
||||||
# step).
|
|
||||||
#
|
|
||||||
# Why pre-clone here too: tests/harness/compose.yml builds tenant-alpha
|
|
||||||
# and tenant-beta from workspace-server/Dockerfile.tenant with
|
|
||||||
# context=../.. (repo root). That Dockerfile expects
|
|
||||||
# .tenant-bundle-deps/{workspace-configs-templates,org-templates,plugins}
|
|
||||||
# to be present at build context root (post-#173 it COPYs from there
|
|
||||||
# instead of running an in-image clone — the in-image clone failed
|
|
||||||
# with "could not read Username for https://git.moleculesai.app"
|
|
||||||
# because there's no auth path inside the build sandbox).
|
|
||||||
#
|
|
||||||
# Without this step harness-replays fails before any replay runs,
|
|
||||||
# with `failed to calculate checksum of ref ...
|
|
||||||
# "/.tenant-bundle-deps/plugins": not found`. Caught by run #892
|
|
||||||
# (main, 2026-05-07T20:28:53Z) and run #964 (staging — same
|
|
||||||
# symptom, different root cause: staging still has the in-image
|
|
||||||
# clone path, hits the auth error directly).
|
|
||||||
#
|
|
||||||
# 2026-05-08 sub-finding (#192): the clone step ALSO fails when
|
|
||||||
# any referenced workspace-template repo is private and the
|
|
||||||
# AUTO_SYNC_TOKEN bearer (devops-engineer persona) lacks read
|
|
||||||
# access. Root cause: 5 of 9 workspace-template repos
|
|
||||||
# (openclaw, codex, crewai, deepagents, gemini-cli) had been
|
|
||||||
# marked private with no team grant. Resolution: flipped them
|
|
||||||
# to public per `feedback_oss_first_repo_visibility_default`
|
|
||||||
# (the OSS surface should be public). Layer-3 (customer-private +
|
|
||||||
# marketplace third-party repos) tracked separately in
|
|
||||||
# internal#102.
|
|
||||||
#
|
|
||||||
# Token shape matches publish-workspace-server-image.yml: AUTO_SYNC_TOKEN
|
|
||||||
# is the devops-engineer persona PAT, NOT the founder PAT (per
|
|
||||||
# `feedback_per_agent_gitea_identity_default`). clone-manifest.sh
|
|
||||||
# embeds it as basic-auth for the duration of the clones and strips
|
|
||||||
# .git directories — the token never enters the resulting image.
|
|
||||||
- name: Pre-clone manifest deps
|
|
||||||
if: needs.detect-changes.outputs.run == 'true'
|
|
||||||
env:
|
|
||||||
MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
|
|
||||||
echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
mkdir -p .tenant-bundle-deps
|
|
||||||
bash scripts/clone-manifest.sh \
|
|
||||||
manifest.json \
|
|
||||||
.tenant-bundle-deps/workspace-configs-templates \
|
|
||||||
.tenant-bundle-deps/org-templates \
|
|
||||||
.tenant-bundle-deps/plugins
|
|
||||||
# Sanity-check counts so a silent partial clone fails fast
|
|
||||||
# instead of producing a half-empty image.
|
|
||||||
ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
|
|
||||||
org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
|
|
||||||
plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
|
|
||||||
echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
|
|
||||||
|
|
||||||
- name: Install Python deps for replays
|
- name: Install Python deps for replays
|
||||||
# peer-discovery-404 (and future replays) eval Python against the
|
# peer-discovery-404 (and future replays) eval Python against the
|
||||||
# running tenant — importing workspace/a2a_client.py pulls in
|
# running tenant — importing workspace/a2a_client.py pulls in
|
||||||
|
|||||||
22
.github/workflows/pr-guards.yml
vendored
Normal file
22
.github/workflows/pr-guards.yml
vendored
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
name: pr-guards
|
||||||
|
|
||||||
|
# Thin caller that delegates to the molecule-ci reusable guard. Today
|
||||||
|
# the guard is just "disable auto-merge when a new commit is pushed
|
||||||
|
# after auto-merge was enabled" — added 2026-04-27 after PR #2174
|
||||||
|
# auto-merged with only its first commit because the second commit
|
||||||
|
# was pushed after the merge queue had locked the PR's SHA.
|
||||||
|
#
|
||||||
|
# When more PR-time guards land in molecule-ci, add them here as
|
||||||
|
# additional jobs that share the same pull_request:synchronize
|
||||||
|
# trigger.
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
types: [synchronize]
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
pull-requests: write
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
disable-auto-merge-on-push:
|
||||||
|
uses: molecule-ai/molecule-ci/.github/workflows/disable-auto-merge-on-push.yml@main
|
||||||
85
.github/workflows/promote-latest.yml
vendored
Normal file
85
.github/workflows/promote-latest.yml
vendored
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
name: promote-latest
|
||||||
|
|
||||||
|
# Manually retag ghcr.io/molecule-ai/platform:staging-<sha> → :latest
|
||||||
|
# (and the same for the tenant image). Use this to:
|
||||||
|
#
|
||||||
|
# 1. Promote a :staging-<sha> to prod before the canary fleet is live
|
||||||
|
# (one-off during the initial rollout).
|
||||||
|
# 2. Roll back :latest to a prior known-good digest after a bad
|
||||||
|
# promotion slipped past canary (use scripts/rollback-latest.sh
|
||||||
|
# for a local / emergency path; this workflow is for scheduled
|
||||||
|
# or from-browser promotions).
|
||||||
|
#
|
||||||
|
# Running this workflow needs no extra secrets — GitHub's default
|
||||||
|
# GITHUB_TOKEN has write:packages for repo-owned GHCR images, which
|
||||||
|
# is all we need for a remote retag via `crane tag`.
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
sha:
|
||||||
|
description: 'Short sha to promote (e.g. 4c1d56e). Must match an existing :staging-<sha> tag.'
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
packages: write
|
||||||
|
|
||||||
|
env:
|
||||||
|
IMAGE_NAME: ghcr.io/molecule-ai/platform
|
||||||
|
TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
promote:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: imjasonh/setup-crane@6da1ae018866400525525ce74ff892880c099987 # v0.5
|
||||||
|
|
||||||
|
- name: GHCR login
|
||||||
|
run: |
|
||||||
|
echo "${{ secrets.GITHUB_TOKEN }}" \
|
||||||
|
| crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin
|
||||||
|
|
||||||
|
- name: Retag platform image
|
||||||
|
run: |
|
||||||
|
set -eu
|
||||||
|
SRC="${IMAGE_NAME}:staging-${{ inputs.sha }}"
|
||||||
|
if ! crane digest "$SRC" >/dev/null 2>&1; then
|
||||||
|
echo "::error::$SRC not found in registry — double-check the sha."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
EXPECTED=$(crane digest "$SRC")
|
||||||
|
crane tag "$SRC" latest
|
||||||
|
ACTUAL=$(crane digest "${IMAGE_NAME}:latest")
|
||||||
|
if [ "$ACTUAL" != "$EXPECTED" ]; then
|
||||||
|
echo "::error::retag digest mismatch (expected $EXPECTED, got $ACTUAL)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "OK ${IMAGE_NAME}:latest → $ACTUAL"
|
||||||
|
|
||||||
|
- name: Retag tenant image
|
||||||
|
run: |
|
||||||
|
set -eu
|
||||||
|
SRC="${TENANT_IMAGE_NAME}:staging-${{ inputs.sha }}"
|
||||||
|
if ! crane digest "$SRC" >/dev/null 2>&1; then
|
||||||
|
echo "::error::$SRC not found — tenant image may not have built for this sha."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
EXPECTED=$(crane digest "$SRC")
|
||||||
|
crane tag "$SRC" latest
|
||||||
|
ACTUAL=$(crane digest "${TENANT_IMAGE_NAME}:latest")
|
||||||
|
if [ "$ACTUAL" != "$EXPECTED" ]; then
|
||||||
|
echo "::error::tenant retag digest mismatch"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "OK ${TENANT_IMAGE_NAME}:latest → $ACTUAL"
|
||||||
|
|
||||||
|
- name: Summary
|
||||||
|
run: |
|
||||||
|
{
|
||||||
|
echo "## :latest promoted to staging-${{ inputs.sha }}"
|
||||||
|
echo
|
||||||
|
echo "Both platform + tenant images retagged. Prod tenants"
|
||||||
|
echo "will auto-pull within their 5-min update cycle."
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
16
.github/workflows/publish-canvas-image.yml
vendored
16
.github/workflows/publish-canvas-image.yml
vendored
@ -54,22 +54,6 @@ jobs:
|
|||||||
- name: Set up Docker Buildx
|
- name: Set up Docker Buildx
|
||||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
|
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
|
||||||
|
|
||||||
# Health check: verify Docker daemon is accessible before attempting any
|
|
||||||
# build steps. This fails loudly at step 1 when the runner's docker.sock
|
|
||||||
# is inaccessible rather than silently continuing to the build step
|
|
||||||
# where docker build fails deep in ECR auth with a cryptic error.
|
|
||||||
- name: Verify Docker daemon access
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
echo "::group::Docker daemon health check"
|
|
||||||
docker info 2>&1 | head -5 || {
|
|
||||||
echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
|
|
||||||
echo "::error::Check: (1) daemon running, (2) runner user in docker group, (3) sock perms 660+"
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
echo "Docker daemon OK"
|
|
||||||
echo "::endgroup::"
|
|
||||||
|
|
||||||
- name: Compute tags
|
- name: Compute tags
|
||||||
id: tags
|
id: tags
|
||||||
shell: bash
|
shell: bash
|
||||||
|
|||||||
363
.github/workflows/publish-runtime.yml
vendored
Normal file
363
.github/workflows/publish-runtime.yml
vendored
Normal file
@ -0,0 +1,363 @@
|
|||||||
|
name: publish-runtime
|
||||||
|
|
||||||
|
# Publishes molecule-ai-workspace-runtime to PyPI from monorepo workspace/.
|
||||||
|
# Monorepo workspace/ is the only source-of-truth for runtime code; this
|
||||||
|
# workflow is the bridge from monorepo edits to the PyPI artifact that
|
||||||
|
# the 8 workspace-template-* repos depend on.
|
||||||
|
#
|
||||||
|
# Triggered by:
|
||||||
|
# - Pushing a tag matching `runtime-vX.Y.Z` (the version is derived from
|
||||||
|
# the tag — `runtime-v0.1.6` publishes `0.1.6`).
|
||||||
|
# - Manual workflow_dispatch with an explicit `version` input (useful for
|
||||||
|
# dev/test releases without tagging the repo).
|
||||||
|
# - Auto: any push to `staging` that touches `workspace/**`. The version
|
||||||
|
# is derived by querying PyPI for the current latest and bumping the
|
||||||
|
# patch component. This closes the human-in-loop gap that caused the
|
||||||
|
# 2026-04-27 RuntimeCapabilities ImportError outage — adapter symbol
|
||||||
|
# additions in workspace/adapters/base.py used to require an operator
|
||||||
|
# to remember to publish; now the merge itself triggers the publish.
|
||||||
|
#
|
||||||
|
# The workflow:
|
||||||
|
# 1. Runs scripts/build_runtime_package.py to copy workspace/ →
|
||||||
|
# build/molecule_runtime/ with imports rewritten (`a2a_client` →
|
||||||
|
# `molecule_runtime.a2a_client`).
|
||||||
|
# 2. Builds wheel + sdist with `python -m build`.
|
||||||
|
# 3. Publishes to PyPI via the PyPA Trusted Publisher action (OIDC).
|
||||||
|
# No static API token is stored — PyPI verifies the workflow's
|
||||||
|
# OIDC claim against the trusted-publisher config registered for
|
||||||
|
# molecule-ai-workspace-runtime (molecule-ai/molecule-core,
|
||||||
|
# publish-runtime.yml, environment pypi-publish).
|
||||||
|
#
|
||||||
|
# After publish: the 8 template repos pick up the new version on their
|
||||||
|
# next image rebuild (their requirements.txt pin
|
||||||
|
# `molecule-ai-workspace-runtime>=0.1.0`, so any new release is eligible).
|
||||||
|
# To force-pull immediately, bump the pin in each template repo's
|
||||||
|
# requirements.txt and merge — that triggers their own publish-image.yml.
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- "runtime-v*"
|
||||||
|
branches:
|
||||||
|
- staging
|
||||||
|
paths:
|
||||||
|
# Auto-publish when staging gets changes that affect what gets
|
||||||
|
# published. Path filter ONLY applies to branch pushes — tag pushes
|
||||||
|
# still fire regardless.
|
||||||
|
#
|
||||||
|
# workspace/** is the source-of-truth for runtime code.
|
||||||
|
# scripts/build_runtime_package.py is the build script — changes to
|
||||||
|
# it (e.g. a fix to the import rewriter or a manifest emit) directly
|
||||||
|
# affect what ships in the wheel even if no workspace/ file changes.
|
||||||
|
# The 2026-04-27 lib/ subpackage incident missed an auto-publish for
|
||||||
|
# exactly this reason — PR #2174 only changed scripts/ and the
|
||||||
|
# operator had to remember a manual dispatch.
|
||||||
|
- "workspace/**"
|
||||||
|
- "scripts/build_runtime_package.py"
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
version:
|
||||||
|
description: "Version to publish (e.g. 0.1.6). Required for manual dispatch."
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
|
# Serialize publishes so two staging merges landing seconds apart don't
|
||||||
|
# both compute "latest+1" and race on PyPI upload. The second one waits.
|
||||||
|
concurrency:
|
||||||
|
group: publish-runtime
|
||||||
|
cancel-in-progress: false
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
publish:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
environment: pypi-publish
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
id-token: write # PyPI Trusted Publisher (OIDC) — no PYPI_TOKEN needed
|
||||||
|
outputs:
|
||||||
|
version: ${{ steps.version.outputs.version }}
|
||||||
|
wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }}
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
|
||||||
|
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
||||||
|
with:
|
||||||
|
python-version: "3.11"
|
||||||
|
cache: pip
|
||||||
|
|
||||||
|
- name: Derive version (tag, manual input, or PyPI auto-bump)
|
||||||
|
id: version
|
||||||
|
run: |
|
||||||
|
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||||
|
VERSION="${{ inputs.version }}"
|
||||||
|
elif echo "$GITHUB_REF_NAME" | grep -q "^runtime-v"; then
|
||||||
|
# Tag is `runtime-vX.Y.Z` — strip the prefix.
|
||||||
|
VERSION="${GITHUB_REF_NAME#runtime-v}"
|
||||||
|
else
|
||||||
|
# Auto-publish from staging push. Query PyPI for the current
|
||||||
|
# latest and bump the patch component. concurrency: group above
|
||||||
|
# serializes parallel staging merges so we don't race on the
|
||||||
|
# bump. If PyPI is unreachable, fail loud — better to skip a
|
||||||
|
# publish than to overwrite an existing version.
|
||||||
|
LATEST=$(curl -fsS --retry 3 https://pypi.org/pypi/molecule-ai-workspace-runtime/json \
|
||||||
|
| python -c "import sys,json; print(json.load(sys.stdin)['info']['version'])")
|
||||||
|
MAJOR=$(echo "$LATEST" | cut -d. -f1)
|
||||||
|
MINOR=$(echo "$LATEST" | cut -d. -f2)
|
||||||
|
PATCH=$(echo "$LATEST" | cut -d. -f3)
|
||||||
|
VERSION="${MAJOR}.${MINOR}.$((PATCH+1))"
|
||||||
|
echo "Auto-bumped from PyPI latest $LATEST -> $VERSION"
|
||||||
|
fi
|
||||||
|
if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+(\.dev[0-9]+|rc[0-9]+|a[0-9]+|b[0-9]+|\.post[0-9]+)?$'; then
|
||||||
|
echo "::error::version $VERSION does not match PEP 440"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "Publishing molecule-ai-workspace-runtime $VERSION"
|
||||||
|
|
||||||
|
- name: Install build tooling
|
||||||
|
run: pip install build twine
|
||||||
|
|
||||||
|
- name: Build package from workspace/
|
||||||
|
run: |
|
||||||
|
python scripts/build_runtime_package.py \
|
||||||
|
--version "${{ steps.version.outputs.version }}" \
|
||||||
|
--out "${{ runner.temp }}/runtime-build"
|
||||||
|
|
||||||
|
- name: Build wheel + sdist
|
||||||
|
working-directory: ${{ runner.temp }}/runtime-build
|
||||||
|
run: python -m build
|
||||||
|
|
||||||
|
- name: Capture wheel SHA256 for cascade content-verification
|
||||||
|
# Recorded BEFORE upload so the cascade probe can verify the
|
||||||
|
# bytes Fastly serves under the new version's URL match what
|
||||||
|
# we built. Closes a hole left by #2197: that probe verified
|
||||||
|
# pip can resolve the version (catches propagation lag) but
|
||||||
|
# not that the wheel content matches (would silently pass a
|
||||||
|
# Fastly stale-content scenario where the new version's URL
|
||||||
|
# serves an old wheel binary).
|
||||||
|
id: wheel_hash
|
||||||
|
working-directory: ${{ runner.temp }}/runtime-build
|
||||||
|
run: |
|
||||||
|
set -eu
|
||||||
|
WHEEL=$(ls dist/*.whl 2>/dev/null | head -1)
|
||||||
|
if [ -z "$WHEEL" ]; then
|
||||||
|
echo "::error::No .whl in dist/ — `python -m build` must have failed silently"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
HASH=$(sha256sum "$WHEEL" | awk '{print $1}')
|
||||||
|
echo "wheel_sha256=${HASH}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "Local wheel SHA256 (pre-upload): ${HASH}"
|
||||||
|
echo "Wheel filename: $(basename "$WHEEL")"
|
||||||
|
|
||||||
|
- name: Verify package contents (sanity)
|
||||||
|
working-directory: ${{ runner.temp }}/runtime-build
|
||||||
|
# Smoke logic lives in scripts/wheel_smoke.py so the same gate runs
|
||||||
|
# at both PR-time (runtime-prbuild-compat.yml) and publish-time
|
||||||
|
# (here). Splitting the smoke across two heredocs let them drift
|
||||||
|
# apart historically — one script keeps them locked.
|
||||||
|
run: |
|
||||||
|
python -m twine check dist/*
|
||||||
|
python -m venv /tmp/smoke
|
||||||
|
/tmp/smoke/bin/pip install --quiet dist/*.whl
|
||||||
|
/tmp/smoke/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
|
||||||
|
|
||||||
|
- name: Publish to PyPI (Trusted Publisher / OIDC)
|
||||||
|
# PyPI side is configured: project molecule-ai-workspace-runtime →
|
||||||
|
# publisher molecule-ai/molecule-core, workflow publish-runtime.yml,
|
||||||
|
# environment pypi-publish. The action mints a short-lived OIDC
|
||||||
|
# token and exchanges it for a PyPI upload credential — no static
|
||||||
|
# API token in this repo's secrets.
|
||||||
|
uses: pypa/gh-action-pypi-publish@release/v1
|
||||||
|
with:
|
||||||
|
packages-dir: ${{ runner.temp }}/runtime-build/dist/
|
||||||
|
|
||||||
|
cascade:
|
||||||
|
# After PyPI accepts the upload, fan out a repository_dispatch to each
|
||||||
|
# template repo so they rebuild their image against the new runtime.
|
||||||
|
# Each template's `runtime-published.yml` receiver picks up the event,
|
||||||
|
# pulls the new PyPI version (their requirements.txt pin is `>=`), and
|
||||||
|
# republishes ghcr.io/molecule-ai/workspace-template-<runtime>:latest.
|
||||||
|
#
|
||||||
|
# Soft-fail per repo: if one template's dispatch fails (perms missing,
|
||||||
|
# repo archived, etc.) we still try the others and surface the failures
|
||||||
|
# in the workflow summary instead of aborting the whole cascade.
|
||||||
|
needs: publish
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Wait for PyPI to propagate the new version
|
||||||
|
# PyPI accepts the upload, then takes a few seconds to make the
|
||||||
|
# new version visible across all THREE surfaces pip touches:
|
||||||
|
# 1. /pypi/<pkg>/<ver>/json — metadata endpoint
|
||||||
|
# 2. /simple/<pkg>/ — pip's primary download index
|
||||||
|
# 3. files.pythonhosted.org — CDN-fronted wheel binary
|
||||||
|
# Each has its own cache. The previous check polled only (1)
|
||||||
|
# and would let the cascade fire while (2) or (3) still served
|
||||||
|
# the previous version, so downstream `pip install` resolved
|
||||||
|
# to the old wheel. Docker layer cache then locked that stale
|
||||||
|
# resolution in for subsequent rebuilds (the cache trap that
|
||||||
|
# bit us five times in one night).
|
||||||
|
#
|
||||||
|
# Two-stage probe per poll:
|
||||||
|
# (a) `pip install --no-cache-dir PACKAGE==VERSION` — succeeds
|
||||||
|
# only when the version is resolvable. Catches surface (1)
|
||||||
|
# and (2) propagation lag.
|
||||||
|
# (b) `pip download` of the same wheel + SHA256 compare against
|
||||||
|
# the just-built dist's hash. Catches surface (3) lag AND
|
||||||
|
# Fastly serving stale content under the new version's URL
|
||||||
|
# (a separate Fastly-corruption mode that pip-install alone
|
||||||
|
# can't see, since pip install resolves+unpacks against
|
||||||
|
# whatever bytes Fastly returns and never inspects them).
|
||||||
|
# Both must pass before the cascade fans out.
|
||||||
|
#
|
||||||
|
# The venv is reused across polls; only `pip install`/`pip
|
||||||
|
# download` run in the loop, with --force-reinstall +
|
||||||
|
# --no-cache-dir so the previous poll's cached state doesn't
|
||||||
|
# mask propagation lag.
|
||||||
|
env:
|
||||||
|
RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
|
||||||
|
EXPECTED_SHA256: ${{ needs.publish.outputs.wheel_sha256 }}
|
||||||
|
run: |
|
||||||
|
set -eu
|
||||||
|
if [ -z "$EXPECTED_SHA256" ]; then
|
||||||
|
echo "::error::publish job did not expose wheel_sha256 — cannot verify wheel content. Refusing to fan out cascade."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
python -m venv /tmp/propagation-probe
|
||||||
|
PROBE=/tmp/propagation-probe/bin
|
||||||
|
$PROBE/pip install --upgrade --quiet pip
|
||||||
|
# Poll budget: 30 attempts × (~3-5s pip install + ~3s pip
|
||||||
|
# download + 4s sleep) ≈ 5-6 min wall on a slow GH runner.
|
||||||
|
# Generous vs PyPI's typical few-seconds propagation;
|
||||||
|
# failures past this are signal of a real PyPI / Fastly
|
||||||
|
# issue, not just lag.
|
||||||
|
for i in $(seq 1 30); do
|
||||||
|
# Stage (a): can pip resolve and install the version?
|
||||||
|
if $PROBE/pip install \
|
||||||
|
--quiet \
|
||||||
|
--no-cache-dir \
|
||||||
|
--force-reinstall \
|
||||||
|
--no-deps \
|
||||||
|
"molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
|
||||||
|
>/dev/null 2>&1; then
|
||||||
|
INSTALLED=$($PROBE/pip show molecule-ai-workspace-runtime 2>/dev/null \
|
||||||
|
| awk -F': ' '/^Version:/{print $2}')
|
||||||
|
if [ "$INSTALLED" = "$RUNTIME_VERSION" ]; then
|
||||||
|
# Stage (b): does Fastly serve the bytes we uploaded?
|
||||||
|
# `pip download` writes the actual .whl file to disk so
|
||||||
|
# we can sha256sum it (vs `pip install` which unpacks
|
||||||
|
# and discards).
|
||||||
|
rm -rf /tmp/probe-dl
|
||||||
|
mkdir -p /tmp/probe-dl
|
||||||
|
if $PROBE/pip download \
|
||||||
|
--quiet \
|
||||||
|
--no-cache-dir \
|
||||||
|
--no-deps \
|
||||||
|
--dest /tmp/probe-dl \
|
||||||
|
"molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
|
||||||
|
>/dev/null 2>&1; then
|
||||||
|
WHEEL=$(ls /tmp/probe-dl/*.whl 2>/dev/null | head -1)
|
||||||
|
if [ -n "$WHEEL" ]; then
|
||||||
|
ACTUAL=$(sha256sum "$WHEEL" | awk '{print $1}')
|
||||||
|
if [ "$ACTUAL" = "$EXPECTED_SHA256" ]; then
|
||||||
|
echo "::notice::✓ pip resolves AND wheel content matches after ${i} poll(s) (sha256=${EXPECTED_SHA256})"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
# Hash mismatch: PyPI accepted our upload but Fastly
|
||||||
|
# is serving different bytes under the version's URL.
|
||||||
|
# Most often this is propagation lag of the BINARY
|
||||||
|
# surface — the version is resolvable but the wheel
|
||||||
|
# cache hasn't caught up. Retry.
|
||||||
|
echo "::warning::poll ${i}: wheel content mismatch (got ${ACTUAL:0:12}…, want ${EXPECTED_SHA256:0:12}…) — Fastly likely still serving stale binary, retrying"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
sleep 4
|
||||||
|
done
|
||||||
|
echo "::error::pip never resolved molecule-ai-workspace-runtime==${RUNTIME_VERSION} with matching wheel content within ~5 min."
|
||||||
|
echo "::error::Expected wheel SHA256: ${EXPECTED_SHA256}"
|
||||||
|
echo "::error::Refusing to fan out cascade against stale or corrupt PyPI surfaces."
|
||||||
|
exit 1
|
||||||
|
|
||||||
|
- name: Fan out repository_dispatch
|
||||||
|
env:
|
||||||
|
# Fine-grained PAT with `actions:write` on the 8 template repos.
|
||||||
|
# GITHUB_TOKEN can't fire dispatches across repos — needs an explicit
|
||||||
|
# token. Stored as a repo secret; rotate per the standard schedule.
|
||||||
|
DISPATCH_TOKEN: ${{ secrets.TEMPLATE_DISPATCH_TOKEN }}
|
||||||
|
# Single source of truth: the publish job's output, which handles
|
||||||
|
# tag/manual-input/auto-bump uniformly. The previous fallback
|
||||||
|
# (`steps.version.outputs.version` from inside the cascade job)
|
||||||
|
# was a dead reference — different job, no shared step scope.
|
||||||
|
RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
|
||||||
|
run: |
|
||||||
|
set +e # don't abort on a single repo failure — collect them all
|
||||||
|
# Schedule-vs-dispatch behaviour split (hardened 2026-04-28
|
||||||
|
# after the sweep-cf-orphans soft-skip incident — same class
|
||||||
|
# of bug):
|
||||||
|
#
|
||||||
|
# The earlier "skipping cascade. templates will pick up the
|
||||||
|
# new version on their own next rebuild" message was wrong —
|
||||||
|
# templates only build on this dispatch trigger; without it
|
||||||
|
# they stay pinned to whatever runtime version they last saw.
|
||||||
|
# A silent skip here means "PyPI is current, templates are
|
||||||
|
# not" and the gap is invisible until someone notices a
|
||||||
|
# template still on the old version weeks later.
|
||||||
|
#
|
||||||
|
# - push → exit 1 (red CI surfaces the gap)
|
||||||
|
# - workflow_dispatch → exit 0 with a warning (operator
|
||||||
|
# ran this ad-hoc; let them rerun
|
||||||
|
# after fixing the secret)
|
||||||
|
if [ -z "$DISPATCH_TOKEN" ]; then
|
||||||
|
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||||
|
echo "::warning::TEMPLATE_DISPATCH_TOKEN secret not set — skipping cascade."
|
||||||
|
echo "::warning::set it at Settings → Secrets and Variables → Actions, then rerun. Templates will stay on the prior runtime version until either this token is set or each template is rebuilt manually."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
echo "::error::TEMPLATE_DISPATCH_TOKEN secret missing — cascade cannot fan out."
|
||||||
|
echo "::error::PyPI was published, but the 8 template repos will NOT pick up the new version until this token is restored and a republish dispatches the cascade."
|
||||||
|
echo "::error::set it at Settings → Secrets and Variables → Actions; then re-trigger publish-runtime via workflow_dispatch."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
VERSION="$RUNTIME_VERSION"
|
||||||
|
if [ -z "$VERSION" ]; then
|
||||||
|
echo "::error::publish job did not expose a version output — cascade cannot fan out"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
# All 9 active workspace template repos. The PR #2536 pruning
|
||||||
|
# ("deprecated, no shipping images") was empirically wrong:
|
||||||
|
# continuous-synth-e2e.yml defaults to langgraph as its primary
|
||||||
|
# canary (line 44), and every excluded template had successful
|
||||||
|
# publish-image runs as of 2026-05-03 — none were dormant.
|
||||||
|
# Symptom of the prune: today's a2a-sdk strict-mode fix
|
||||||
|
# (#2566 / commit e1628c4) cascaded to 4 templates but never
|
||||||
|
# reached langgraph, so the synth-E2E correctly canary'd a fix
|
||||||
|
# that had landed but not deployed. Re-added the 5 templates.
|
||||||
|
# Long-term: derive this list from manifest.json so cascade
|
||||||
|
# scope can't drift from E2E scope — tracked in RFC #388 as a
|
||||||
|
# Phase-1 invariant.
|
||||||
|
TEMPLATES="claude-code hermes openclaw codex langgraph crewai autogen deepagents gemini-cli"
|
||||||
|
FAILED=""
|
||||||
|
for tpl in $TEMPLATES; do
|
||||||
|
REPO="molecule-ai/molecule-ai-workspace-template-$tpl"
|
||||||
|
STATUS=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" \
|
||||||
|
-X POST "https://api.github.com/repos/$REPO/dispatches" \
|
||||||
|
-H "Authorization: Bearer $DISPATCH_TOKEN" \
|
||||||
|
-H "Accept: application/vnd.github+json" \
|
||||||
|
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||||
|
-d "{\"event_type\":\"runtime-published\",\"client_payload\":{\"runtime_version\":\"$VERSION\"}}")
|
||||||
|
if [ "$STATUS" = "204" ]; then
|
||||||
|
echo "✓ dispatched $tpl ($VERSION)"
|
||||||
|
else
|
||||||
|
echo "::warning::✗ failed to dispatch $tpl: HTTP $STATUS — $(cat /tmp/dispatch.out)"
|
||||||
|
FAILED="$FAILED $tpl"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ -n "$FAILED" ]; then
|
||||||
|
echo "::warning::Cascade incomplete. Failed templates:$FAILED"
|
||||||
|
# Don't fail the whole job — PyPI publish already succeeded;
|
||||||
|
# operators can retry the failed templates manually.
|
||||||
|
fi
|
||||||
185
.github/workflows/publish-workspace-server-image.yml
vendored
Normal file
185
.github/workflows/publish-workspace-server-image.yml
vendored
Normal file
@ -0,0 +1,185 @@
|
|||||||
|
name: publish-workspace-server-image
|
||||||
|
|
||||||
|
# Builds and pushes Docker images to GHCR on staging or main pushes.
|
||||||
|
# EC2 tenant instances pull the tenant image from GHCR.
|
||||||
|
#
|
||||||
|
# Branch / tag policy (see Compute tags step for the per-branch logic):
|
||||||
|
#
|
||||||
|
# staging push → builds image, tags :staging-<sha> + :staging-latest.
|
||||||
|
# staging-CP pins TENANT_IMAGE=:staging-latest, so it
|
||||||
|
# picks up staging-branch code automatically. This is
|
||||||
|
# what makes staging-CP actually test staging-branch
|
||||||
|
# code instead of "yesterday's main" — pre-fix, this
|
||||||
|
# workflow only ran on main, so staging tenants
|
||||||
|
# silently served stale code (#2308 fix RFC #2312
|
||||||
|
# landed on staging but never reached tenants because
|
||||||
|
# staging→main was wedged on path-filter parity bugs).
|
||||||
|
#
|
||||||
|
# main push → builds image, tags :staging-<sha> + :staging-latest
|
||||||
|
# (same as before). canary-verify.yml retags
|
||||||
|
# :staging-<sha> → :latest after canary tenants
|
||||||
|
# green-light the digest. The :staging-latest retag
|
||||||
|
# on main push is intentional: when main lands AFTER a
|
||||||
|
# staging push, staging-CP gets the post-promote code
|
||||||
|
# (which equals what it had + any merge resolution),
|
||||||
|
# so the canary-on-staging-CP step still runs against
|
||||||
|
# the prod-bound digest.
|
||||||
|
#
|
||||||
|
# In the steady state both branches refresh :staging-latest; the
|
||||||
|
# semantic is "most recent staging-or-main build of tenant code."
|
||||||
|
# Drift between the two is bounded by the staging→main auto-promote
|
||||||
|
# cadence and is corrected on the next staging push.
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [staging, main]
|
||||||
|
paths:
|
||||||
|
- 'workspace-server/**'
|
||||||
|
- 'canvas/**'
|
||||||
|
- 'manifest.json'
|
||||||
|
- 'scripts/**'
|
||||||
|
- '.github/workflows/publish-workspace-server-image.yml'
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
# Serialize per-branch so two rapid staging pushes don't race the same
|
||||||
|
# :staging-latest tag retag. Allow staging and main to run in parallel
|
||||||
|
# (different github.ref → different concurrency group) since they
|
||||||
|
# produce different :staging-<sha> tags and last-write-wins on
|
||||||
|
# :staging-latest is acceptable across branches (the post-promote
|
||||||
|
# main code equals current staging code in a healthy flow).
|
||||||
|
#
|
||||||
|
# cancel-in-progress: false → in-flight builds finish; the next push's
|
||||||
|
# build queues. This avoids a partially-pushed image and keeps the
|
||||||
|
# canary fleet pin (:staging-<sha>) consistent with what was actually
|
||||||
|
# tested at canary-verify time.
|
||||||
|
concurrency:
|
||||||
|
group: publish-workspace-server-image-${{ github.ref }}
|
||||||
|
cancel-in-progress: false
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
packages: write
|
||||||
|
|
||||||
|
env:
|
||||||
|
IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
|
||||||
|
TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build-and-push:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||||
|
|
||||||
|
# github-app-auth sibling-checkout removed 2026-05-07 (#157):
|
||||||
|
# plugin was dropped + workspace-server/Dockerfile no longer
|
||||||
|
# COPYs it.
|
||||||
|
|
||||||
|
- name: Configure AWS credentials for ECR
|
||||||
|
# GHCR was the pre-suspension target; the molecule-ai org on
|
||||||
|
# GitHub got swept 2026-05-06 and ghcr.io/molecule-ai/* is no
|
||||||
|
# longer reachable. Post-suspension target is the operator's
|
||||||
|
# ECR org (153263036946.dkr.ecr.us-east-2.amazonaws.com/
|
||||||
|
# molecule-ai/*), which already hosts platform-tenant +
|
||||||
|
# workspace-template-* + runner-base images. AWS creds come
|
||||||
|
# from the AWS_ACCESS_KEY_ID/SECRET secrets bound to the
|
||||||
|
# molecule-cp IAM user. Closes #161.
|
||||||
|
uses: aws-actions/configure-aws-credentials@v4
|
||||||
|
with:
|
||||||
|
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||||
|
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||||
|
aws-region: us-east-2
|
||||||
|
|
||||||
|
- name: Log in to ECR
|
||||||
|
id: ecr-login
|
||||||
|
uses: aws-actions/amazon-ecr-login@v2
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
|
||||||
|
|
||||||
|
- name: Compute tags
|
||||||
|
id: tags
|
||||||
|
run: |
|
||||||
|
echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
# Canary-gated release flow:
|
||||||
|
# - This step always publishes :staging-<sha> + :staging-latest.
|
||||||
|
# - On staging push, staging-CP picks up :staging-latest immediately
|
||||||
|
# (its TENANT_IMAGE pin is :staging-latest) — so staging-branch
|
||||||
|
# code reaches staging tenants without waiting for main.
|
||||||
|
# - On main push, canary-verify.yml runs smoke tests against
|
||||||
|
# canary tenants (which pin :staging-<sha>), and on green retags
|
||||||
|
# :staging-<sha> → :latest. Prod tenants pull :latest.
|
||||||
|
# - On red, :latest stays on the prior good digest — prod is safe.
|
||||||
|
#
|
||||||
|
# Why :staging-latest is retagged on main push too: when main lands
|
||||||
|
# after a staging promote, staging-CP gets the post-promote code so
|
||||||
|
# the canary-on-staging-CP step still runs against the prod-bound
|
||||||
|
# digest. In a healthy flow the post-promote main code == the
|
||||||
|
# current staging code, so this is effectively a no-op except for
|
||||||
|
# the canary fleet pin handoff.
|
||||||
|
#
|
||||||
|
# Pre-fix history: this workflow used to only trigger on main. That
|
||||||
|
# meant staging-CP served "yesterday's main" indefinitely whenever
|
||||||
|
# staging→main was wedged. The 2026-04-30 dogfooding session
|
||||||
|
# surfaced this when RFC #2312 (chat upload HTTP-forward) landed on
|
||||||
|
# staging but staging tenants kept failing chat upload because they
|
||||||
|
# were running pre-RFC code. Adding the staging trigger above closes
|
||||||
|
# that gap. Earlier 2026-04-24 incident: a static :staging-<sha> pin
|
||||||
|
# drifted 10 days behind staging — same class of bug, different
|
||||||
|
# mechanism.
|
||||||
|
- name: Build & push platform image to GHCR (staging-<sha> + staging-latest)
|
||||||
|
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: ./workspace-server/Dockerfile
|
||||||
|
platforms: linux/amd64
|
||||||
|
push: true
|
||||||
|
tags: |
|
||||||
|
${{ env.IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }}
|
||||||
|
${{ env.IMAGE_NAME }}:staging-latest
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
# GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
|
||||||
|
# returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
|
||||||
|
# This is the same value as the OCI revision label below; passing
|
||||||
|
# it twice is intentional, the OCI label is for registry tooling
|
||||||
|
# while /buildinfo is for the redeploy verification step.
|
||||||
|
build-args: |
|
||||||
|
GIT_SHA=${{ github.sha }}
|
||||||
|
labels: |
|
||||||
|
org.opencontainers.image.source=https://github.com/${{ github.repository }}
|
||||||
|
org.opencontainers.image.revision=${{ github.sha }}
|
||||||
|
org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify
|
||||||
|
|
||||||
|
- name: Build & push tenant image to GHCR (staging-<sha> + staging-latest)
|
||||||
|
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: ./workspace-server/Dockerfile.tenant
|
||||||
|
platforms: linux/amd64
|
||||||
|
push: true
|
||||||
|
tags: |
|
||||||
|
${{ env.TENANT_IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }}
|
||||||
|
${{ env.TENANT_IMAGE_NAME }}:staging-latest
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
# Canvas uses same-origin fetches. The tenant Go platform
|
||||||
|
# reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL
|
||||||
|
# env; the tenant's /canvas/viewport, /approvals/pending,
|
||||||
|
# /org/templates etc. live on the tenant platform itself.
|
||||||
|
# Both legs share one origin (the tenant subdomain) so
|
||||||
|
# PLATFORM_URL="" forces canvas to fetch paths as relative,
|
||||||
|
# which land same-origin.
|
||||||
|
#
|
||||||
|
# Self-hosted / private-label deployments override this at
|
||||||
|
# build time with a specific backend (e.g. local dev:
|
||||||
|
# NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
|
||||||
|
build-args: |
|
||||||
|
NEXT_PUBLIC_PLATFORM_URL=
|
||||||
|
GIT_SHA=${{ github.sha }}
|
||||||
|
labels: |
|
||||||
|
org.opencontainers.image.source=https://github.com/${{ github.repository }}
|
||||||
|
org.opencontainers.image.revision=${{ github.sha }}
|
||||||
|
org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify
|
||||||
|
|
||||||
34
.github/workflows/redeploy-tenants-on-main.yml
vendored
34
.github/workflows/redeploy-tenants-on-main.yml
vendored
@ -3,9 +3,9 @@ name: redeploy-tenants-on-main
|
|||||||
# Auto-refresh prod tenant EC2s after every main merge.
|
# Auto-refresh prod tenant EC2s after every main merge.
|
||||||
#
|
#
|
||||||
# Why this workflow exists: publish-workspace-server-image builds and
|
# Why this workflow exists: publish-workspace-server-image builds and
|
||||||
# pushes a new platform-tenant :<sha> to ECR on every merge to main,
|
# pushes a new platform-tenant:latest + :<sha> to GHCR on every merge
|
||||||
# but running tenants pulled their image once at boot and never re-pull.
|
# to main, but running tenants pulled their image once at boot and
|
||||||
# Users see stale code indefinitely.
|
# never re-pull. Users see stale code indefinitely.
|
||||||
#
|
#
|
||||||
# This workflow closes the gap by calling the control-plane admin
|
# This workflow closes the gap by calling the control-plane admin
|
||||||
# endpoint that performs a canary-first, batched, health-gated rolling
|
# endpoint that performs a canary-first, batched, health-gated rolling
|
||||||
@ -13,18 +13,12 @@ name: redeploy-tenants-on-main
|
|||||||
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
|
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
|
||||||
# (feat/tenant-auto-redeploy, landing alongside this workflow).
|
# (feat/tenant-auto-redeploy, landing alongside this workflow).
|
||||||
#
|
#
|
||||||
# Registry: ECR (153263036946.dkr.ecr.us-east-2.amazonaws.com/
|
|
||||||
# molecule-ai/platform-tenant). GHCR was retired 2026-05-07 during the
|
|
||||||
# Gitea suspension migration. The canary-verify.yml promote step now
|
|
||||||
# uses the same redeploy-fleet endpoint (fixes the silent-GHCR gap).
|
|
||||||
#
|
|
||||||
# Runtime ordering:
|
# Runtime ordering:
|
||||||
# 1. publish-workspace-server-image completes → new :staging-<sha> in ECR.
|
# 1. publish-workspace-server-image completes → new :latest in GHCR.
|
||||||
# 2. This workflow fires via workflow_run, calls redeploy-fleet with
|
# 2. This workflow fires via workflow_run, waits 30s for GHCR's
|
||||||
# target_tag=staging-<sha>. No CDN propagation wait needed —
|
# CDN to propagate the new tag to the region the tenants pull from.
|
||||||
# ECR image manifest is consistent immediately after push.
|
# 3. Calls redeploy-fleet with canary_slug=hongming and a 60s
|
||||||
# 3. Calls redeploy-fleet with canary_slug (if set) and a soak
|
# soak. Canary proves the image boots; batches follow.
|
||||||
# period. Canary proves the image boots; batches follow.
|
|
||||||
# 4. Any failure aborts the rollout and leaves older tenants on the
|
# 4. Any failure aborts the rollout and leaves older tenants on the
|
||||||
# prior image — safer default than half-and-half state.
|
# prior image — safer default than half-and-half state.
|
||||||
#
|
#
|
||||||
@ -114,11 +108,13 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
timeout-minutes: 25
|
timeout-minutes: 25
|
||||||
steps:
|
steps:
|
||||||
- name: Note on ECR propagation
|
- name: Wait for GHCR tag propagation
|
||||||
# ECR image manifests are consistent immediately after push — no
|
# GHCR's edge cache takes ~15-30s to consistently serve the new
|
||||||
# CDN cache to wait for. The old GHCR-based workflow had a 30s
|
# manifest after the registry accepts the push. Without this
|
||||||
# sleep to avoid race conditions; ECR makes that unnecessary.
|
# sleep, the first tenant's docker pull sometimes races and
|
||||||
run: echo "ECR image available immediately after push — proceeding."
|
# fetches the previous digest; sleeping is the cheapest way to
|
||||||
|
# reduce that without polling GHCR for the new digest.
|
||||||
|
run: sleep 30
|
||||||
|
|
||||||
- name: Compute target tag
|
- name: Compute target tag
|
||||||
id: tag
|
id: tag
|
||||||
|
|||||||
@ -36,7 +36,7 @@ on:
|
|||||||
workflow_run:
|
workflow_run:
|
||||||
workflows: ['publish-workspace-server-image']
|
workflows: ['publish-workspace-server-image']
|
||||||
types: [completed]
|
types: [completed]
|
||||||
branches: [main]
|
branches: [staging]
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
inputs:
|
inputs:
|
||||||
target_tag:
|
target_tag:
|
||||||
|
|||||||
105
.github/workflows/retarget-main-to-staging.yml
vendored
Normal file
105
.github/workflows/retarget-main-to-staging.yml
vendored
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
name: Retarget main PRs to staging
|
||||||
|
|
||||||
|
# Mechanical enforcement of SHARED_RULES rule 8 ("Staging-first workflow, no
|
||||||
|
# exceptions"). When a bot opens a PR against main, retarget it to staging
|
||||||
|
# automatically and leave an explanatory comment. Human CEO-authored PRs (the
|
||||||
|
# staging→main promotion PR, etc.) are left alone — they're the authorised
|
||||||
|
# exception to the rule.
|
||||||
|
#
|
||||||
|
# Why an Action instead of only a prompt rule: prompt rules depend on every
|
||||||
|
# role's system-prompt.md staying in sync. Today 5 of 8 engineer roles
|
||||||
|
# (core-be, core-fe, app-fe, app-qa, devops-engineer) don't have the
|
||||||
|
# staging-first section — the bot keeps opening PRs to main. An Action
|
||||||
|
# enforces the invariant regardless of prompt drift.
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request_target:
|
||||||
|
types: [opened, reopened]
|
||||||
|
branches: [main]
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
pull-requests: write
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
retarget:
|
||||||
|
name: Retarget to staging
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
# Only fire for bot-authored PRs. Human CEO PRs (staging→main promotion)
|
||||||
|
# are intentional and pass through.
|
||||||
|
#
|
||||||
|
# Head-ref guard: never retarget a PR whose head IS `staging` — those
|
||||||
|
# are the auto-promote staging→main PRs (opened by molecule-ai[bot]
|
||||||
|
# since #2586 switched to an App token, which now passes the bot
|
||||||
|
# filter below). Retargeting head=staging onto base=staging fails
|
||||||
|
# with HTTP 422 "no new commits between base 'staging' and head
|
||||||
|
# 'staging'", which used to surface as a noisy red workflow run on
|
||||||
|
# every auto-promote (caught 2026-05-03 on PR #2588).
|
||||||
|
if: >-
|
||||||
|
github.event.pull_request.head.ref != 'staging'
|
||||||
|
&& (
|
||||||
|
github.event.pull_request.user.type == 'Bot'
|
||||||
|
|| endsWith(github.event.pull_request.user.login, '[bot]')
|
||||||
|
|| github.event.pull_request.user.login == 'app/molecule-ai'
|
||||||
|
|| github.event.pull_request.user.login == 'molecule-ai[bot]'
|
||||||
|
)
|
||||||
|
steps:
|
||||||
|
- name: Retarget PR base to staging
|
||||||
|
id: retarget
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||||
|
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
|
||||||
|
# Issue #1884: when the bot opens a PR against main and there's
|
||||||
|
# already another PR on the same head branch targeting staging,
|
||||||
|
# GitHub's PATCH /pulls returns 422 with
|
||||||
|
# "A pull request already exists for base branch 'staging' …".
|
||||||
|
# The retarget can't proceed — but the right response is to
|
||||||
|
# close the now-redundant main-PR, not to fail the workflow
|
||||||
|
# noisily. Detect that specific 422 and close instead.
|
||||||
|
run: |
|
||||||
|
set +e
|
||||||
|
echo "Retargeting PR #${PR_NUMBER} (author: ${PR_AUTHOR}) from main → staging"
|
||||||
|
PATCH_OUTPUT=$(gh api -X PATCH \
|
||||||
|
"repos/${{ github.repository }}/pulls/${PR_NUMBER}" \
|
||||||
|
-f base=staging \
|
||||||
|
--jq '.base.ref' 2>&1)
|
||||||
|
PATCH_EXIT=$?
|
||||||
|
set -e
|
||||||
|
if [ "$PATCH_EXIT" -eq 0 ]; then
|
||||||
|
echo "::notice::Retargeted PR #${PR_NUMBER} → staging"
|
||||||
|
echo "outcome=retargeted" >> "$GITHUB_OUTPUT"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
# Specifically match the 422 duplicate-base/head error so
|
||||||
|
# any OTHER PATCH failure (auth, deleted PR, etc.) still
|
||||||
|
# surfaces as a real workflow failure.
|
||||||
|
if echo "$PATCH_OUTPUT" | grep -q "pull request already exists for base branch 'staging'"; then
|
||||||
|
echo "::notice::PR #${PR_NUMBER}: duplicate target-staging PR exists on same head — closing this main-PR as redundant."
|
||||||
|
gh pr close "$PR_NUMBER" \
|
||||||
|
--repo "${{ github.repository }}" \
|
||||||
|
--comment "[retarget-bot] Closing — another PR on the same head branch already targets \`staging\`. This PR is redundant. See issue #1884 for the rationale."
|
||||||
|
echo "outcome=closed-as-duplicate" >> "$GITHUB_OUTPUT"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
echo "::error::Retarget PATCH failed and was NOT a duplicate-base error:"
|
||||||
|
echo "$PATCH_OUTPUT" >&2
|
||||||
|
exit 1
|
||||||
|
|
||||||
|
- name: Post explainer comment
|
||||||
|
if: steps.retarget.outputs.outcome == 'retargeted'
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||||
|
run: |
|
||||||
|
gh pr comment "$PR_NUMBER" \
|
||||||
|
--repo "${{ github.repository }}" \
|
||||||
|
--body "$(cat <<'BODY'
|
||||||
|
[retarget-bot] This PR was opened against `main` and has been retargeted to `staging` automatically.
|
||||||
|
|
||||||
|
**Why:** per [SHARED_RULES rule 8](https://github.com/molecule-ai/molecule-ai-org-template-molecule-dev/blob/main/SHARED_RULES.md), all feature work targets `staging` first; the CEO promotes `staging → main` separately.
|
||||||
|
|
||||||
|
**What changed:** just the base branch — no code change. CI will re-run against `staging`. If you get merge conflicts, rebase on `staging`.
|
||||||
|
|
||||||
|
**If this PR is the CEO's staging→main promotion:** the Action skipped you (only bot-authored PRs are retargeted). If you see this comment on your CEO PR, that's a bug — please tag @HongmingWang-Rabbit.
|
||||||
|
BODY
|
||||||
|
)"
|
||||||
2
.github/workflows/secret-pattern-drift.yml
vendored
2
.github/workflows/secret-pattern-drift.yml
vendored
@ -48,7 +48,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
timeout-minutes: 5
|
timeout-minutes: 5
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
- uses: actions/checkout@v6
|
||||||
|
|
||||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
||||||
with:
|
with:
|
||||||
|
|||||||
@ -7,24 +7,33 @@ name: Secret scan
|
|||||||
# slurping the URL from a token-embedded origin remote. We can't fix
|
# slurping the URL from a token-embedded origin remote. We can't fix
|
||||||
# upstream's clone hygiene, so we gate here.
|
# upstream's clone hygiene, so we gate here.
|
||||||
#
|
#
|
||||||
|
# Also the canonical reusable workflow for the rest of the org. Other
|
||||||
|
# Molecule-AI repos enroll with a single 3-line workflow:
|
||||||
|
#
|
||||||
|
# jobs:
|
||||||
|
# secret-scan:
|
||||||
|
# uses: molecule-ai/molecule-core/.github/workflows/secret-scan.yml@staging
|
||||||
|
#
|
||||||
|
# Pin to @staging not @main — staging is the active default branch,
|
||||||
|
# main lags via the staging-promotion workflow. Updates ride along
|
||||||
|
# automatically on the next consumer workflow run.
|
||||||
|
#
|
||||||
# Same regex set as the runtime's bundled pre-commit hook
|
# Same regex set as the runtime's bundled pre-commit hook
|
||||||
# (molecule-ai-workspace-runtime: molecule_runtime/scripts/pre-commit-checks.sh).
|
# (molecule-ai-workspace-runtime: molecule_runtime/scripts/pre-commit-checks.sh).
|
||||||
# Keep the two sides aligned when adding patterns.
|
# Keep the two sides aligned when adding patterns.
|
||||||
#
|
|
||||||
# Ported from .github/workflows/secret-scan.yml so the gate actually
|
|
||||||
# fires on Gitea Actions. Differences from the GitHub version:
|
|
||||||
# - drops `merge_group` event (Gitea has no merge queue)
|
|
||||||
# - drops `workflow_call` (no cross-repo reusable invocation on Gitea)
|
|
||||||
# - SELF path updated to .gitea/workflows/secret-scan.yml
|
|
||||||
# The job name + step name are identical to the GitHub workflow so the
|
|
||||||
# status-check context (`Secret scan / Scan diff for credential-shaped
|
|
||||||
# strings (pull_request)`) matches branch protection on molecule-core/main.
|
|
||||||
|
|
||||||
on:
|
on:
|
||||||
pull_request:
|
pull_request:
|
||||||
types: [opened, synchronize, reopened]
|
types: [opened, synchronize, reopened]
|
||||||
push:
|
push:
|
||||||
branches: [main, staging]
|
branches: [main, staging]
|
||||||
|
# Required for GitHub merge queue: the queue's pre-merge CI run on
|
||||||
|
# `gh-readonly-queue/...` refs needs this check to fire so the queue
|
||||||
|
# gets a real result instead of stalling forever AWAITING_CHECKS.
|
||||||
|
merge_group:
|
||||||
|
types: [checks_requested]
|
||||||
|
# Reusable workflow entry point for other Molecule-AI repos.
|
||||||
|
workflow_call:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
scan:
|
scan:
|
||||||
@ -41,14 +50,27 @@ jobs:
|
|||||||
if: github.event_name == 'pull_request'
|
if: github.event_name == 'pull_request'
|
||||||
run: git fetch --depth=1 origin ${{ github.event.pull_request.base.sha }}
|
run: git fetch --depth=1 origin ${{ github.event.pull_request.base.sha }}
|
||||||
|
|
||||||
|
# For merge_group events the queue's pre-merge ref is a commit on
|
||||||
|
# `gh-readonly-queue/...` whose parent is the queue's base_sha.
|
||||||
|
# That parent isn't part of the queue branch's shallow clone, so
|
||||||
|
# we fetch it explicitly. Without this the diff falls through to
|
||||||
|
# "no BASE → scan entire tree" mode and false-positives on legit
|
||||||
|
# test fixtures (e.g. canvas/src/lib/validation/__tests__/secret-formats.test.ts).
|
||||||
|
- name: Fetch merge_group base SHA (merge_group events only)
|
||||||
|
if: github.event_name == 'merge_group'
|
||||||
|
run: git fetch --depth=1 origin ${{ github.event.merge_group.base_sha }}
|
||||||
|
|
||||||
- name: Refuse if credential-shaped strings appear in diff additions
|
- name: Refuse if credential-shaped strings appear in diff additions
|
||||||
env:
|
env:
|
||||||
# Plumb event-specific SHAs through env so the script doesn't
|
# Plumb event-specific SHAs through env so the script doesn't
|
||||||
# need conditional `${{ ... }}` interpolation per event type.
|
# need conditional `${{ ... }}` interpolation per event type.
|
||||||
# github.event.before/after only exist on push events;
|
# github.event.before/after only exist on push events;
|
||||||
# pull_request has pull_request.base.sha / pull_request.head.sha.
|
# merge_group has its own base_sha/head_sha; pull_request has
|
||||||
|
# pull_request.base.sha / pull_request.head.sha.
|
||||||
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
||||||
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
|
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
|
||||||
|
MG_BASE_SHA: ${{ github.event.merge_group.base_sha }}
|
||||||
|
MG_HEAD_SHA: ${{ github.event.merge_group.head_sha }}
|
||||||
PUSH_BEFORE: ${{ github.event.before }}
|
PUSH_BEFORE: ${{ github.event.before }}
|
||||||
PUSH_AFTER: ${{ github.event.after }}
|
PUSH_AFTER: ${{ github.event.after }}
|
||||||
run: |
|
run: |
|
||||||
@ -80,6 +102,10 @@ jobs:
|
|||||||
BASE="$PR_BASE_SHA"
|
BASE="$PR_BASE_SHA"
|
||||||
HEAD="$PR_HEAD_SHA"
|
HEAD="$PR_HEAD_SHA"
|
||||||
;;
|
;;
|
||||||
|
merge_group)
|
||||||
|
BASE="$MG_BASE_SHA"
|
||||||
|
HEAD="$MG_HEAD_SHA"
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
BASE="$PUSH_BEFORE"
|
BASE="$PUSH_BEFORE"
|
||||||
HEAD="$PUSH_AFTER"
|
HEAD="$PUSH_AFTER"
|
||||||
@ -118,10 +144,8 @@ jobs:
|
|||||||
|
|
||||||
# Self-exclude: this workflow file legitimately contains the
|
# Self-exclude: this workflow file legitimately contains the
|
||||||
# pattern strings as regex literals. Without an exclude it would
|
# pattern strings as regex literals. Without an exclude it would
|
||||||
# block its own merge. Both the .github/ original and this
|
# block its own merge.
|
||||||
# .gitea/ port are excluded so a sync between them stays clean.
|
SELF=".github/workflows/secret-scan.yml"
|
||||||
SELF_GITHUB=".github/workflows/secret-scan.yml"
|
|
||||||
SELF_GITEA=".gitea/workflows/secret-scan.yml"
|
|
||||||
|
|
||||||
OFFENDING=""
|
OFFENDING=""
|
||||||
# `while IFS= read -r` (not `for f in $CHANGED`) so filenames
|
# `while IFS= read -r` (not `for f in $CHANGED`) so filenames
|
||||||
@ -131,8 +155,7 @@ jobs:
|
|||||||
# self-exclude + diff lookup.
|
# self-exclude + diff lookup.
|
||||||
while IFS= read -r f; do
|
while IFS= read -r f; do
|
||||||
[ -z "$f" ] && continue
|
[ -z "$f" ] && continue
|
||||||
[ "$f" = "$SELF_GITHUB" ] && continue
|
[ "$f" = "$SELF" ] && continue
|
||||||
[ "$f" = "$SELF_GITEA" ] && continue
|
|
||||||
if [ -n "$DIFF_RANGE" ]; then
|
if [ -n "$DIFF_RANGE" ]; then
|
||||||
ADDED=$(git diff --no-color --unified=0 "$BASE" "$HEAD" -- "$f" 2>/dev/null | grep -E '^\+[^+]' || true)
|
ADDED=$(git diff --no-color --unified=0 "$BASE" "$HEAD" -- "$f" 2>/dev/null | grep -E '^\+[^+]' || true)
|
||||||
else
|
else
|
||||||
7
.gitignore
vendored
7
.gitignore
vendored
@ -131,13 +131,6 @@ backups/
|
|||||||
# Cloned by publish-workspace-server-image.yml so the Dockerfile's
|
# Cloned by publish-workspace-server-image.yml so the Dockerfile's
|
||||||
# replace-directive path resolves. Lives in its own repo.
|
# replace-directive path resolves. Lives in its own repo.
|
||||||
/molecule-ai-plugin-github-app-auth/
|
/molecule-ai-plugin-github-app-auth/
|
||||||
# Tenant-image build context — populated by the workflow's
|
|
||||||
# "Pre-clone manifest deps" step. Mirrors the public manifest, holds the
|
|
||||||
# same content as the three /<>/ dirs above but namespaced under one
|
|
||||||
# parent so the Docker build context is a single COPY-friendly tree.
|
|
||||||
# Each entry is a transient working-dir, never source-of-truth, never
|
|
||||||
# committed.
|
|
||||||
/.tenant-bundle-deps/
|
|
||||||
|
|
||||||
# Internal-flavored content lives in Molecule-AI/internal — NEVER in this
|
# Internal-flavored content lives in Molecule-AI/internal — NEVER in this
|
||||||
# public monorepo. Migrated 2026-04-23 (CEO directive). The CI workflow
|
# public monorepo. Migrated 2026-04-23 (CEO directive). The CI workflow
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user