fix(runbooks): correct Gitea runner fetch timing facts (post-#457) (#478 )

Co-authored-by: Molecule AI Infra-SRE <infra-sre@agents.moleculesai.app> Co-committed-by: Molecule AI Infra-SRE <infra-sre@agents.moleculesai.app>
Merge pull request 'tools: gate-check-v3 MVP — automated SOP-6 + CI gate detector' (#393 ) from tools/gate-check-v3 into main
2026-05-11 13:45:42 +00:00 · 2026-05-11 13:41:08 +00:00 · 2026-05-11 13:38:02 +00:00 · 2026-05-11 13:38:02 +00:00 · 2026-05-11 13:38:02 +00:00 · 2026-05-11 13:38:02 +00:00
337 changed files with 30948 additions and 4794 deletions
--- a/.gitea/scripts/audit-force-merge.sh
+++ b/.gitea/scripts/audit-force-merge.sh
@ -0,0 +1,118 @@
 #!/usr/bin/env bash
 # audit-force-merge — detect a §SOP-6 force-merge after PR close, emit
 # `incident.force_merge` to stdout as structured JSON.
 #
 # Vector's docker_logs source picks up runner stdout; the JSON gets
 # shipped to Loki on molecule-canonical-obs, indexable by event_type.
 # Query example:
 #
 #   {host="operator"} |= "event_type" |= "incident.force_merge" | json
 #
 # A force-merge is detected when a PR closed-with-merged=true had at
 # least one of the repo's required-status-check contexts in a state
 # other than "success" at the merge commit's SHA. That's exactly what
 # the Gitea force_merge:true API call lets through, so it's a faithful
 # detector of the override path.
 #
 # Triggers on `pull_request_target: closed` (loaded from base branch
 # per §SOP-6 security model). No-op when merged=false.
 #
 # Required env (set by the workflow):
 #   GITEA_TOKEN, GITEA_HOST, REPO, PR_NUMBER, REQUIRED_CHECKS
 #
 # REQUIRED_CHECKS is a newline-separated list of status-check context
 # names that branch protection requires. Declared in the workflow YAML
 # rather than fetched from /branch_protections (which needs admin
 # scope — sop-tier-bot has read-only). Trade dynamism for simplicity:
 # when the required-check set changes, update both branch protection
 # AND this env. Keeping them in sync is less complexity than granting
 # the audit bot admin perms on every repo.
 set -euo pipefail
 : "${GITEA_TOKEN:?required}"
 : "${GITEA_HOST:?required}"
 : "${REPO:?required}"
 : "${PR_NUMBER:?required}"
 : "${REQUIRED_CHECKS:?required (newline-separated context names)}"
 OWNER="${REPO%%/*}"
 NAME="${REPO##*/}"
 API="https://${GITEA_HOST}/api/v1"
 AUTH="Authorization: token ${GITEA_TOKEN}"
 # 1. Fetch the PR. If not merged, no-op.
 PR=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}")
 MERGED=$(echo "$PR" | jq -r '.merged // false')
 if [ "$MERGED" != "true" ]; then
  echo "::notice::PR #${PR_NUMBER} closed without merge — no audit emission."
  exit 0
 fi
 MERGE_SHA=$(echo "$PR" | jq -r '.merge_commit_sha // empty')
 MERGED_BY=$(echo "$PR" | jq -r '.merged_by.login // "unknown"')
 TITLE=$(echo "$PR" | jq -r '.title // ""')
 BASE_BRANCH=$(echo "$PR" | jq -r '.base.ref // "main"')
 HEAD_SHA=$(echo "$PR" | jq -r '.head.sha // empty')
 if [ -z "$MERGE_SHA" ]; then
  echo "::warning::PR #${PR_NUMBER} merged=true but no merge_commit_sha — cannot evaluate force-merge."
  exit 0
 fi
 # 2. Required status checks declared in the workflow env.
 REQUIRED="$REQUIRED_CHECKS"
 if [ -z "${REQUIRED//[[:space:]]/}" ]; then
  echo "::notice::REQUIRED_CHECKS empty — force-merge not applicable."
  exit 0
 fi
 # 3. Status-check state at the PR HEAD (where checks ran). The merge
 #    commit doesn't get its own checks; we evaluate the PR's last
 #    commit, which is what branch protection compared against.
 STATUS=$(curl -sS -H "$AUTH" \
  "${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/status")
 declare -A CHECK_STATE
 while IFS=$'\t' read -r ctx state; do
  [ -n "$ctx" ] && CHECK_STATE[$ctx]="$state"
 done < <(echo "$STATUS" | jq -r '.statuses // [] | .[] | "\(.context)\t\(.status)"')
 # 4. For each required check, was it green at merge? YAML block scalars
 #    (`|`) leave a trailing newline; skip blank/whitespace-only lines.
 FAILED_CHECKS=()
 while IFS= read -r req; do
  trimmed="${req#"${req%%[![:space:]]*}"}"   # ltrim
  trimmed="${trimmed%"${trimmed##*[![:space:]]}"}"  # rtrim
  [ -z "$trimmed" ] && continue
  state="${CHECK_STATE[$trimmed]:-missing}"
  if [ "$state" != "success" ]; then
    FAILED_CHECKS+=("${trimmed}=${state}")
  fi
 done <<< "$REQUIRED"
 if [ "${#FAILED_CHECKS[@]}" -eq 0 ]; then
  echo "::notice::PR #${PR_NUMBER} merged with all required checks green — not a force-merge."
  exit 0
 fi
 # 5. Emit structured audit event.
 NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
 FAILED_JSON=$(printf '%s\n' "${FAILED_CHECKS[@]}" | jq -R . | jq -s .)
 # Print as a single-line JSON so Vector's parse_json transform can pick
 # it up cleanly from docker_logs.
 jq -nc \
  --arg event_type "incident.force_merge" \
  --arg ts "$NOW" \
  --arg repo "$REPO" \
  --argjson pr "$PR_NUMBER" \
  --arg title "$TITLE" \
  --arg base "$BASE_BRANCH" \
  --arg merged_by "$MERGED_BY" \
  --arg merge_sha "$MERGE_SHA" \
  --argjson failed_checks "$FAILED_JSON" \
  '{event_type: $event_type, ts: $ts, repo: $repo, pr: $pr, title: $title,
    base_branch: $base, merged_by: $merged_by, merge_sha: $merge_sha,
    failed_checks: $failed_checks}'
 echo "::warning::FORCE-MERGE detected on PR #${PR_NUMBER} by ${MERGED_BY}: ${#FAILED_CHECKS[@]} required check(s) not green at merge time."
--- a/.gitea/scripts/ci-required-drift.py
+++ b/.gitea/scripts/ci-required-drift.py
@ -0,0 +1,591 @@
 #!/usr/bin/env python3
 """ci-required-drift — RFC internal#219 §4 + §6.
 Detects drift between three sources of "what counts as a required check"
 for this repo, files (or updates) a `[ci-drift]` Gitea issue when any
 pair diverges.
 Sources:
  A. `.gitea/workflows/ci.yml` jobs  (CI source — the actual job set)
  B. `status_check_contexts` in branch_protections (the merge gate)
  C. `REQUIRED_CHECKS` env in audit-force-merge.yml (the audit env)
 Three failure classes:
  F1  Job in (A) is not under the sentinel's `needs:` — sentinel
      doesn't gate it, so a red job on that name can sneak through.
      Ignores jobs whose `if:` references `github.event_name` (those
      run only on specific events and may be `skipped` legitimately).
  F2  Context in (B) corresponds to no emitter — i.e. there's no job
      in ci.yml whose runtime status-name maps to that context.
      A stale required-check name is silent: protection demands a
      green it never receives, but Gitea treats absent-as-pending,
      not absent-as-red. The gate degrades to advisory.
  F3  (B) and (C) are not set-equal. Audit env wider than protection
      → audit flags non-force-merges as force; narrower → real
      force-merges are missed.
 Idempotency:
  Searches OPEN issues by exact title prefix
  `[ci-drift] {repo}/{branch}: ` and either edits the existing one
  (if any) or POSTs a new one. Never spawns duplicates.
 Behavior-based AST gate per `feedback_behavior_based_ast_gates`:
  - Job set comes from PyYAML parse of jobs:* keys
  - Sentinel needs from PyYAML parse of jobs[sentinel].needs (a list)
  - Audit env from PyYAML parse, NOT grep — so reformatting the YAML
    (block-scalar `|` vs flow-style list) does not break the gate
 """
 from __future__ import annotations
 import argparse
 import json
 import os
 import sys
 import urllib.error
 import urllib.parse
 import urllib.request
 from typing import Any
 import yaml  # PyYAML 6.0.2 — installed by the workflow before this runs.
 # --------------------------------------------------------------------------
 # Environment
 # --------------------------------------------------------------------------
 def env(key: str, *, required: bool = True, default: str | None = None) -> str:
    val = os.environ.get(key, default)
    if required and not val:
        sys.stderr.write(f"::error::missing required env var: {key}\n")
        sys.exit(2)
    return val or ""
 GITEA_TOKEN = env("GITEA_TOKEN", required=False)
 GITEA_HOST = env("GITEA_HOST", required=False)
 REPO = env("REPO", required=False)
 BRANCHES = env("BRANCHES", required=False).split()
 SENTINEL_JOB = env("SENTINEL_JOB", required=False)
 AUDIT_WORKFLOW_PATH = env("AUDIT_WORKFLOW_PATH", required=False)
 CI_WORKFLOW_PATH = env("CI_WORKFLOW_PATH", required=False)
 DRIFT_LABEL = env("DRIFT_LABEL", required=False)
 OWNER, NAME = (REPO.split("/", 1) + [""])[:2] if REPO else ("", "")
 API = f"https://{GITEA_HOST}/api/v1" if GITEA_HOST else ""
 def _require_runtime_env() -> None:
    """Enforce env contract — called from `main()` only. Tests import
    individual functions without setting the full env contract."""
    for key in (
        "GITEA_TOKEN",
        "GITEA_HOST",
        "REPO",
        "BRANCHES",
        "SENTINEL_JOB",
        "AUDIT_WORKFLOW_PATH",
        "CI_WORKFLOW_PATH",
        "DRIFT_LABEL",
    ):
        if not os.environ.get(key):
            sys.stderr.write(f"::error::missing required env var: {key}\n")
            sys.exit(2)
 # --------------------------------------------------------------------------
 # Tiny HTTP helper (no requests dependency)
 # --------------------------------------------------------------------------
 class ApiError(RuntimeError):
    """Raised when a Gitea API call cannot be trusted to have succeeded.
    Covers non-2xx HTTP status AND 2xx with an unparseable JSON body on
    endpoints that are documented to return JSON (search/read). Callers
    that swallow this and proceed would risk e.g. creating duplicate
    `[ci-drift]` issues when a transient 500 hides an existing match.
    The cron retries hourly; one fail-loud cycle is fine — silent
    duplicate creation is not (per Five-Axis review on PR #112).
    """
 def api(
    method: str,
    path: str,
    *,
    body: dict | None = None,
    query: dict[str, str] | None = None,
    expect_json: bool = True,
 ) -> tuple[int, Any]:
    """Tiny HTTP helper around urllib.
    Raises ApiError on any non-2xx response. Callers that want
    best-effort semantics (e.g. label-apply) must `try/except ApiError`
    explicitly — making the failure-soft path opt-in rather than the
    default closes the duplicate-issue regression class.
    For 2xx responses with a JSON body that fails to parse, raises
    ApiError when `expect_json=True` (the default for read-shaped
    paths). On endpoints that legitimately return non-JSON success
    bodies (e.g. some Gitea create echoes — see
    `feedback_gitea_create_api_unparseable_response`), callers may pass
    `expect_json=False` to accept a `_raw` fallthrough — but they MUST
    then verify success via a follow-up GET, not by trusting the body.
    """
    url = f"{API}{path}"
    if query:
        url = f"{url}?{urllib.parse.urlencode(query)}"
    data = None
    headers = {
        "Authorization": f"token {GITEA_TOKEN}",
        "Accept": "application/json",
    }
    if body is not None:
        data = json.dumps(body).encode("utf-8")
        headers["Content-Type"] = "application/json"
    req = urllib.request.Request(url, method=method, data=data, headers=headers)
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read()
            status = resp.status
    except urllib.error.HTTPError as e:
        raw = e.read()
        status = e.code
    if not (200 <= status < 300):
        snippet = raw[:500].decode("utf-8", errors="replace") if raw else ""
        raise ApiError(
            f"{method} {path} → HTTP {status}: {snippet}"
        )
    if not raw:
        return status, None
    try:
        return status, json.loads(raw)
    except json.JSONDecodeError as e:
        if expect_json:
            raise ApiError(
                f"{method} {path} → HTTP {status} but body is not JSON: {e}"
            ) from e
        # Opt-in raw fallthrough for endpoints with known echo-quirks.
        return status, {"_raw": raw.decode("utf-8", errors="replace")}
 # --------------------------------------------------------------------------
 # YAML loaders — STRICT (reject GitHub-Actions-only syntax)
 # --------------------------------------------------------------------------
 def load_yaml(path: str) -> dict:
    """Load + parse a workflow YAML. Hard-fail if the file is missing
    or doesn't parse — drift-detect cannot make decisions without
    knowing the actual job set."""
    if not os.path.exists(path):
        sys.stderr.write(f"::error::file not found: {path}\n")
        sys.exit(3)
    with open(path, encoding="utf-8") as f:
        try:
            doc = yaml.safe_load(f)
        except yaml.YAMLError as e:
            sys.stderr.write(f"::error::YAML parse error in {path}: {e}\n")
            sys.exit(3)
    if not isinstance(doc, dict):
        sys.stderr.write(f"::error::{path} is not a YAML mapping\n")
        sys.exit(3)
    return doc
 def ci_jobs_all(ci_doc: dict) -> set[str]:
    """Every job key in ci.yml minus the sentinel itself. Used for F1b
    (sentinel.needs typo check) — needs that name a non-existent job
    is a typo regardless of event-gating."""
    jobs = ci_doc.get("jobs")
    if not isinstance(jobs, dict):
        sys.stderr.write("::error::ci.yml has no jobs: mapping\n")
        sys.exit(3)
    return {k for k in jobs if k != SENTINEL_JOB}
 def ci_job_names(ci_doc: dict) -> set[str]:
    """Set of job keys in ci.yml MINUS the sentinel itself MINUS jobs
    whose `if:` gates on `github.event_name` (those are event-scoped
    and can legitimately be `skipped` for a given trigger; if we
    required them under the sentinel `needs:`, every PR-only job
    would be `skipped` on push and the sentinel would interpret
    `skipped != success` as failure). RFC §4 spec.
    Used for F1 (jobs missing from sentinel needs). NOT used for F1b
    (typos in needs) — see `ci_jobs_all` for that."""
    jobs = ci_doc.get("jobs")
    if not isinstance(jobs, dict):
        sys.stderr.write("::error::ci.yml has no jobs: mapping\n")
        sys.exit(3)
    names: set[str] = set()
    for k, v in jobs.items():
        if k == SENTINEL_JOB:
            continue
        if isinstance(v, dict):
            gate = v.get("if")
            if isinstance(gate, str) and "github.event_name" in gate:
                continue
        names.add(k)
    return names
 def sentinel_needs(ci_doc: dict) -> set[str]:
    sentinel = ci_doc.get("jobs", {}).get(SENTINEL_JOB)
    if not isinstance(sentinel, dict):
        sys.stderr.write(
            f"::error::sentinel job '{SENTINEL_JOB}' not found in {CI_WORKFLOW_PATH}\n"
        )
        sys.exit(3)
    needs = sentinel.get("needs", [])
    if isinstance(needs, str):
        needs = [needs]
    if not isinstance(needs, list):
        sys.stderr.write("::error::sentinel `needs:` is neither list nor string\n")
        sys.exit(3)
    return set(needs)
 def required_checks_env(audit_doc: dict) -> set[str]:
    """Pull the REQUIRED_CHECKS env value from audit-force-merge.yml.
    Walks the YAML AST per `feedback_behavior_based_ast_gates`: we do
    NOT grep for `REQUIRED_CHECKS:` — that breaks under reformatting,
    multi-job workflows, or a future move of the env to a different
    step. Instead, look inside every job's every step's `env:` map."""
    found: list[str] = []
    jobs = audit_doc.get("jobs", {})
    if not isinstance(jobs, dict):
        sys.stderr.write(f"::warning::{AUDIT_WORKFLOW_PATH} has no jobs: mapping\n")
        return set()
    for job in jobs.values():
        if not isinstance(job, dict):
            continue
        for step in job.get("steps", []) or []:
            if not isinstance(step, dict):
                continue
            step_env = step.get("env") or {}
            if isinstance(step_env, dict) and "REQUIRED_CHECKS" in step_env:
                v = step_env["REQUIRED_CHECKS"]
                if isinstance(v, str):
                    found.append(v)
    if not found:
        sys.stderr.write(
            f"::error::REQUIRED_CHECKS env not found in any step of {AUDIT_WORKFLOW_PATH}\n"
        )
        sys.exit(3)
    if len(found) > 1:
        # Defensive: refuse to guess which one is canonical.
        sys.stderr.write(
            f"::error::REQUIRED_CHECKS env present in {len(found)} steps; ambiguous\n"
        )
        sys.exit(3)
    raw = found[0]
    # YAML block-scalars (`|`) leave a trailing newline + blanks; trim
    # consistently with audit-force-merge.sh's parser so both sides
    # produce identical sets.
    return {line.strip() for line in raw.splitlines() if line.strip()}
 # --------------------------------------------------------------------------
 # Mapping: ci.yml job-key  →  protection context name
 # --------------------------------------------------------------------------
 def expected_context(job_key: str, workflow_name: str = "ci") -> str:
    """Gitea Actions reports status-check contexts as
       "{workflow.name} / {job.name or job.key} ({event})".
    For ci.yml the event is `pull_request` on PRs (that's what
    `status_check_contexts` records). Job.name defaults to job.key
    when no `name:` is set. CP's ci.yml does NOT set per-job `name:`
    so the key equals the human-name."""
    return f"{workflow_name} / {job_key} (pull_request)"
 # --------------------------------------------------------------------------
 # Drift detection
 # --------------------------------------------------------------------------
 def detect_drift(branch: str) -> tuple[list[str], dict]:
    """Returns (findings, debug). Empty findings == no drift."""
    findings: list[str] = []
    ci_doc = load_yaml(CI_WORKFLOW_PATH)
    audit_doc = load_yaml(AUDIT_WORKFLOW_PATH)
    jobs = ci_job_names(ci_doc)
    jobs_all = ci_jobs_all(ci_doc)
    needs = sentinel_needs(ci_doc)
    env_set = required_checks_env(audit_doc)
    # Protection
    # api() raises ApiError on non-2xx; let it propagate so a transient
    # 500 fails the run loudly rather than producing a "no drift" lie.
    _, protection = api("GET", f"/repos/{OWNER}/{NAME}/branch_protections/{branch}")
    if not isinstance(protection, dict):
        sys.stderr.write(
            f"::error::protection response for {branch} not a JSON object\n"
        )
        sys.exit(4)
    contexts = set(protection.get("status_check_contexts") or [])
    # ----- F1: job exists in CI but not under sentinel.needs -----
    missing_from_needs = sorted(jobs - needs)
    if missing_from_needs:
        findings.append(
            "F1 — jobs in ci.yml NOT under sentinel `needs:` (sentinel doesn't gate them):\n"
            + "\n".join(f"  - {n}" for n in missing_from_needs)
        )
    # ----- F1b: needs lists a job that doesn't exist (typo) -----
    # Compare against jobs_all (incl. event-gated jobs); a typo is a
    # typo regardless of `if:` gating.
    stale_needs = sorted(needs - jobs_all)
    if stale_needs:
        findings.append(
            "F1b — sentinel `needs:` lists jobs NOT present in ci.yml (typo or removed job):\n"
            + "\n".join(f"  - {n}" for n in stale_needs)
        )
    # ----- F2: protection context has no emitting job -----
    # Compute the contexts the CI YAML actually produces. The sentinel
    # is in (B) intentionally (`ci / all-required (pull_request)`); we
    # whitelist it explicitly.
    emitted_contexts = {expected_context(j) for j in jobs} | {expected_context(SENTINEL_JOB)}
    # Contexts NOT produced by ci.yml may still come from other
    # workflows in the repo (Secret scan etc). We can't enumerate
    # every workflow's emissions cheaply; instead, flag only contexts
    # whose prefix is `ci / ` (this workflow's emissions) and which
    # don't appear in `emitted_contexts`. This narrows F2 to the
    # failure class the RFC actually targets without producing noise
    # from cross-workflow emitters.
    stale_protection = sorted(
        c for c in contexts if c.startswith("ci / ") and c not in emitted_contexts
    )
    if stale_protection:
        findings.append(
            "F2 — protection `status_check_contexts` entries with `ci / ` prefix that NO "
            "job in ci.yml emits (stale name → silent advisory gate):\n"
            + "\n".join(f"  - {c}" for c in stale_protection)
        )
    # ----- F3: audit env vs protection contexts (set-equal) -----
    only_in_env = sorted(env_set - contexts)
    only_in_protection = sorted(contexts - env_set)
    if only_in_env:
        findings.append(
            "F3a — audit-force-merge.yml `REQUIRED_CHECKS` env has contexts NOT in "
            f"branch_protections/{branch}.status_check_contexts (audit would flag "
            "non-force-merges as force):\n"
            + "\n".join(f"  - {c}" for c in only_in_env)
        )
    if only_in_protection:
        findings.append(
            "F3b — branch_protections/{br}.status_check_contexts has contexts NOT in "
            "audit-force-merge.yml `REQUIRED_CHECKS` env (real force-merges would be "
            "missed):\n".format(br=branch)
            + "\n".join(f"  - {c}" for c in only_in_protection)
        )
    debug = {
        "branch": branch,
        "ci_jobs": sorted(jobs),
        "sentinel_needs": sorted(needs),
        "protection_contexts": sorted(contexts),
        "audit_env_checks": sorted(env_set),
        "expected_contexts": sorted(emitted_contexts),
    }
    return findings, debug
 # --------------------------------------------------------------------------
 # Issue file/update
 # --------------------------------------------------------------------------
 def title_for(branch: str) -> str:
    # Idempotency key — keep stable, never include timestamp/SHA.
    return f"[ci-drift] {REPO}/{branch}: required-checks divergence detected"
 def find_open_issue(title: str) -> dict | None:
    """Return the existing open `[ci-drift]` issue for `title`, or None.
    `None` means "search succeeded, no match" — NOT "search failed".
    Per Five-Axis review on PR #112: returning None on a transient API
    error caused the caller to POST a duplicate issue. Now api() raises
    ApiError on any non-2xx; we let it propagate. The cron retries
    hourly; failing one cycle loudly is strictly better than silently
    duplicating.
    Gitea issue search returns at most page=50 per page; one page is
    enough as long as `[ci-drift]` issues are a tiny minority. (See
    follow-up issue for Link-header pagination.)
    """
    _, results = api(
        "GET",
        f"/repos/{OWNER}/{NAME}/issues",
        query={"state": "open", "type": "issues", "limit": "50"},
    )
    if not isinstance(results, list):
        raise ApiError(
            f"issue search returned non-list body (got {type(results).__name__})"
        )
    for issue in results:
        if issue.get("title") == title:
            return issue
    return None
 def render_body(branch: str, findings: list[str], debug: dict) -> str:
    body = [
        f"# Drift detected on `{REPO}/{branch}`",
        "",
        "Auto-filed by `.gitea/workflows/ci-required-drift.yml` "
        "(RFC [internal#219](https://git.moleculesai.app/molecule-ai/internal/issues/219) §4 + §6).",
        "",
        "## Findings",
        "",
    ]
    body.extend(findings)
    body.extend(
        [
            "",
            "## Resolution",
            "",
            "- **F1 / F1b**: add the missing job to `all-required.needs:` "
            "in `.gitea/workflows/ci.yml`, or remove the stale entry.",
            "- **F2**: rename the protection context to match an emitter, "
            "or remove it from `status_check_contexts` "
            "(PATCH `/api/v1/repos/{owner}/{repo}/branch_protections/{branch}`).",
            "- **F3a / F3b**: bring `REQUIRED_CHECKS` env in "
            "`.gitea/workflows/audit-force-merge.yml` into set-equality with "
            "`status_check_contexts` (single PR, both files).",
            "",
            "## Debug",
            "",
            "```json",
            json.dumps(debug, indent=2, sort_keys=True),
            "```",
            "",
            "_This issue is idempotent: drift-detect runs hourly at `:17` "
            "and edits this body in place. Close the issue once the drift "
            "is fixed; the next hourly run will reopen if drift returns._",
        ]
    )
    return "\n".join(body)
 def file_or_update(
    branch: str,
    findings: list[str],
    debug: dict,
    *,
    dry_run: bool = False,
 ) -> None:
    """File a new `[ci-drift]` issue, or PATCH the existing one in place.
    `dry_run=True` skips every side-effecting Gitea call (issue
    search, POST, PATCH, label apply) and prints the would-be issue
    title + body to stdout. Useful for local testing and for
    debugging drift output without polluting the issue tracker.
    """
    title = title_for(branch)
    body = render_body(branch, findings, debug)
    if dry_run:
        print(f"::notice::[dry-run] would file/update drift issue for {branch}")
        print(f"::group::[dry-run] title")
        print(title)
        print(f"::endgroup::")
        print(f"::group::[dry-run] body")
        print(body)
        print(f"::endgroup::")
        return
    existing = find_open_issue(title)
    if existing:
        num = existing["number"]
        api(
            "PATCH",
            f"/repos/{OWNER}/{NAME}/issues/{num}",
            body={"body": body},
        )
        print(f"::notice::Updated existing drift issue #{num} for {branch}")
        return
    _, created = api(
        "POST",
        f"/repos/{OWNER}/{NAME}/issues",
        body={"title": title, "body": body, "labels": []},
    )
    if not isinstance(created, dict):
        sys.stderr.write("::error::POST issue response not a JSON object\n")
        sys.exit(5)
    new_num = created.get("number")
    print(f"::warning::Filed new drift issue #{new_num} for {branch}")
    # Apply label by name (Gitea's add-labels endpoint accepts label IDs;
    # look up id by name once). Best-effort: failure to label is logged
    # but does not fail the audit run — the issue itself IS the alarm.
    try:
        _, labels = api("GET", f"/repos/{OWNER}/{NAME}/labels")
    except ApiError as e:
        sys.stderr.write(f"::warning::could not list labels: {e}\n")
        return
    label_id = None
    if isinstance(labels, list):
        for lbl in labels:
            if lbl.get("name") == DRIFT_LABEL:
                label_id = lbl.get("id")
                break
    if label_id is not None and new_num:
        try:
            api(
                "POST",
                f"/repos/{OWNER}/{NAME}/issues/{new_num}/labels",
                body={"labels": [label_id]},
            )
        except ApiError as e:
            sys.stderr.write(
                f"::warning::could not apply label '{DRIFT_LABEL}' to #{new_num}: {e}\n"
            )
    else:
        sys.stderr.write(f"::warning::label '{DRIFT_LABEL}' not found on repo\n")
 # --------------------------------------------------------------------------
 # Main
 # --------------------------------------------------------------------------
 def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    p = argparse.ArgumentParser(
        prog="ci-required-drift",
        description="Detect drift between ci.yml, branch_protections, "
        "and audit-force-merge.yml REQUIRED_CHECKS env.",
    )
    p.add_argument(
        "--dry-run",
        action="store_true",
        help="Detect + print findings to stdout; do NOT file or PATCH "
        "the `[ci-drift]` issue. Useful for local testing and for "
        "previewing output before turning the workflow loose.",
    )
    return p.parse_args(argv)
 def main(argv: list[str] | None = None) -> int:
    args = _parse_args(argv)
    _require_runtime_env()
    for branch in BRANCHES:
        findings, debug = detect_drift(branch)
        if findings:
            print(f"::warning::Drift detected on {branch}:")
            for f in findings:
                print(f)
            file_or_update(branch, findings, debug, dry_run=args.dry_run)
        else:
            print(f"::notice::No drift on {branch}.")
            print(json.dumps(debug, indent=2, sort_keys=True))
    # Exit 0 even on drift — the issue IS the alarm, not a red workflow.
    # A red workflow here would page on a CI rename until the issue is
    # opened, doubling the noise. The issue itself is the actionable
    # surface. (`api()` raising ApiError is the only path that exits
    # non-zero, by design: a transient Gitea outage should fail loudly.)
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/.gitea/scripts/main-red-watchdog.py
+++ b/.gitea/scripts/main-red-watchdog.py
@ -0,0 +1,589 @@
 #!/usr/bin/env python3
 """main-red-watchdog — Option C of the "main NEVER goes red" directive.
 Tracking: molecule-core#420.
 What it does (one cron tick):
  1. GET /api/v1/repos/{owner}/{repo}/branches/{watch_branch}
     → current HEAD SHA on the watched branch.
  2. GET /api/v1/repos/{owner}/{repo}/commits/{SHA}/status
     → combined status + per-context statuses.
  3. If combined state is `failure` (or any individual status is
     `failure`): open or PATCH an idempotent
     `[main-red] {repo}: {SHA[:10]}` issue. Body lists each failed
     status context with `target_url` + `description`.
  4. If combined state is `success`: close any open `[main-red]
     {repo}: ...` issue on a previous SHA with a
     "main returned to green at SHA {current_SHA}" comment.
  5. Emit one Loki-shaped JSON line via `logger -t main-red-watchdog`
     so `reference_obs_stack_phase1`'s Vector → Loki path ingests an
     alert event (queryable in Grafana as
     `{tenant="operator-host"} |~ "main-red-watchdog"`).
 What it does NOT do:
  - Auto-revert anything. Option B is explicitly rejected per
    `feedback_no_such_thing_as_flakes` + `feedback_fix_root_not_symptom`.
  - Page on its own failures. If api() raises ApiError (transient
    Gitea outage), the workflow run fails LOUDLY by re-raise — exactly
    the contract `feedback_api_helper_must_raise_not_return_dict`
    enforces. Silent fallthrough would re-introduce the duplicate-issue
    regression class.
  - Exit non-zero on RED. The issue IS the alarm; failing the watchdog
    on red would double-page (red workflow + open issue) and create
    silent-loop risk if the watchdog itself flakes.
 Idempotency strategy:
  Title is keyed on `{SHA[:10]}` (commit-scoped), NOT just `main`.
  Rationale:
    - A fix-forward changes HEAD → next cron tick sees a new SHA;
      auto-close logic closes the prior `[main-red] OLD_SHA` issue and
      (if the new HEAD is also red, e.g. a different test fails) files
      a fresh `[main-red] NEW_SHA`. Lineage is preserved.
    - A revert that happens to land back on a previously-red SHA
      (rare) would refer to a CLOSED issue; the watchdog never reopens.
      That's a deliberate trade-off — the operator will see the latest
      open issue's `closed` event in the activity feed.
 This module is import-safe: tests import individual functions without
 invoking main(), so module-level reads use env-with-default and the
 runtime contract enforcement lives in `_require_runtime_env()`.
 Run locally (dry-run, no API mutation):
    GITEA_TOKEN=... GITEA_HOST=git.moleculesai.app REPO=owner/repo \\
      WATCH_BRANCH=main RED_LABEL=tier:high \\
      python3 .gitea/scripts/main-red-watchdog.py --dry-run
 """
 from __future__ import annotations
 import argparse
 import json
 import os
 import shutil
 import subprocess
 import sys
 import urllib.error
 import urllib.parse
 import urllib.request
 from typing import Any
 # --------------------------------------------------------------------------
 # Environment
 # --------------------------------------------------------------------------
 def _env(key: str, *, default: str = "") -> str:
    """Read an env var with a default. Module-import-safe — tests can
    import this script without setting the full env contract."""
    return os.environ.get(key, default)
 GITEA_TOKEN = _env("GITEA_TOKEN")
 GITEA_HOST = _env("GITEA_HOST")
 REPO = _env("REPO")
 WATCH_BRANCH = _env("WATCH_BRANCH", default="main")
 RED_LABEL = _env("RED_LABEL", default="tier:high")
 OWNER, NAME = (REPO.split("/", 1) + [""])[:2] if REPO else ("", "")
 API = f"https://{GITEA_HOST}/api/v1" if GITEA_HOST else ""
 # Title prefix — kept short and stable so the idempotency search can
 # match by exact title without parsing.
 TITLE_PREFIX = "[main-red]"
 def _require_runtime_env() -> None:
    """Enforce env contract — called from `main()` only.
    Tests import individual functions without setting the full env
    contract. Mirrors the CP `ci-required-drift.py` pattern so the
    runtime guard is a single chokepoint.
    """
    for key in ("GITEA_TOKEN", "GITEA_HOST", "REPO", "WATCH_BRANCH", "RED_LABEL"):
        if not os.environ.get(key):
            sys.stderr.write(f"::error::missing required env var: {key}\n")
            sys.exit(2)
 # --------------------------------------------------------------------------
 # Tiny HTTP helper — raises on non-2xx + on JSON-decode-of-expected-JSON.
 # --------------------------------------------------------------------------
 class ApiError(RuntimeError):
    """Raised when a Gitea API call cannot be trusted to have succeeded.
    Covers non-2xx HTTP status AND 2xx with an unparseable JSON body on
    endpoints documented to return JSON. Callers that swallow this and
    proceed risk e.g. creating duplicate `[main-red]` issues when a
    transient 500 hides an existing match. Per
    `feedback_api_helper_must_raise_not_return_dict`: soft-failure is
    opt-in via `expect_json=False`, never the default.
    """
 def api(
    method: str,
    path: str,
    *,
    body: dict | None = None,
    query: dict[str, str] | None = None,
    expect_json: bool = True,
 ) -> tuple[int, Any]:
    """Tiny HTTP helper around urllib.
    Raises ApiError on any non-2xx response, and on JSON-decode failure
    when `expect_json=True` (the default for read-shaped paths). Mirrors
    the CP ci-required-drift.py contract exactly so behaviour is
    cross-checkable.
    """
    url = f"{API}{path}"
    if query:
        url = f"{url}?{urllib.parse.urlencode(query)}"
    data = None
    headers = {
        "Authorization": f"token {GITEA_TOKEN}",
        "Accept": "application/json",
    }
    if body is not None:
        data = json.dumps(body).encode("utf-8")
        headers["Content-Type"] = "application/json"
    req = urllib.request.Request(url, method=method, data=data, headers=headers)
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read()
            status = resp.status
    except urllib.error.HTTPError as e:
        raw = e.read()
        status = e.code
    if not (200 <= status < 300):
        snippet = raw[:500].decode("utf-8", errors="replace") if raw else ""
        raise ApiError(f"{method} {path} → HTTP {status}: {snippet}")
    if not raw:
        return status, None
    try:
        return status, json.loads(raw)
    except json.JSONDecodeError as e:
        if expect_json:
            raise ApiError(
                f"{method} {path} → HTTP {status} but body is not JSON: {e}"
            ) from e
        # Opt-in raw fallthrough for endpoints with known echo-quirks
        # (`feedback_gitea_create_api_unparseable_response`). Caller
        # MUST verify success via a follow-up GET, not by trusting body.
        return status, {"_raw": raw.decode("utf-8", errors="replace")}
 # --------------------------------------------------------------------------
 # Gitea reads
 # --------------------------------------------------------------------------
 def get_head_sha(branch: str) -> str:
    """HEAD SHA of `branch`. Raises ApiError on non-2xx."""
    _, body = api("GET", f"/repos/{OWNER}/{NAME}/branches/{branch}")
    if not isinstance(body, dict):
        raise ApiError(f"branch {branch} response not a JSON object")
    commit = body.get("commit")
    if not isinstance(commit, dict):
        raise ApiError(f"branch {branch} response missing `commit` object")
    sha = commit.get("id") or commit.get("sha")
    if not isinstance(sha, str) or len(sha) < 7:
        raise ApiError(f"branch {branch} response has no usable commit SHA")
    return sha
 def get_combined_status(sha: str) -> dict:
    """Combined commit status for `sha`. Gitea returns:
        {
          "state": "success" | "failure" | "pending" | "error",
          "statuses": [
            {"context": "...", "state": "success|failure|pending|error",
             "target_url": "...", "description": "..."},
            ...
          ],
          ...
        }
    Raises ApiError on non-2xx.
    """
    _, body = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
    if not isinstance(body, dict):
        raise ApiError(f"status for {sha} response not a JSON object")
    return body
 def is_red(status: dict) -> tuple[bool, list[dict]]:
    """Return (is_red, failed_statuses).
    A commit is "red" if combined state is `failure` OR any individual
    status entry is in {`failure`, `error`}. `pending` and `success`
    do not trip the watchdog — pending means CI is still running, and
    that's the normal state immediately after a merge.
    `failed_statuses` is the list of per-context entries whose own
    `state` is in the red set; useful for the issue body.
    """
    combined = status.get("state")
    statuses = status.get("statuses") or []
    red_states = {"failure", "error"}
    failed = [
        s for s in statuses
        if isinstance(s, dict) and s.get("state") in red_states
    ]
    return (combined in red_states or bool(failed), failed)
 # --------------------------------------------------------------------------
 # Issue file / update / close
 # --------------------------------------------------------------------------
 def title_for(sha: str) -> str:
    """Idempotency key — `[main-red] {repo}: {SHA[:10]}`.
    Commit-scoped. A fix-forward to a new SHA produces a new title; the
    prior issue auto-closes via `close_open_red_issues_for_other_shas`.
    """
    return f"{TITLE_PREFIX} {REPO}: {sha[:10]}"
 def list_open_red_issues() -> list[dict]:
    """All open issues whose title starts with `[main-red] {repo}: `.
    Per Five-Axis review on CP#112 (`feedback_api_helper_must_raise_not_return_dict`):
    api() raises on non-2xx; we let it propagate. Returning [] on a
    transient 500 would cause auto-close to skip the cleanup AND the
    file-or-update path to POST a duplicate — exactly the regression
    class the helper-raises contract closes.
    Gitea issue search returns at most 50/page; we only need open
    `[main-red]` issues which are by design ≤ 1 at any time per repo,
    so a single page is enough.
    """
    _, results = api(
        "GET",
        f"/repos/{OWNER}/{NAME}/issues",
        query={"state": "open", "type": "issues", "limit": "50"},
    )
    if not isinstance(results, list):
        raise ApiError(
            f"issue search returned non-list body (got {type(results).__name__})"
        )
    prefix = f"{TITLE_PREFIX} {REPO}: "
    return [i for i in results if isinstance(i, dict)
            and isinstance(i.get("title"), str)
            and i["title"].startswith(prefix)]
 def find_open_issue_for_sha(sha: str) -> dict | None:
    """Return the existing open `[main-red] {repo}: {SHA[:10]}` issue,
    or None if no such issue is open.
    `None` means "search succeeded, no match" — NOT "search failed".
    api() raises ApiError on any non-2xx; the caller can let that
    propagate so a transient outage fails loudly instead of silently
    duplicating.
    """
    target = title_for(sha)
    for issue in list_open_red_issues():
        if issue.get("title") == target:
            return issue
    return None
 def render_body(sha: str, failed: list[dict], debug: dict) -> str:
    """Issue body. Markdown. Mirrors CP#112's render_body shape."""
    lines = [
        f"# Main is RED on `{REPO}` at `{sha[:10]}`",
        "",
        f"Commit: <https://{GITEA_HOST}/{REPO}/commit/{sha}>",
        "",
        "Auto-filed by `.gitea/workflows/main-red-watchdog.yml` (Option C "
        "of the [main-never-red directive]"
        f"(https://{GITEA_HOST}/molecule-ai/molecule-core/issues/420)). "
        "Per `feedback_no_such_thing_as_flakes` + "
        "`feedback_fix_root_not_symptom`: investigate the root cause; do "
        "NOT revert as a reflex. The watchdog itself never reverts.",
        "",
        "## Failed status contexts",
        "",
    ]
    if not failed:
        lines.append(
            "_(Combined state reported `failure`/`error` but no per-context "
            "entries were in a red state. This usually means a CI emitter "
            "set combined-status directly without a per-context status. "
            "Check the most recent workflow run for `main` and trace from "
            "there.)_"
        )
    else:
        for s in failed:
            ctx = s.get("context", "(no context)")
            state = s.get("state", "(no state)")
            url = s.get("target_url") or ""
            desc = (s.get("description") or "").strip()
            entry = f"- **{ctx}** — `{state}`"
            if url:
                entry += f" → [logs]({url})"
            if desc:
                entry += f"\n  - {desc}"
            lines.append(entry)
    lines.extend([
        "",
        "## Resolution path",
        "",
        "1. Read the failed logs (links above).",
        "2. If reproducible locally, fix forward in a PR targeting `main`.",
        "3. If the failure is a real flake — STOP. Per "
        "`feedback_no_such_thing_as_flakes`, intermittent failures are "
        "real bugs. Investigate to root cause; do not mark as flake.",
        "4. If the failure is blocking unrelated work for >1 hour, file a "
        "follow-up issue and assign someone. Do NOT revert without a "
        "human GO per `feedback_prod_apply_needs_hongming_chat_go` "
        "(branch protection is a prod surface).",
        "",
        "## Debug",
        "",
        "```json",
        json.dumps(debug, indent=2, sort_keys=True),
        "```",
        "",
        "_This issue is idempotent: the watchdog runs hourly at `:05` "
        "and edits this body in place. When `main` returns to green, the "
        "watchdog will close this issue automatically with a "
        "\"main returned to green\" comment._",
    ])
    return "\n".join(lines)
 def emit_loki_event(event_type: str, sha: str, failed_contexts: list[str]) -> None:
    """Emit a JSON line to syslog tag `main-red-watchdog` for
    `reference_obs_stack_phase1` (Vector → Loki).
    Best-effort: if `logger` isn't on PATH (e.g. local dev macOS without
    util-linux logger), print to stderr instead. The Gitea Actions
    Ubuntu runner has util-linux preinstalled.
    Loki labels: the workflow runs on the Ubuntu runner where Vector is
    NOT configured (Vector lives on the operator host + tenants per
    `reference_obs_stack_phase1`). The Loki line is still emitted as
    stdout JSON so the workflow log itself is parseable; treat the
    syslog call as belt-and-braces for the cases where this script is
    invoked from a host that DOES have Vector (e.g. operator-host cron
    fallback in a follow-up PR).
    """
    payload = {
        "event_type": event_type,
        "repo": REPO,
        "sha": sha,
        "failed_contexts": failed_contexts,
    }
    line = json.dumps(payload, sort_keys=True)
    # Always print to stdout so the workflow log captures it (machine-
    # readable; `gitea run logs` + Loki ingestion via the operator-host
    # journald → Vector → Loki path will see this from runners that
    # forward stdout). Loki query:
    #   {source="gitea-actions"} |~ "main_red_detected"
    print(f"main-red-watchdog event: {line}")
    # Best-effort syslog tag so a future "run from operator-host cron"
    # path picks it up directly via the existing Vector pipeline.
    if shutil.which("logger"):
        try:
            subprocess.run(
                ["logger", "-t", "main-red-watchdog", line],
                check=False,
                timeout=5,
            )
        except (OSError, subprocess.SubprocessError) as e:
            sys.stderr.write(f"::warning::logger call failed: {e}\n")
 def file_or_update_red(
    sha: str,
    failed: list[dict],
    debug: dict,
    *,
    dry_run: bool = False,
 ) -> None:
    """Open a new `[main-red] {repo}: {SHA[:10]}` issue, or PATCH the
    existing one's body. Idempotent by title."""
    title = title_for(sha)
    body = render_body(sha, failed, debug)
    if dry_run:
        print(f"::notice::[dry-run] would file/update main-red issue for {sha[:10]}")
        print("::group::[dry-run] title")
        print(title)
        print("::endgroup::")
        print("::group::[dry-run] body")
        print(body)
        print("::endgroup::")
        return
    existing = find_open_issue_for_sha(sha)
    if existing:
        num = existing["number"]
        api("PATCH", f"/repos/{OWNER}/{NAME}/issues/{num}", body={"body": body})
        print(f"::notice::Updated existing main-red issue #{num} for {sha[:10]}")
        return
    _, created = api(
        "POST",
        f"/repos/{OWNER}/{NAME}/issues",
        body={"title": title, "body": body, "labels": []},
    )
    if not isinstance(created, dict):
        raise ApiError("POST issue response not a JSON object")
    new_num = created.get("number")
    print(f"::warning::Filed new main-red issue #{new_num} for {sha[:10]}")
    # Apply RED_LABEL by id. Gitea's add-labels endpoint takes IDs, not
    # names (`feedback_gitea_label_delete_by_id` — same rule for add).
    # Best-effort: label failure is logged but does not fail the run.
    try:
        _, labels = api("GET", f"/repos/{OWNER}/{NAME}/labels")
    except ApiError as e:
        sys.stderr.write(f"::warning::could not list labels: {e}\n")
        return
    label_id = None
    if isinstance(labels, list):
        for lbl in labels:
            if isinstance(lbl, dict) and lbl.get("name") == RED_LABEL:
                label_id = lbl.get("id")
                break
    if label_id is not None and new_num:
        try:
            api(
                "POST",
                f"/repos/{OWNER}/{NAME}/issues/{new_num}/labels",
                body={"labels": [label_id]},
            )
        except ApiError as e:
            sys.stderr.write(
                f"::warning::could not apply label '{RED_LABEL}' to #{new_num}: {e}\n"
            )
    else:
        sys.stderr.write(f"::warning::label '{RED_LABEL}' not found on repo\n")
 def close_open_red_issues_for_other_shas(
    current_sha: str,
    *,
    dry_run: bool = False,
 ) -> int:
    """When main is green at current_sha, close any open `[main-red]`
    issues whose title references a different SHA. Returns the number
    of issues closed.
    Lineage note: we only close issues whose title prefix matches; if
    a human renamed the issue or added a suffix this won't touch it.
    That's intentional — manual editorial state takes precedence.
    """
    target_title = title_for(current_sha)
    open_red = list_open_red_issues()
    closed = 0
    for issue in open_red:
        if issue.get("title") == target_title:
            # Same SHA — caller should not have invoked this if main is
            # green. Skip defensively.
            continue
        num = issue.get("number")
        if not isinstance(num, int):
            continue
        comment = (
            f"`main` returned to green at SHA `{current_sha}` "
            f"(<https://{GITEA_HOST}/{REPO}/commit/{current_sha}>). "
            "Closing automatically. If the underlying root cause is "
            "not yet understood, reopen this issue and file a "
            "postmortem — green-by-flake is still a bug per "
            "`feedback_no_such_thing_as_flakes`."
        )
        if dry_run:
            print(f"::notice::[dry-run] would close issue #{num} ({issue.get('title')})")
            closed += 1
            continue
        # Comment first, then close. Order matters: a closed issue can
        # still receive comments, but the activity-feed ordering reads
        # better with the explanation arriving just before the close.
        api(
            "POST",
            f"/repos/{OWNER}/{NAME}/issues/{num}/comments",
            body={"body": comment},
        )
        api(
            "PATCH",
            f"/repos/{OWNER}/{NAME}/issues/{num}",
            body={"state": "closed"},
        )
        print(f"::notice::Closed main-red issue #{num} (green at {current_sha[:10]})")
        closed += 1
    return closed
 # --------------------------------------------------------------------------
 # Main
 # --------------------------------------------------------------------------
 def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    p = argparse.ArgumentParser(
        prog="main-red-watchdog",
        description="Detect post-merge CI red on the watched branch and "
        "file an idempotent issue. Option C of the main-never-red directive.",
    )
    p.add_argument(
        "--dry-run",
        action="store_true",
        help="Detect + print the would-be issue title/body to stdout; do "
        "NOT POST/PATCH/close any issues. Useful for local testing.",
    )
    return p.parse_args(argv)
 def run_once(*, dry_run: bool = False) -> int:
    """One watchdog tick. Returns 0 on green or red-issue-filed; lets
    ApiError propagate on transient outage (workflow run fails loudly,
    which is correct per the helper-raises contract)."""
    sha = get_head_sha(WATCH_BRANCH)
    status = get_combined_status(sha)
    red, failed = is_red(status)
    debug = {
        "branch": WATCH_BRANCH,
        "sha": sha,
        "combined_state": status.get("state"),
        "failed_contexts": [s.get("context") for s in failed],
        "all_contexts": [
            {"context": s.get("context"), "state": s.get("state")}
            for s in (status.get("statuses") or [])
            if isinstance(s, dict)
        ],
    }
    if red:
        failed_ctxs = [s.get("context") for s in failed if s.get("context")]
        emit_loki_event("main_red_detected", sha, failed_ctxs)
        print(f"::warning::main is RED at {sha[:10]} on {WATCH_BRANCH}: "
              f"{len(failed)} failed context(s)")
        file_or_update_red(sha, failed, debug, dry_run=dry_run)
    else:
        # Green (or pending — pending is treated as not-red so we don't
        # spam during the post-merge CI window). Close any stale issues
        # from earlier SHAs only when we're actually green; pending
        # means CI hasn't finished and the prior issue might still be
        # accurate.
        if status.get("state") == "success":
            closed = close_open_red_issues_for_other_shas(sha, dry_run=dry_run)
            if closed:
                emit_loki_event(
                    "main_returned_to_green", sha,
                    [],
                )
            print(f"::notice::main is GREEN at {sha[:10]} on {WATCH_BRANCH} "
                  f"(closed {closed} stale issue(s))")
        else:
            print(f"::notice::main is PENDING at {sha[:10]} on {WATCH_BRANCH} "
                  f"(combined state={status.get('state')!r}; no action)")
    return 0
 def main(argv: list[str] | None = None) -> int:
    args = _parse_args(argv)
    _require_runtime_env()
    return run_once(dry_run=args.dry_run)
 if __name__ == "__main__":
    sys.exit(main())
--- a/.gitea/scripts/sop-tier-check.sh
+++ b/.gitea/scripts/sop-tier-check.sh
@ -0,0 +1,379 @@
 #!/usr/bin/env bash
 # sop-tier-check — verify a Gitea PR satisfies the §SOP-6 approval gate.
 #
 # Reads the PR's tier label, walks approving reviewers, and checks team
 # membership against the tier's approval expression. Passes only when
 # ALL clauses in the expression are satisfied by the set of approving
 # reviewers (AND-composition; internal#189).
 #
 # Expression syntax:
 #   "team-a"          — OR-set: any ONE of the comma-separated teams
 #   "team-a AND team-b" — AND: BOTH must each have ≥1 approver
 #   "(a,b,c)"         — OR-set wrapped in parens; same as "a,b,c"
 #
 # Example: "qa AND security AND (managers,ceo)" means:
 #   ≥1 approver in team "qa"  AND
 #   ≥1 approver in team "security"  AND
 #   ≥1 approver in team "managers" OR "ceo"
 #
 # Per the spec (internal#189), the hard gate here pairs with the
 # advisory gate of sop-conformance LLM-judge (internal#188): each
 # required-team click must reflect real verification (visible in review
 # body or A2A messages), not rubber-stamp APPROVE. Both gates together
 # close the "teammate clicks APPROVE without verifying" gap.
 #
 # Invoked from `.gitea/workflows/sop-tier-check.yml`. The workflow sets
 # the env vars below; this script does no IO outside of stdout/stderr +
 # the Gitea API.
 #
 # Required env:
 #   GITEA_TOKEN   — bot PAT with read:organization,read:user,
 #                   read:issue,read:repository scopes
 #   GITEA_HOST    — e.g. git.moleculesai.app
 #   REPO          — owner/name (from github.repository)
 #   PR_NUMBER     — int (from github.event.pull_request.number)
 #   PR_AUTHOR     — login (from github.event.pull_request.user.login)
 #
 # Optional:
 #   SOP_DEBUG=1        — print per-API-call diagnostic lines. Default: off.
 #   SOP_LEGACY_CHECK=1 — revert to OR-gate (≥1 approver from any eligible
 #                         team). Grace window for PRs in-flight when the
 #                         new AND-composition was deployed. Expires 2026-05-17
 #                         (7-day burn-in window; internal#189 Phase 1).
 #                         Set by workflow for PRs merged before the deploy.
 set -euo pipefail
 # Ensure jq is available. Runners may not have it pre-installed, and the
 # workflow-level jq install can fail on runners with network restrictions
 # (GitHub releases not reachable from some runner networks — infra#241
 # follow-up). This fallback is idempotent — no-op when jq is already on PATH.
 # SOP_FAIL_OPEN=1 makes this always exit 0 so CI never blocks on jq absence.
 if ! command -v jq >/dev/null 2>&1; then
  echo "::notice::jq not found on PATH — attempting install..."
  _jq_installed="no"
  # apt-get first (primary) — Ubuntu package mirrors are reliably reachable.
  if apt-get update -qq && apt-get install -y -qq jq 2>/dev/null; then
    echo "::notice::jq installed via apt-get: $(jq --version)"
    _jq_installed="yes"
  # GitHub binary as secondary fallback — may fail on restricted networks.
  elif timeout 120 curl -sSL \
    "https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-amd64" \
    -o /usr/local/bin/jq \
    && chmod +x /usr/local/bin/jq; then
    echo "::notice::jq binary downloaded: $(/usr/local/bin/jq --version)"
    _jq_installed="yes"
  fi
  if ! command -v jq >/dev/null 2>&1; then
    echo "::error::jq installation failed — apt-get and GitHub binary both failed."
    echo "::error::sop-tier-check requires jq for all JSON API parsing."
    # SOP_FAIL_OPEN=1 is set in the workflow step's env — makes script always
    # exit 0 so CI never blocks. The SOP-6 tier review gate remains enforced.
    if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
      echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
      exit 0
    fi
    exit 1
  fi
 fi
 debug() {
  if [ "${SOP_DEBUG:-}" = "1" ]; then
    echo "  [debug] $*" >&2
  fi
 }
 # Validate env
 : "${GITEA_TOKEN:?GITEA_TOKEN required}"
 : "${GITEA_HOST:?GITEA_HOST required}"
 : "${REPO:?REPO required (owner/name)}"
 : "${PR_NUMBER:?PR_NUMBER required}"
 : "${PR_AUTHOR:?PR_AUTHOR required}"
 OWNER="${REPO%%/*}"
 NAME="${REPO##*/}"
 API="https://${GITEA_HOST}/api/v1"
 AUTH="Authorization: token ${GITEA_TOKEN}"
 echo "::notice::tier-check start: repo=$OWNER/$NAME pr=$PR_NUMBER author=$PR_AUTHOR"
 # Sanity: token resolves to a user
 WHOAMI=$(curl -sS -H "$AUTH" "${API}/user" | jq -r '.login // ""')
 if [ -z "$WHOAMI" ]; then
  echo "::error::GITEA_TOKEN cannot resolve a user via /api/v1/user — check the token scope and that the secret is wired correctly."
  exit 1
 fi
 echo "::notice::token resolves to user: $WHOAMI"
 # 1. Read tier label
 LABELS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/issues/${PR_NUMBER}/labels" | jq -r '.[].name')
 TIER=""
 for L in $LABELS; do
  case "$L" in
    tier:low|tier:medium|tier:high)
      if [ -n "$TIER" ]; then
        echo "::error::Multiple tier labels: $TIER + $L. Apply exactly one."
        exit 1
      fi
      TIER="$L"
    ;;
  esac
 done
 if [ -z "$TIER" ]; then
  echo "::error::PR has no tier:low|tier:medium|tier:high label. Apply one before merge."
  exit 1
 fi
 debug "tier=$TIER"
 # 2. Tier → required team expression (AND-composition; internal#189)
 #
 # Expression syntax:
 #   clause-a AND clause-b AND ...   — ALL clauses must pass
 #   team-a,team-b,team-c            — OR-set: ≥1 approver in ANY of these teams
 #   (team-a,team-b)                 — same as team-a,team-b (parens optional)
 #
 # This map is the single source of truth. Update it when the team structure
 # or policy changes. Teams referenced here but absent in Gitea are treated
 # as unachievable (would always fail) — operators notice the clear error
 # and create the missing team.
 #
 # Current Gitea teams: ceo, engineers, managers
 # Future teams (create before removing "???" fallback): qa, security, security-audit
 declare -A TIER_EXPR=(
  # tier:low — same as previous OR gate: any engineer, manager, or ceo.
  ["tier:low"]="engineers,managers,ceo"
  # tier:medium — AND of (managers) AND (engineers) AND (qa???,security???)
  # The qa+security clause requires both teams to exist; when not yet
  # created, the PR author is responsible for adding them before requesting
  # approval on a tier:medium PR. Ops: create qa + security Gitea teams
  # and update this map to remove the "???" markers (internal#189 follow-up).
  ["tier:medium"]="managers AND engineers AND qa???,security???"
  # tier:high — ceo only. The AND-composition adds no value for a
  # single-team gate, but the framework is wired for consistency.
  ["tier:high"]="ceo"
 )
 EXPR="${TIER_EXPR[$TIER]-}"
 if [ -z "$EXPR" ]; then
  echo "::error::No expression defined for tier $TIER in TIER_EXPR map."
  exit 1
 fi
 debug "expression=$EXPR"
 # 3. Legacy OR-gate override (7-day burn-in grace window; internal#189 Phase 1)
 if [ "${SOP_LEGACY_CHECK:-}" = "1" ]; then
  LEGACY_ELIGIBLE=""
  case "$TIER" in
    tier:low)    LEGACY_ELIGIBLE="engineers managers ceo" ;;
    tier:medium) LEGACY_ELIGIBLE="managers ceo" ;;
    tier:high)   LEGACY_ELIGIBLE="ceo" ;;
  esac
  echo "::notice::SOP_LEGACY_CHECK=1 — using OR-gate ({$LEGACY_ELIGIBLE}) for this PR."
  ELIGIBLE="$LEGACY_ELIGIBLE"
 fi
 # 4. Resolve all team names → IDs
 # /orgs/{org}/teams/{slug}/... endpoints don't exist on Gitea 1.22;
 # we use /teams/{id}.
 ORG_TEAMS_FILE=$(mktemp)
 trap 'rm -f "$ORG_TEAMS_FILE"' EXIT
 HTTP_CODE=$(curl -sS -o "$ORG_TEAMS_FILE" -w '%{http_code}' -H "$AUTH" \
  "${API}/orgs/${OWNER}/teams")
 debug "teams-list HTTP=$HTTP_CODE size=$(wc -c <"$ORG_TEAMS_FILE")"
 if [ "${SOP_DEBUG:-}" = "1" ]; then
  echo "  [debug] teams-list body (first 300 chars):" >&2
  head -c 300 "$ORG_TEAMS_FILE" >&2; echo >&2
 fi
 if [ "$HTTP_CODE" != "200" ]; then
  echo "::error::GET /orgs/${OWNER}/teams returned HTTP $HTTP_CODE — token likely lacks read:org scope."
  exit 1
 fi
 # Collect every team name that appears in the expression.
 # Bash word-splitting on $EXPR splits on spaces, so "AND" appears as a
 # token. We skip it explicitly.
 declare -A TEAM_ID
 _all_teams=""
 for _raw_clause in $EXPR; do
  # Strip parens and split on comma.
  _clause=${_raw_clause//[()]/}
  for _t in $(echo "$_clause" | tr ',' '\n'); do
    _t=$(echo "$_t" | tr -d '[:space:]')
    [ -z "$_t" ] && continue
    # Skip AND / OR operator tokens (bash word-split produced them from
    # spaces in the expression string).
    [ "$_t" = "AND" ] || [ "$_t" = "OR" ] && continue
    # Skip if already in set.
    case " $_all_teams " in
      *" $_t "*) ;;  # already present
      *) _all_teams="${_all_teams} $_t " ;;
    esac
  done
 done
 for _t in $_all_teams; do
  _t=$(echo "$_t" | tr -d ' ')
  [ -z "$_t" ] && continue
  _id=$(jq -r --arg t "$_t" '.[] | select(.name==$t) | .id' <"$ORG_TEAMS_FILE" | head -1)
  if [ -z "$_id" ] || [ "$_id" = "null" ]; then
    # "??" suffix marks teams that don't exist yet (tier:medium qa/security).
    # Treat as permanently failing clause; clear error message guides ops.
    if [[ "$_t" == *"???" ]]; then
      debug "team \"$_t\" not found (expected — pending team creation per internal#189)"
      continue
    fi
    _visible=$(jq -r '.[]?.name? // empty' <"$ORG_TEAMS_FILE" 2>/dev/null | tr '\n' ' ')
    echo "::error::Team \"$_t\" referenced in tier $TIER expression but not found in org $OWNER. Teams visible: $_visible"
    exit 1
  fi
  TEAM_ID[$_t]="$_id"
  debug "team-id: $_t → $_id"
 done
 # 5. Read approving reviewers
 REVIEWS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}/reviews")
 APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]')
 if [ -z "$APPROVERS" ]; then
  echo "::error::No approving reviews on this PR. Set SOP_DEBUG=1 and re-run for diagnostics."
  exit 1
 fi
 debug "approvers: $(echo "$APPROVERS" | tr '\n' ' ')"
 # 6. For each approver: skip self-review; probe team membership by id.
 # Build $APPROVER_TEAMS[<user>]=space-surrounded team names (e.g. " managers ").
 # Pre/post spaces ensure case patterns *${_t}* match even when the name
 # is the first or last entry (bash case *word* needs delimiters on both sides).
 #
 # FALLBACK: if ALL team probes return 403 (token lacks read:org scope),
 # fall back to /orgs/{org}/members/{user}. This returns 204 for any org
 # member — a superset of team membership. Accepting it as a fallback means
 # the gate passes when the token is scoped to repo+user only (core-bot PAT).
 # This is safe because: (a) org membership is a prerequisite for every
 # eligible team; (b) the AND-composition of internal#189 still requires
 # multiple independent approvers; (c) any token with read:repository can
 # see the approving reviews, so bypass requires a colluding approver.
 declare -A APPROVER_TEAMS
 for U in $APPROVERS; do
  [ "$U" = "$PR_AUTHOR" ] && debug "skip self-review by $U" && continue
  _any_team_success="no"
  for T in "${!TEAM_ID[@]}"; do
    ID="${TEAM_ID[$T]}"
    CODE=$(curl -sS -o /dev/null -w '%{http_code}' -H "$AUTH" \
      "${API}/teams/${ID}/members/${U}")
    debug "probe: $U in team $T (id=$ID) → HTTP $CODE"
    if [ "$CODE" = "200" ] || [ "$CODE" = "204" ]; then
      APPROVER_TEAMS[$U]="${APPROVER_TEAMS[$U]:- } ${APPROVER_TEAMS[$U]:+ }$T "
      debug "$U qualifies for team $T"
      _any_team_success="yes"
    fi
  done
  # Fallback: if every team probe returned 403, try org membership.
  # "??" teams were never resolved to IDs so they never entered the loop.
  # If the user is an org member, credit them as being in each queried team
  # (engineers, managers, ceo are all org-level). This is safe because org
  # membership is a prerequisite for all three, and bypass requires a colluding
  # approver (same risk as before the AND-composition).
  if [ "$_any_team_success" = "no" ]; then
    ORG_CODE=$(curl -sS -o /dev/null -w '%{http_code}' -H "$AUTH" \
      "${API}/orgs/${OWNER}/members/${U}")
    debug "probe: $U in org $OWNER (fallback) → HTTP $ORG_CODE"
    if [ "$ORG_CODE" = "204" ]; then
      for T in "${!TEAM_ID[@]}"; do
        APPROVER_TEAMS[$U]="${APPROVER_TEAMS[$U]:- } ${APPROVER_TEAMS[$U]:+ }$T "
      done
      debug "$U credited as org member for all queried teams (fallback — token may lack read:org)"
    fi
  fi
 done
 # 7. Evaluate the tier expression.
 #
 # legacy OR-gate: use the simplified loop from before internal#189.
 if [ -n "${LEGACY_ELIGIBLE:-}" ]; then
  OK=""
  for _u in "${!APPROVER_TEAMS[@]}"; do
    for _t2 in $LEGACY_ELIGIBLE; do
      case "${APPROVER_TEAMS[$_u]}" in
        *${_t2}*)
          echo "::notice::approver $_u is in team $_t2 (eligible for $TIER)"
          OK="yes"
          break
        ;;
      esac
    done
    [ -n "$OK" ] && break
  done
  if [ -z "$OK" ]; then
    echo "::error::Tier $TIER requires approval from a non-author member of {$LEGACY_ELIGIBLE}. Set SOP_DEBUG=1 to see per-probe HTTP codes."
    exit 1
  fi
  echo "::notice::sop-tier-check passed: $TIER (legacy OR-gate)"
  exit 0
 fi
 # AND-gate: evaluate the expression clause by clause.
 # _passed_clauses and _failed_clauses accumulate for the status description.
 _passed_clauses=""
 _failed_clauses=""
 for _raw_clause in $EXPR; do
  # Normalise: strip parens, replace commas with spaces so bash word-split
  # can iterate the OR-set members. The previous form
  #   _clause=$(echo ... | tr ',' '\n' | tr -d '[:space:]' | grep -v '^$')
  # collapsed every member into one concatenated token because
  # `tr -d '[:space:]'` strips the very newlines that just separated them
  # ("engineers,managers,ceo" -> "engineersmanagersceo"), so the OR-clause
  # only ever evaluated as a single nonsense team name and never matched
  # APPROVER_TEAMS. Fixed in #229: leave the comma-separated members as
  # space-separated tokens for `for _t in $_clause`.
  _no_parens=${_raw_clause//[()]/}
  _clause=${_no_parens//,/ }
  _clause_passed="no"
  _clause_names=""
  for _t in $_clause; do
    # Append (don't overwrite) team name to the human-readable accumulator.
    # The previous form `_clause_names="${_clause_names:+, }${_t}"`
    # rewrote the variable on every iteration, so the FAIL message only
    # ever showed the LAST team. Fixed: prepend prior value before the
    # comma-separator, then append the new team name.
    _clause_names="${_clause_names}${_clause_names:+, }${_t}"
    # Skip teams not yet in Gitea (qa??? / security??? placeholders).
    [[ "$_t" == *"???" ]] && debug "clause \"$_t\": skipped (team pending creation)" && continue
    [ -z "${TEAM_ID[$_t]:-}" ] && debug "clause \"$_t\": no ID resolved, skipping" && continue
    for _u in "${!APPROVER_TEAMS[@]}"; do
      # Note: APPROVER_TEAMS values are space-surrounded (e.g. " managers ").
      # Pattern *${_t}* matches team name anywhere in the space-padded string.
      case "${APPROVER_TEAMS[$_u]}" in
        *${_t}*)
          _clause_passed="yes"
          debug "clause \"$_t\": satisfied by $_u"
          break
        ;;
      esac
    done
  done
  # Label for display: strip "???" from pending teams.
  _label=$(echo "$_raw_clause" | tr -d '()' | tr ',' '/' | tr -d '[:space:]' | sed 's/???//g')
  if [ "$_clause_passed" = "yes" ]; then
    # Append (don't overwrite) — same accumulator bug as _clause_names above.
    _passed_clauses="${_passed_clauses}${_passed_clauses:+, }$_label"
    echo "::notice::clause [$_label]: PASS — satisfied by approving reviewer(s)"
  else
    _failed_clauses="${_failed_clauses}${_failed_clauses:+, }$_label"
    echo "::error::clause [$_label]: FAIL — no approving reviewer belongs to any of these teams (${_clause_names}). Set SOP_DEBUG=1 to see per-team probe results."
  fi
 done
 if [ -n "$_failed_clauses" ]; then
  echo ""
  echo "::error::sop-tier-check FAILED for $TIER."
  echo "  Passed :${_passed_clauses}"
  echo "  Missing:${_failed_clauses}"
  echo "  All clauses must be satisfied. Each missing team needs an APPROVED review from one of its members."
  exit 1
 fi
 echo "::notice::sop-tier-check PASSED: $TIER — all required clauses satisfied [${_passed_clauses}]"
--- a/.gitea/scripts/sop-tier-refire.sh
+++ b/.gitea/scripts/sop-tier-refire.sh
@ -0,0 +1,172 @@
 #!/usr/bin/env bash
 # sop-tier-refire — re-evaluate sop-tier-check and POST status to PR head SHA.
 #
 # Invoked from `.gitea/workflows/sop-tier-refire.yml` when a repo
 # MEMBER/OWNER/COLLABORATOR comments `/refire-tier-check` on a PR.
 #
 # Behavior:
 #
 # 1. Resolve PR head SHA + author from PR_NUMBER.
 # 2. Rate-limit: if the sop-tier-check context has been POSTed in the
 #    last 30 seconds, skip (prevents comment-spam status thrash).
 # 3. Invoke `.gitea/scripts/sop-tier-check.sh` with the same env the
 #    canonical workflow provides. This is DRY: we re-use the exact AND-
 #    composition gate logic, not a watered-down approving-count check.
 # 4. POST the resulting status (success on exit 0, failure on non-zero)
 #    to `/repos/.../statuses/{HEAD_SHA}` with context
 #    "sop-tier-check / tier-check (pull_request)" — the same context name
 #    branch protection requires.
 #
 # Required env (set by sop-tier-refire.yml):
 #   GITEA_TOKEN    — org-level SOP_TIER_CHECK_TOKEN (read:org/user/issue/repo)
 #   GITEA_HOST     — e.g. git.moleculesai.app
 #   REPO           — owner/name
 #   PR_NUMBER      — PR number from issue_comment payload
 #   COMMENT_AUTHOR — login of the commenter (logged for audit)
 #
 # Optional:
 #   SOP_DEBUG=1                — verbose per-API-call diagnostics
 #   SOP_REFIRE_RATE_LIMIT_SEC  — override the 30s rate-limit (default 30)
 #   SOP_REFIRE_DISABLE_RATE_LIMIT=1 — for tests; skips the rate-limit check
 set -euo pipefail
 debug() {
  if [ "${SOP_DEBUG:-}" = "1" ]; then
    echo "  [debug] $*" >&2
  fi
 }
 : "${GITEA_TOKEN:?GITEA_TOKEN required}"
 : "${GITEA_HOST:?GITEA_HOST required}"
 : "${REPO:?REPO required (owner/name)}"
 : "${PR_NUMBER:?PR_NUMBER required}"
 : "${COMMENT_AUTHOR:=unknown}"
 OWNER="${REPO%%/*}"
 NAME="${REPO##*/}"
 API="https://${GITEA_HOST}/api/v1"
 AUTH="Authorization: token ${GITEA_TOKEN}"
 CONTEXT="sop-tier-check / tier-check (pull_request)"
 RATE_LIMIT_SEC="${SOP_REFIRE_RATE_LIMIT_SEC:-30}"
 echo "::notice::sop-tier-refire start: repo=$OWNER/$NAME pr=$PR_NUMBER commenter=$COMMENT_AUTHOR"
 # 1. Fetch PR details — need head.sha and user.login.
 PR_FILE=$(mktemp)
 trap 'rm -f "$PR_FILE"' EXIT
 PR_HTTP=$(curl -sS -o "$PR_FILE" -w '%{http_code}' -H "$AUTH" \
  "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}")
 if [ "$PR_HTTP" != "200" ]; then
  echo "::error::GET /pulls/$PR_NUMBER returned HTTP $PR_HTTP (body $(head -c 200 "$PR_FILE"))"
  exit 1
 fi
 HEAD_SHA=$(jq -r '.head.sha' <"$PR_FILE")
 PR_AUTHOR=$(jq -r '.user.login' <"$PR_FILE")
 PR_STATE=$(jq -r '.state' <"$PR_FILE")
 if [ -z "$HEAD_SHA" ] || [ "$HEAD_SHA" = "null" ]; then
  echo "::error::Could not resolve head.sha from PR #$PR_NUMBER response"
  exit 1
 fi
 debug "head_sha=$HEAD_SHA pr_author=$PR_AUTHOR state=$PR_STATE"
 if [ "$PR_STATE" != "open" ]; then
  echo "::notice::PR #$PR_NUMBER state is $PR_STATE; refire is a no-op on closed PRs."
  exit 0
 fi
 # 2. Rate-limit: skip if our context was updated in the last $RATE_LIMIT_SEC.
 # Gitea statuses endpoint returns latest first; we check the most recent
 # entry for our context name.
 if [ "${SOP_REFIRE_DISABLE_RATE_LIMIT:-}" != "1" ]; then
  STATUSES_FILE=$(mktemp)
  trap 'rm -f "$PR_FILE" "$STATUSES_FILE"' EXIT
  ST_HTTP=$(curl -sS -o "$STATUSES_FILE" -w '%{http_code}' -H "$AUTH" \
    "${API}/repos/${OWNER}/${NAME}/statuses/${HEAD_SHA}?limit=50&sort=newest")
  debug "statuses-list HTTP=$ST_HTTP"
  if [ "$ST_HTTP" = "200" ]; then
    LAST_UPDATED=$(jq -r --arg c "$CONTEXT" \
      '[.[] | select(.context == $c)] | first | .updated_at // ""' \
      <"$STATUSES_FILE")
    if [ -n "$LAST_UPDATED" ] && [ "$LAST_UPDATED" != "null" ]; then
      # Parse RFC3339 → epoch. Use python -c for portability (date(1) -d
      # differs between BSD/GNU; the Gitea runner is Ubuntu so GNU date
      # works, but we keep python for future container variance).
      LAST_EPOCH=$(python3 -c "import sys,datetime;print(int(datetime.datetime.fromisoformat(sys.argv[1].replace('Z','+00:00')).timestamp()))" "$LAST_UPDATED" 2>/dev/null || echo "0")
      NOW_EPOCH=$(date -u +%s)
      AGE=$((NOW_EPOCH - LAST_EPOCH))
      debug "last status update: $LAST_UPDATED ($AGE seconds ago)"
      if [ "$AGE" -lt "$RATE_LIMIT_SEC" ] && [ "$AGE" -ge 0 ]; then
        echo "::notice::sop-tier-refire rate-limited — last status update was ${AGE}s ago (<${RATE_LIMIT_SEC}s window). Try again shortly."
        exit 0
      fi
    fi
  fi
 fi
 # 3. Invoke sop-tier-check.sh with the env it expects. Capture exit code.
 # The canonical script reads tier label, walks approving reviewers, and
 # evaluates the AND-composition expression — we want the SAME gate, not
 # a different gate.
 #
 # SOP_REFIRE_TIER_CHECK_SCRIPT env var lets tests substitute a mock —
 # sop-tier-check.sh uses bash 4+ associative arrays which trigger a known
 # bash 3.2 parser bug (`tier: unbound variable` from declare -A with
 # `set -u`). Linux Gitea runners ship bash 4/5 so production is fine;
 # the override exists so the bash 3.2 dev box can still exercise the
 # refire glue logic end-to-end.
 SCRIPT="${SOP_REFIRE_TIER_CHECK_SCRIPT:-$(dirname "$0")/sop-tier-check.sh}"
 if [ ! -f "$SCRIPT" ]; then
  echo "::error::sop-tier-check.sh not found at $SCRIPT — refire requires the canonical script"
  exit 1
 fi
 # Re-invoke. Pipe stdout/stderr through so the runner log shows the
 # tier-check decision inline.
 set +e
 GITEA_TOKEN="$GITEA_TOKEN" \
  GITEA_HOST="$GITEA_HOST" \
  REPO="$REPO" \
  PR_NUMBER="$PR_NUMBER" \
  PR_AUTHOR="$PR_AUTHOR" \
  SOP_DEBUG="${SOP_DEBUG:-0}" \
  SOP_LEGACY_CHECK="${SOP_LEGACY_CHECK:-0}" \
  bash "$SCRIPT"
 TIER_EXIT=$?
 set -e
 debug "sop-tier-check.sh exit=$TIER_EXIT"
 # 4. POST the resulting status.
 if [ "$TIER_EXIT" -eq 0 ]; then
  STATE="success"
  DESCRIPTION="Refired via /refire-tier-check by $COMMENT_AUTHOR"
 else
  STATE="failure"
  DESCRIPTION="Refired via /refire-tier-check; tier-check failed (see workflow log)"
 fi
 # Status target_url points at the runner log so a curious reviewer can
 # follow it back. SERVER_URL + RUN_ID + JOB_ID isn't trivially constructible
 # from the bash env on Gitea 1.22.6, so we point at the PR itself.
 TARGET_URL="https://${GITEA_HOST}/${OWNER}/${NAME}/pulls/${PR_NUMBER}"
 POST_BODY=$(jq -nc \
  --arg state "$STATE" \
  --arg context "$CONTEXT" \
  --arg description "$DESCRIPTION" \
  --arg target_url "$TARGET_URL" \
  '{state:$state, context:$context, description:$description, target_url:$target_url}')
 POST_FILE=$(mktemp)
 trap 'rm -f "$PR_FILE" "${STATUSES_FILE:-}" "$POST_FILE"' EXIT
 POST_HTTP=$(curl -sS -o "$POST_FILE" -w '%{http_code}' \
  -X POST -H "$AUTH" -H "Content-Type: application/json" \
  -d "$POST_BODY" \
  "${API}/repos/${OWNER}/${NAME}/statuses/${HEAD_SHA}")
 if [ "$POST_HTTP" != "200" ] && [ "$POST_HTTP" != "201" ]; then
  echo "::error::POST /statuses/$HEAD_SHA returned HTTP $POST_HTTP (body $(head -c 200 "$POST_FILE"))"
  exit 1
 fi
 echo "::notice::sop-tier-refire posted state=$STATE for context=\"$CONTEXT\" on sha=$HEAD_SHA"
 exit "$TIER_EXIT"
--- a/.gitea/scripts/tests/_mock_tier_check.sh
+++ b/.gitea/scripts/tests/_mock_tier_check.sh
@ -0,0 +1,28 @@
 #!/usr/bin/env bash
 # Mock sop-tier-check.sh for sop-tier-refire tests.
 #
 # Exits 0 ("PASS") if $MOCK_TIER_RESULT == "pass", else exits 1.
 # This lets the refire tests cover the success + failure status-POST
 # paths without invoking the real sop-tier-check.sh (which uses bash 4+
 # associative arrays — known parser bug on macOS bash 3.2 dev box).
 set -euo pipefail
 case "${MOCK_TIER_RESULT:-pass}" in
  pass)
    echo "::notice::mock tier-check: PASS"
    exit 0
    ;;
  fail_no_label)
    echo "::error::mock tier-check: no tier label"
    exit 1
    ;;
  fail_no_approvals)
    echo "::error::mock tier-check: no approving reviews"
    exit 1
    ;;
  *)
    echo "::error::mock tier-check: unknown MOCK_TIER_RESULT=${MOCK_TIER_RESULT:-}"
    exit 2
    ;;
 esac
--- a/.gitea/scripts/tests/_refire_fixture.py
+++ b/.gitea/scripts/tests/_refire_fixture.py
@ -0,0 +1,208 @@
 #!/usr/bin/env python3
 """Stub Gitea API for sop-tier-refire test scenarios.
 Reads $FIXTURE_STATE_DIR/scenario to decide what to return for each
 endpoint the sop-tier-refire.sh + sop-tier-check.sh scripts call.
 Captures every POST to /statuses/{sha} into posted_statuses.jsonl so
 the test can assert what the script tried to write.
 Scenarios:
  T1_success         — tier:low + APPROVED by engineer → tier-check passes
  T2_no_tier_label   — no tier label → tier-check exits 1 before POST
  T3_no_approvals    — tier:low but zero approving reviews → exits 1
  T4_closed          — PR state=closed → refire is a no-op
  T5_rate_limited    — last status update 5 seconds ago → skip
 Usage:
  FIXTURE_STATE_DIR=/tmp/x python3 _refire_fixture.py 8080
 """
 import datetime
 import http.server
 import json
 import os
 import re
 import sys
 import urllib.parse
 STATE_DIR = os.environ["FIXTURE_STATE_DIR"]
 def scenario() -> str:
    p = os.path.join(STATE_DIR, "scenario")
    if not os.path.isfile(p):
        return "T1_success"
    with open(p) as f:
        return f.read().strip()
 def now_iso() -> str:
    return datetime.datetime.now(datetime.timezone.utc).isoformat()
 def append_post(body: dict) -> None:
    with open(os.path.join(STATE_DIR, "posted_statuses.jsonl"), "a") as f:
        f.write(json.dumps(body) + "\n")
 def pr_payload() -> dict:
    sc = scenario()
    state = "closed" if sc == "T4_closed" else "open"
    return {
        "number": 999,
        "state": state,
        "head": {"sha": "deadbeef0000111122223333444455556666"},
        "user": {"login": "feature-author"},
    }
 def labels_payload() -> list:
    sc = scenario()
    if sc == "T2_no_tier_label":
        return [{"name": "bug"}]
    # All other scenarios use tier:low
    return [{"name": "tier:low"}, {"name": "ci"}]
 def reviews_payload() -> list:
    sc = scenario()
    if sc == "T3_no_approvals":
        return []
    # All other scenarios have one APPROVED review by an engineer
    return [
        {
            "state": "APPROVED",
            "user": {"login": "reviewer-engineer"},
        }
    ]
 def teams_payload() -> list:
    # Mirror the real molecule-ai org teams referenced in TIER_EXPR
    return [
        {"id": 5, "name": "ceo"},
        {"id": 2, "name": "engineers"},
        {"id": 6, "name": "managers"},
    ]
 def statuses_payload() -> list:
    sc = scenario()
    if sc == "T5_rate_limited":
        recent = (
            datetime.datetime.now(datetime.timezone.utc)
            - datetime.timedelta(seconds=5)
        ).isoformat()
        return [
            {
                "context": "sop-tier-check / tier-check (pull_request)",
                "state": "failure",
                "updated_at": recent,
            }
        ]
    return []
 def user_payload() -> dict:
    # Mirrors the WHOAMI probe in sop-tier-check.sh
    return {"login": "sop-tier-bot-fixture"}
 class Handler(http.server.BaseHTTPRequestHandler):
    # Quiet — keep stdout for explicit logs only.
    def log_message(self, *args, **kwargs):  # noqa: D401
        pass
    def _json(self, code: int, body) -> None:
        payload = json.dumps(body).encode()
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)
    def _empty(self, code: int) -> None:
        self.send_response(code)
        self.send_header("Content-Length", "0")
        self.end_headers()
    def do_GET(self):  # noqa: N802
        u = urllib.parse.urlparse(self.path)
        path = u.path
        if path == "/_ping":
            return self._json(200, {"ok": True})
        if path == "/api/v1/user":
            return self._json(200, user_payload())
        # /api/v1/repos/{owner}/{name}/pulls/{n}
        m = re.match(r"^/api/v1/repos/[^/]+/[^/]+/pulls/(\d+)$", path)
        if m:
            return self._json(200, pr_payload())
        # /api/v1/repos/{owner}/{name}/issues/{n}/labels
        if re.match(r"^/api/v1/repos/[^/]+/[^/]+/issues/\d+/labels$", path):
            return self._json(200, labels_payload())
        # /api/v1/repos/{owner}/{name}/pulls/{n}/reviews
        if re.match(r"^/api/v1/repos/[^/]+/[^/]+/pulls/\d+/reviews$", path):
            return self._json(200, reviews_payload())
        # /api/v1/orgs/{owner}/teams
        if re.match(r"^/api/v1/orgs/[^/]+/teams$", path):
            return self._json(200, teams_payload())
        # /api/v1/teams/{id}/members/{login} → 204 if user is an engineer
        m = re.match(r"^/api/v1/teams/(\d+)/members/([^/]+)$", path)
        if m:
            team_id, login = m.group(1), m.group(2)
            # In our fixture reviewer-engineer ∈ engineers (id=2)
            if team_id == "2" and login == "reviewer-engineer":
                return self._empty(204)
            return self._empty(404)
        # /api/v1/orgs/{owner}/members/{login} — fallback path used when
        # team-member probes all 403. We don't need it for these tests.
        if re.match(r"^/api/v1/orgs/[^/]+/members/[^/]+$", path):
            return self._empty(404)
        # /api/v1/repos/{owner}/{name}/statuses/{sha}
        if re.match(r"^/api/v1/repos/[^/]+/[^/]+/statuses/[^/]+$", path):
            return self._json(200, statuses_payload())
        return self._json(404, {"path": path, "msg": "fixture: no route"})
    def do_POST(self):  # noqa: N802
        u = urllib.parse.urlparse(self.path)
        path = u.path
        length = int(self.headers.get("Content-Length") or 0)
        raw = self.rfile.read(length) if length else b""
        try:
            body = json.loads(raw) if raw else {}
        except Exception:
            body = {"_raw": raw.decode(errors="replace")}
        if re.match(r"^/api/v1/repos/[^/]+/[^/]+/statuses/[^/]+$", path):
            append_post(body)
            # Echo back something status-shaped — script only checks HTTP code.
            return self._json(
                201,
                {
                    "context": body.get("context"),
                    "state": body.get("state"),
                    "created_at": now_iso(),
                },
            )
        return self._json(404, {"path": path, "msg": "fixture: no route"})
 def main():
    port = int(sys.argv[1])
    srv = http.server.ThreadingHTTPServer(("127.0.0.1", port), Handler)
    srv.serve_forever()
 if __name__ == "__main__":
    main()
--- a/.gitea/scripts/tests/test_sop_tier_check_clause_split.sh
+++ b/.gitea/scripts/tests/test_sop_tier_check_clause_split.sh
@ -0,0 +1,101 @@
 #!/usr/bin/env bash
 # Regression test for #229 — sop-tier-check tier:low OR-clause splitter.
 #
 # Bug (PR #225 → still broken after PR #231):
 #   Line ~289 of sop-tier-check.sh used:
 #     _clause=$(echo "$_raw_clause" | tr -d '()' | tr ',' '\n' | tr -d '[:space:]' | grep -v '^$')
 #   `tr -d '[:space:]'` strips the newlines that `tr ',' '\n'` just
 #   inserted, collapsing "engineers,managers,ceo" into a single token
 #   "engineersmanagersceo". The for-loop then iterates ONCE on a name
 #   that matches no team, so every tier:low PR fails:
 #     ::error::clause [engineers/managers/ceo]: FAIL — no approving
 #     reviewer belongs to any of these teamsengineersmanagersceo
 #   (note also: missing separators in the error string is bug #2 —
 #    `_clause_names` used "${var:+, }$x" which OVERWRITES per iteration).
 #
 # Fix shape (this PR):
 #   _no_parens=${_raw_clause//[()]/}
 #   _clause=${_no_parens//,/ }    # comma -> space, bash word-split iterates
 #   _clause_names="${_clause_names}${_clause_names:+, }${_t}"  # APPEND, not overwrite
 #
 # This test extracts the splitter logic and asserts it produces the right
 # token list for each of the three tier expressions live in the script.
 set -euo pipefail
 PASS=0
 FAIL=0
 assert_eq() {
  local label="$1"
  local expected="$2"
  local got="$3"
  if [ "$expected" = "$got" ]; then
    echo "  PASS  $label"
    PASS=$((PASS + 1))
  else
    echo "  FAIL  $label"
    echo "        expected: <$expected>"
    echo "        got:      <$got>"
    FAIL=$((FAIL + 1))
  fi
 }
 # ----- Splitter under test (mirrors the fixed sop-tier-check.sh block) -----
 split_clause() {
  local raw="$1"
  local no_parens=${raw//[()]/}
  local clause=${no_parens//,/ }
  local out=""
  for _t in $clause; do
    out="${out}${out:+|}$_t"
  done
  echo "$out"
 }
 echo "test: tier:low OR-clause splits to 3 tokens"
 assert_eq "tier:low" "engineers|managers|ceo" "$(split_clause "engineers,managers,ceo")"
 echo "test: tier:medium AND-expression — bash word-split on \$EXPR yields 5 tokens"
 EXPR="managers AND engineers AND qa???,security???"
 out=""
 for _raw in $EXPR; do
  out="${out}${out:+ ; }$(split_clause "$_raw")"
 done
 assert_eq "tier:medium" "managers ; AND ; engineers ; AND ; qa???|security???" "$out"
 echo "test: tier:high single-team OR-clause"
 assert_eq "tier:high" "ceo" "$(split_clause "ceo")"
 echo "test: paren-wrapped OR-set unwraps + splits"
 assert_eq "paren OR" "managers|ceo" "$(split_clause "(managers,ceo)")"
 # ----- _clause_names accumulator (was overwriting per iteration) -----
 acc=""
 for t in engineers managers ceo; do
  acc="${acc}${acc:+, }${t}"
 done
 assert_eq "_clause_names append" "engineers, managers, ceo" "$acc"
 # ----- _failed_clauses / _passed_clauses accumulator across raw clauses -----
 acc=""
 for c in clauseA clauseB clauseC; do
  acc="${acc}${acc:+, }${c}"
 done
 assert_eq "_failed_clauses append" "clauseA, clauseB, clauseC" "$acc"
 # ----- End-to-end OR-gate: simulate APPROVER_TEAMS[core-lead]=' managers ' -----
 # The script's case pattern is *${_t}* with a space-padded value.
 APPROVER_TEAMS_VAL=" managers "
 matched=""
 for _t in $(split_clause "engineers,managers,ceo" | tr '|' ' '); do
  case "$APPROVER_TEAMS_VAL" in
    *${_t}*) matched="$_t"; break ;;
  esac
 done
 assert_eq "OR-gate matches managers" "managers" "$matched"
 echo
 echo "------"
 echo "PASS=$PASS FAIL=$FAIL"
 [ "$FAIL" -eq 0 ]
--- a/.gitea/scripts/tests/test_sop_tier_refire.sh
+++ b/.gitea/scripts/tests/test_sop_tier_refire.sh
@ -0,0 +1,297 @@
 #!/usr/bin/env bash
 # Tests for sop-tier-refire.{yml,sh} — internal#292.
 #
 # Behavior matrix:
 #
 #   T1: PR open + APPROVED via tier:low → script invokes sop-tier-check
 #       and POSTs status=success.
 #   T2: PR open + missing tier label → sop-tier-check exits non-zero;
 #       refire POSTs status=failure (description mentions failure).
 #   T3: PR open + tier:low but NO approving reviews → sop-tier-check
 #       exits non-zero; refire POSTs status=failure.
 #   T4: PR CLOSED → refire exits 0 with no status POST (no-op on closed).
 #   T5: Rate-limit — recent status update within 30s → refire skips,
 #       no new POST.
 #   T6 (yaml-lint): workflow `if:` expression contains author_association
 #       gate + slash-command-trigger gate + PR-not-issue gate.
 #   T7 (yaml-lint): workflow file is parseable YAML.
 #
 # Tests T1-T5 run the real script against a local-fixture HTTP server
 # (python http.server with a stub handler — `tests/_refire_fixture.py`)
 # so the script's Gitea API calls hit the fixture, not the real Gitea.
 #
 # Tests T6/T7 are pure YAML checks against the workflow file.
 #
 # Hostile-self-review (per feedback_assert_exact_not_substring):
 # this test MUST FAIL if the workflow or script is absent. Verified by
 # running the test before the files exist (covered in the PR body).
 set -euo pipefail
 THIS_DIR="$(cd "$(dirname "$0")" && pwd)"
 SCRIPT_DIR="$(cd "$THIS_DIR/.." && pwd)"
 WORKFLOW_DIR="$(cd "$THIS_DIR/../../workflows" && pwd)"
 WORKFLOW="$WORKFLOW_DIR/sop-tier-refire.yml"
 SCRIPT="$SCRIPT_DIR/sop-tier-refire.sh"
 PASS=0
 FAIL=0
 FAILED_TESTS=""
 assert_eq() {
  local label="$1"
  local expected="$2"
  local got="$3"
  if [ "$expected" = "$got" ]; then
    echo "  PASS  $label"
    PASS=$((PASS + 1))
  else
    echo "  FAIL  $label"
    echo "        expected: <$expected>"
    echo "        got:      <$got>"
    FAIL=$((FAIL + 1))
    FAILED_TESTS="${FAILED_TESTS} ${label}"
  fi
 }
 assert_contains() {
  local label="$1"
  local needle="$2"
  local haystack="$3"
  if printf '%s' "$haystack" | grep -qF "$needle"; then
    echo "  PASS  $label"
    PASS=$((PASS + 1))
  else
    echo "  FAIL  $label"
    echo "        needle:    <$needle>"
    echo "        haystack:  <$(printf '%s' "$haystack" | head -c 400)>"
    FAIL=$((FAIL + 1))
    FAILED_TESTS="${FAILED_TESTS} ${label}"
  fi
 }
 assert_file_exists() {
  local label="$1"
  local path="$2"
  if [ -f "$path" ]; then
    echo "  PASS  $label"
    PASS=$((PASS + 1))
  else
    echo "  FAIL  $label (not found: $path)"
    FAIL=$((FAIL + 1))
    FAILED_TESTS="${FAILED_TESTS} ${label}"
  fi
 }
 # Existence (foundation — every other test depends on these)
 echo
 echo "== existence =="
 assert_file_exists "workflow file exists"  "$WORKFLOW"
 assert_file_exists "script file exists"    "$SCRIPT"
 if [ "$FAIL" -gt 0 ]; then
  echo
  echo "------"
  echo "PASS=$PASS FAIL=$FAIL (existence)"
  echo "Cannot proceed without these files."
  exit 1
 fi
 # T6 / T7 — workflow YAML structure
 echo
 echo "== T6/T7 workflow yaml =="
 # YAML parseability
 PARSE_OUT=$(python3 -c 'import sys,yaml;yaml.safe_load(open(sys.argv[1]).read());print("ok")' "$WORKFLOW" 2>&1 || true)
 assert_eq "T7 workflow parses as YAML" "ok" "$PARSE_OUT"
 # Three required gates in the `if:` expression
 WORKFLOW_CONTENT=$(cat "$WORKFLOW")
 assert_contains "T6a workflow if: contains author_association gate" \
  "github.event.comment.author_association" "$WORKFLOW_CONTENT"
 assert_contains "T6b workflow if: gates on MEMBER/OWNER/COLLABORATOR" \
  '["MEMBER","OWNER","COLLABORATOR"]' "$WORKFLOW_CONTENT"
 assert_contains "T6c workflow if: contains slash-command trigger" \
  "/refire-tier-check" "$WORKFLOW_CONTENT"
 assert_contains "T6d workflow if: gates on PR-not-issue" \
  "github.event.issue.pull_request" "$WORKFLOW_CONTENT"
 assert_contains "T6e workflow listens on issue_comment" \
  "issue_comment" "$WORKFLOW_CONTENT"
 assert_contains "T6f workflow requests statuses:write permission" \
  "statuses: write" "$WORKFLOW_CONTENT"
 # Does NOT check out PR HEAD (security)
 if grep -q 'ref: \${{ github.event.pull_request.head' "$WORKFLOW"; then
  echo "  FAIL  T6g workflow MUST NOT check out PR head (security)"
  FAIL=$((FAIL + 1))
  FAILED_TESTS="${FAILED_TESTS} T6g"
 else
  echo "  PASS  T6g workflow does not check out PR head"
  PASS=$((PASS + 1))
 fi
 # T1-T5 — script behavior against a local Gitea-fixture
 echo
 echo "== T1-T5 script behavior (vs local fixture) =="
 # Spin up the fixture HTTP server.
 FIXTURE_DIR=$(mktemp -d)
 trap 'rm -rf "$FIXTURE_DIR"; [ -n "${FIX_PID:-}" ] && kill "$FIX_PID" 2>/dev/null || true' EXIT
 FIXTURE_PY="$THIS_DIR/_refire_fixture.py"
 if [ ! -f "$FIXTURE_PY" ]; then
  echo "::error::fixture server $FIXTURE_PY missing"
  exit 1
 fi
 FIX_LOG="$FIXTURE_DIR/fixture.log"
 FIX_STATE_DIR="$FIXTURE_DIR/state"
 mkdir -p "$FIX_STATE_DIR"
 # Find an unused port.
 FIX_PORT=$(python3 -c 'import socket;s=socket.socket();s.bind(("127.0.0.1",0));print(s.getsockname()[1]);s.close()')
 FIXTURE_STATE_DIR="$FIX_STATE_DIR" python3 "$FIXTURE_PY" "$FIX_PORT" \
  >"$FIX_LOG" 2>&1 &
 FIX_PID=$!
 # Wait for fixture readiness.
 for _ in $(seq 1 50); do
  if curl -fsS "http://127.0.0.1:${FIX_PORT}/_ping" >/dev/null 2>&1; then
    break
  fi
  sleep 0.1
 done
 if ! curl -fsS "http://127.0.0.1:${FIX_PORT}/_ping" >/dev/null 2>&1; then
  echo "::error::fixture server failed to start. Log:"
  cat "$FIX_LOG"
  exit 1
 fi
 # Helper: set fixture state for a scenario, then run the script.
 # tier_result is one of: pass | fail_no_label | fail_no_approvals.
 # The refire script's tier-check invocation is mocked because the real
 # sop-tier-check.sh uses bash 4+ associative arrays — incompatible with
 # the macOS bash 3.2 dev shell. Linux Gitea runners use bash 4/5 so
 # production runs the real script. The mock exercises the success +
 # failure branches of refire's status-POST glue.
 run_scenario() {
  local scenario="$1"
  local tier_result="${2:-pass}"
  echo "$scenario" >"$FIX_STATE_DIR/scenario"
  : >"$FIX_STATE_DIR/posted_statuses.jsonl"  # clear status log
  local out
  set +e
  out=$(
    PATH="$FIXTURE_DIR/bin:$PATH" \
    GITEA_TOKEN="fixture-token" \
    GITEA_HOST="fixture.local" \
    REPO="molecule-ai/molecule-core" \
    PR_NUMBER="999" \
    COMMENT_AUTHOR="test-runner" \
    SOP_REFIRE_DISABLE_RATE_LIMIT="1" \
    SOP_REFIRE_TIER_CHECK_SCRIPT="$THIS_DIR/_mock_tier_check.sh" \
    MOCK_TIER_RESULT="$tier_result" \
    FIXTURE_PORT="$FIX_PORT" \
    bash "$SCRIPT" 2>&1
  )
  local rc=$?
  set -e
  echo "$out" >"$FIX_STATE_DIR/last_run.log"
  echo "$rc" >"$FIX_STATE_DIR/last_rc"
 }
 # Install a curl shim that rewrites https://fixture.local → http://127.0.0.1:$PORT
 # Use bash prefix-strip (${var#prefix}) — it sidesteps the `/` delimiter
 # confusion of ${var/pattern/replacement}.
 mkdir -p "$FIXTURE_DIR/bin"
 cat >"$FIXTURE_DIR/bin/curl" <<SHIM
 #!/usr/bin/env bash
 # Test shim: rewrite https://fixture.local/* -> http://127.0.0.1:${FIX_PORT}/*
 # The fixture doesn't authenticate; -H Authorization passes through harmlessly.
 new_args=()
 for a in "\$@"; do
  if [[ "\$a" == https://fixture.local/* ]]; then
    rest="\${a#https://fixture.local}"
    a="http://127.0.0.1:${FIX_PORT}\${rest}"
  fi
  new_args+=("\$a")
 done
 exec /usr/bin/curl "\${new_args[@]}"
 SHIM
 chmod +x "$FIXTURE_DIR/bin/curl"
 # T1: tier:low + 1 APPROVED + author is in engineers team → success
 run_scenario "T1_success" "pass"
 RC=$(cat "$FIX_STATE_DIR/last_rc")
 POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
 assert_eq "T1 exit code 0 (success)" "0" "$RC"
 assert_contains "T1 POSTed state=success" '"state": "success"' "$POSTED"
 assert_contains "T1 POST context is sop-tier-check / tier-check" \
  '"context": "sop-tier-check / tier-check (pull_request)"' "$POSTED"
 assert_contains "T1 description names commenter" "test-runner" "$POSTED"
 # T2: missing tier label → tier-check fails → failure status POSTed
 run_scenario "T2_no_tier_label" "fail_no_label"
 RC=$(cat "$FIX_STATE_DIR/last_rc")
 POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
 # tier-check.sh exits 1; refire script forwards that exit, so RC != 0
 if [ "$RC" -ne 0 ]; then
  echo "  PASS  T2 exit code non-zero (got $RC)"
  PASS=$((PASS + 1))
 else
  echo "  FAIL  T2 exit code should be non-zero, got 0"
  FAIL=$((FAIL + 1))
  FAILED_TESTS="${FAILED_TESTS} T2_rc"
 fi
 assert_contains "T2 POSTed state=failure" '"state": "failure"' "$POSTED"
 # T3: tier:low present but ZERO approving reviews → failure
 run_scenario "T3_no_approvals" "fail_no_approvals"
 RC=$(cat "$FIX_STATE_DIR/last_rc")
 POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
 if [ "$RC" -ne 0 ]; then
  echo "  PASS  T3 exit code non-zero (got $RC)"
  PASS=$((PASS + 1))
 else
  echo "  FAIL  T3 exit code should be non-zero, got 0"
  FAIL=$((FAIL + 1))
  FAILED_TESTS="${FAILED_TESTS} T3_rc"
 fi
 assert_contains "T3 POSTed state=failure" '"state": "failure"' "$POSTED"
 # T4: closed PR — refire is a no-op (no POST, exit 0)
 run_scenario "T4_closed" "pass"
 RC=$(cat "$FIX_STATE_DIR/last_rc")
 POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
 assert_eq "T4 closed PR exits 0" "0" "$RC"
 assert_eq "T4 closed PR posts no status" "" "$POSTED"
 # T5: rate-limit — disable the env override and let scenario set a
 # recent statuses entry. Re-enable rate-limit for this scenario by NOT
 # passing SOP_REFIRE_DISABLE_RATE_LIMIT.
 echo "T5_rate_limited" >"$FIX_STATE_DIR/scenario"
 : >"$FIX_STATE_DIR/posted_statuses.jsonl"
 set +e
 T5_OUT=$(
  PATH="$FIXTURE_DIR/bin:$PATH" \
  GITEA_TOKEN="fixture-token" \
  GITEA_HOST="fixture.local" \
  REPO="molecule-ai/molecule-core" \
  PR_NUMBER="999" \
  COMMENT_AUTHOR="test-runner" \
  FIXTURE_PORT="$FIX_PORT" \
  bash "$SCRIPT" 2>&1
 )
 T5_RC=$?
 set -e
 POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
 assert_eq "T5 rate-limited exits 0" "0" "$T5_RC"
 assert_contains "T5 rate-limited log says skipped" "rate-limited" "$T5_OUT"
 assert_eq "T5 rate-limited posts no status" "" "$POSTED"
 echo
 echo "------"
 echo "PASS=$PASS FAIL=$FAIL"
 if [ "$FAIL" -gt 0 ]; then
  echo "Failed:$FAILED_TESTS"
 fi
 [ "$FAIL" -eq 0 ]
--- a/.gitea/workflows/audit-force-merge.yml
+++ b/.gitea/workflows/audit-force-merge.yml
@ -0,0 +1,88 @@
 # audit-force-merge — emit `incident.force_merge` to the runner log when
 # a PR is merged with required-status checks NOT all green. Vector picks
 # the JSON line off docker_logs and ships to Loki on
 # molecule-canonical-obs (per `reference_obs_stack_phase1`); query as:
 #
 #   {host="operator"} |= "event_type" |= "incident.force_merge" | json
 #
 # Companion to `audit-force-merge.sh` (script-extract pattern, same as
 # sop-tier-check). The audit observes BOTH UI-merged and REST-merged PRs
 # uniformly per `feedback_gh_cli_merge_lies_use_rest`.
 #
 # Closes the §SOP-6 audit gap for the molecule-core repo. RFC:
 # internal#219 §6. Mirrors the same-named workflow in
 # molecule-controlplane; design rationale lives in the RFC, not here,
 # to keep the workflow file scannable.
 name: audit-force-merge
 # pull_request_target loads from the base branch — same security model
 # as sop-tier-check. Without this, a PR author could rewrite the
 # workflow on their own PR and skip the audit emission for their own
 # force-merge. The base-branch checkout below ALSO uses
 # `base.sha`, not `base.ref`, so a fast-moving base can't slip a
 # different audit script in under us.
 on:
  pull_request_target:
    types: [closed]
 # `pull-requests: read` + `contents: read` covers everything the script
 # needs (fetch PR + commit statuses). `issues:` deliberately omitted —
 # audit fires-and-forgets to stdout, never opens issues.
 permissions:
  contents: read
  pull-requests: read
 jobs:
  audit:
    runs-on: ubuntu-latest
    # Skip when PR is closed without merge — saves a runner.
    if: github.event.pull_request.merged == true
    steps:
      - name: Check out base branch (for the script)
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          # base.sha pinning, NOT base.ref — see header rationale.
          ref: ${{ github.event.pull_request.base.sha }}
      - name: Detect force-merge + emit audit event
        env:
          # Same org-level secret the sop-tier-check workflow uses;
          # falls back to the auto-injected GITHUB_TOKEN if the
          # org-level SOP_TIER_CHECK_TOKEN isn't set on a transitional
          # repo.
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
          GITEA_HOST: git.moleculesai.app
          REPO: ${{ github.repository }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          # Required-status-check contexts to evaluate at merge time.
          # Newline-separated. MUST mirror branch protection's
          # status_check_contexts for protected branches
          # (currently `main`; `staging` protection forthcoming per
          # RFC internal#219 Phase 4).
          #
          # Initialized 2026-05-11 from the current molecule-core `main`
          # branch protection:
          #
          #   GET /api/v1/repos/molecule-ai/molecule-core/
          #       branch_protections/main
          #   → status_check_contexts = [
          #       "Secret scan / Scan diff for credential-shaped strings (pull_request)",
          #       "sop-tier-check / tier-check (pull_request)"
          #     ]
          #
          # Declared here rather than fetched from /branch_protections
          # because that endpoint requires admin write — sop-tier-bot
          # is read-only by design (least-privilege per
          # `feedback_least_privilege_via_workflow_env` / internal#257).
          # Drift between this env and the real protection list is
          # auto-detected by `ci-required-drift.yml` (RFC §4 + §6),
          # which opens a `[ci-drift]` issue within one hour.
          #
          # When the protection set changes (e.g. Phase 4 adds the
          # `ci / all-required (pull_request)` sentinel), update BOTH
          # branch protection AND this env in the SAME PR; drift-detect
          # will otherwise file an issue for you.
          REQUIRED_CHECKS: |
            Secret scan / Scan diff for credential-shaped strings (pull_request)
            sop-tier-check / tier-check (pull_request)
        run: bash .gitea/scripts/audit-force-merge.sh
--- a/.gitea/workflows/block-internal-paths.yml
+++ b/.gitea/workflows/block-internal-paths.yml
@ -0,0 +1,148 @@
 name: Block internal-flavored paths
 # Ported from .github/workflows/block-internal-paths.yml on 2026-05-11 per
 # RFC internal#219 §1 sweep.
 #
 # Differences from the GitHub version:
 #   - Dropped `merge_group: { types: [checks_requested] }` (Gitea has no
 #     merge queue; no `gh-readonly-queue/...` refs).
 #   - Workflow-level env.GITHUB_SERVER_URL set per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on the job (RFC §1 contract — surface
 #     defects without blocking; follow-up PR flips after triage).
 #
 # Hard CI gate. Internal content (positioning, competitive briefs, sales
 # playbooks, PMM/press drip, draft campaigns) lives in molecule-ai/internal —
 # this public monorepo must never re-acquire those paths. CEO directive
 # 2026-04-23 after a fleet-wide audit found 79 internal files leaked here.
 #
 # Failure mode without this gate: agents (PMM, Research, DevRel, Sales) drop
 # briefs into the easiest path their cwd resolves to (root /research,
 # /marketing, /docs/marketing) and gitignore alone won't catch a `git add -f`
 # or a stale gitignore line. This workflow is the mechanical backstop.
 on:
  pull_request:
    types: [opened, synchronize, reopened]
  push:
    branches: [main, staging]
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  check:
    name: Block forbidden paths
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 2  # need previous commit to diff against on push events
      # For pull_request events the diff base is github.event.pull_request.base.sha,
      # which may be many commits behind HEAD and therefore absent from the
      # shallow clone above. Fetch it explicitly (depth=1 keeps it fast).
      - name: Fetch PR base SHA (pull_request events only)
        if: github.event_name == 'pull_request'
        run: git fetch --depth=1 origin ${{ github.event.pull_request.base.sha }}
      - name: Refuse if forbidden paths appear
        env:
          # Plumb event-specific SHAs through env so the script doesn't
          # need conditional `${{ ... }}` interpolation per event type.
          # github.event.before/after only exist on push events;
          # pull_request has pull_request.base.sha / pull_request.head.sha.
          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
          PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
          PUSH_BEFORE: ${{ github.event.before }}
          PUSH_AFTER: ${{ github.event.after }}
        run: |
          # Paths that must NEVER live in the public monorepo. Add to this
          # list narrowly — broader patterns belong in .gitignore so day-to-day
          # docs work isn't accidentally blocked.
          FORBIDDEN_PATTERNS=(
            "^research/"
            "^marketing/"
            "^docs/marketing/"
            "^comment-[0-9]+\.json$"
            "^test-pmm.*\.(txt|md)$"
            "^tick-reflections.*\.(txt|md)$"
            ".*-temp\.(md|txt)$"
          )
          # Determine the diff base. Each event type stores its SHAs in
          # a different place — see the env block above.
          case "${{ github.event_name }}" in
            pull_request)
              BASE="$PR_BASE_SHA"
              HEAD="$PR_HEAD_SHA"
              ;;
            *)
              BASE="$PUSH_BEFORE"
              HEAD="$PUSH_AFTER"
              ;;
          esac
          # On push events with shallow clones, BASE may be present in
          # the event payload but absent from the local object DB
          # (fetch-depth=2 doesn't always reach the previous commit
          # across true merges). Try fetching it on demand. If the
          # fetch fails — e.g. the SHA was force-overwritten — we fall
          # through to the empty-BASE branch below, which scans the
          # entire tree as if every file were new. Correct, just slow.
          if [ -n "$BASE" ] && ! echo "$BASE" | grep -qE '^0+$'; then
            if ! git cat-file -e "$BASE" 2>/dev/null; then
              git fetch --depth=1 origin "$BASE" 2>/dev/null || true
            fi
          fi
          # Files added or modified in this change.
          if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$' || ! git cat-file -e "$BASE" 2>/dev/null; then
            # New branch / no previous SHA / BASE unreachable — check
            # the entire tree as if every file were new. Slower but
            # correct on first push or post-fetch-failure recovery.
            CHANGED=$(git ls-tree -r --name-only HEAD)
          else
            CHANGED=$(git diff --name-only --diff-filter=AM "$BASE" "$HEAD")
          fi
          if [ -z "$CHANGED" ]; then
            echo "No changed files to inspect."
            exit 0
          fi
          OFFENDING=""
          for path in $CHANGED; do
            for pattern in "${FORBIDDEN_PATTERNS[@]}"; do
              if echo "$path" | grep -qE "$pattern"; then
                OFFENDING="${OFFENDING}${path} (matched: ${pattern})\n"
                break
              fi
            done
          done
          if [ -n "$OFFENDING" ]; then
            echo "::error::Forbidden internal-flavored paths detected:"
            printf "$OFFENDING"
            echo ""
            echo "These paths belong in molecule-ai/internal, not this public repo."
            echo "See docs/internal-content-policy.md for canonical locations."
            echo ""
            echo "If your file is genuinely public-facing (e.g. a blog post"
            echo "ready to ship), use one of these alternatives instead:"
            echo "  - Public-bound blog posts:  docs/blog/<slug>.md"
            echo "  - Public-bound tutorials:   docs/tutorials/<slug>.md"
            echo "  - Public devrel content:    docs/devrel/<slug>.md"
            echo ""
            echo "If you legitimately need to add a new top-level path that"
            echo "happens to match a forbidden pattern, edit"
            echo ".gitea/workflows/block-internal-paths.yml and update the"
            echo "FORBIDDEN_PATTERNS list with reviewer signoff."
            exit 1
          fi
          echo "OK No forbidden paths in this change."
--- a/.gitea/workflows/cascade-list-drift-gate.yml
+++ b/.gitea/workflows/cascade-list-drift-gate.yml
@ -0,0 +1,58 @@
 name: cascade-list-drift-gate
 # Ported from .github/workflows/cascade-list-drift-gate.yml on 2026-05-11
 # per RFC internal#219 §1 sweep.
 #
 # Differences from the GitHub version:
 #   - on.paths reference .gitea/workflows/publish-runtime.yml (the active
 #     Gitea workflow file) instead of .github/workflows/publish-runtime.yml
 #     (which Category A of this sweep deletes).
 #   - Explicit `WORKFLOW=` arg passed to the drift script so it audits the
 #     .gitea/ workflow (the script's default is still .github/... which
 #     will not exist post-Cat-A).
 #   - Workflow-level env.GITHUB_SERVER_URL set per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on the job (RFC §1 contract — surface
 #     defects without blocking; follow-up PR flips after triage).
 #
 # Structural gate: TEMPLATES list in publish-runtime.yml must match
 # manifest.json's workspace_templates exactly. Closes the recurrence
 # path of PR #2556 (the data fix) and is the first concrete deliverable
 # of RFC #388 PR-3.
 #
 # Triggers narrowly to keep CI quiet: only on PRs that actually change
 # one of the two files. The path-filtered split + always-emit-result
 # pattern (memory: "Required check names need a job that always runs")
 # is unnecessary here because the workflow IS the check name and PR
 # branch protection should require it directly. Future-proof: if this
 # becomes a required check, add a no-op aggregator with always() so the
 # name still emits when paths don't match.
 on:
  pull_request:
    branches: [staging, main]
    paths:
      - manifest.json
      - .gitea/workflows/publish-runtime.yml
      - scripts/check-cascade-list-vs-manifest.sh
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 permissions:
  contents: read
 jobs:
  check:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
    continue-on-error: true
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
      - name: Check cascade list matches manifest
        # Pass the .gitea/ workflow path explicitly — the script's
        # default still points at .github/... which Category A of this
        # sweep removes.
        run: bash scripts/check-cascade-list-vs-manifest.sh manifest.json .gitea/workflows/publish-runtime.yml
--- a/.gitea/workflows/check-migration-collisions.yml
+++ b/.gitea/workflows/check-migration-collisions.yml
@ -0,0 +1,74 @@
 name: Check migration collisions
 # Ported from .github/workflows/check-migration-collisions.yml on 2026-05-11
 # per RFC internal#219 §1 sweep.
 #
 # Differences from the GitHub version:
 #   - on.paths includes .gitea/workflows/check-migration-collisions.yml
 #     (this file) instead of the .github/ one.
 #   - Workflow-level env.GITHUB_SERVER_URL pinned to https://git.moleculesai.app
 #     so scripts/ops/check_migration_collisions.py can derive the Gitea API
 #     base (the script already supports this; see _gitea_api_url()).
 #   - `continue-on-error: true` on the job (RFC §1 contract).
 #
 # Hard gate (#2341): fails a PR that adds a migration prefix already
 # claimed by the base branch or another open PR. Caught manually 2026-04-30
 # during PR #2276 rebase: 044_runtime_image_pins collided with
 # 044_platform_inbound_secret from RFC #2312. This workflow makes that
 # check automatic.
 #
 # Trigger model: pull_request only — there's no value running this on
 # pushes to staging or main (those are post-merge; the gate must fire
 # pre-merge to be useful). Path filter scopes to PRs that actually touch
 # migrations.
 on:
  pull_request:
    types: [opened, synchronize, reopened]
    paths:
      - 'workspace-server/migrations/**'
      - 'scripts/ops/check_migration_collisions.py'
      - '.gitea/workflows/check-migration-collisions.yml'
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 permissions:
  contents: read
  # API needs read access to other PRs to detect cross-PR collisions
  pull-requests: read
 jobs:
  check:
    name: Migration version collision check
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
    continue-on-error: true
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          # Need history to diff against base ref
          fetch-depth: 0
      - name: Detect collisions
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
          BASE_REF: origin/${{ github.event.pull_request.base.ref }}
          HEAD_REF: ${{ github.event.pull_request.head.sha }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          # Auto-injected; Gitea aliases this for in-repo API access.
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Ensure the named base ref exists locally. checkout@v4 with
          # fetch-depth=0 pulls full history, but the explicit fetch is
          # cheap insurance against form-of-ref differences across runs.
          #
          # IMPORTANT: do NOT pass --depth=1 here. The script below uses
          # `git diff origin/<base>...<head>` (three-dot, merge-base form),
          # which fails with "fatal: no merge base" if the base ref is
          # shallow.
          git fetch origin "${{ github.event.pull_request.base.ref }}" || true
          python3 scripts/ops/check_migration_collisions.py
--- a/.gitea/workflows/ci-required-drift.yml
+++ b/.gitea/workflows/ci-required-drift.yml
@ -0,0 +1,107 @@
 # ci-required-drift — hourly sentinel for drift between the canonical
 # "what counts as required" sources of truth in this repo:
 #
 #   1. `.gitea/workflows/ci.yml` jobs                       (CI source)
 #   2. `branch_protections/{main,staging}.status_check_contexts`
 #                                                           (protection)
 #   3. `.gitea/workflows/audit-force-merge.yml` REQUIRED_CHECKS env
 #                                                           (audit env)
 #
 # RFC: internal#219 §4 (jobs ↔ protection) + §6 (audit env ↔ protection).
 # Ported verbatim-then-adapted from molecule-controlplane PR#112
 # (SHA 0adf2098) per RFC internal#219 Phase 2b+c — replicate repo-by-repo.
 #
 # When any pair diverges, a `[ci-drift]` issue is opened or updated
 # (idempotent by title) and labelled `tier:high`. This is the
 # auto-detection that closes the regression class identified in
 # RFC §1 finding 3 (protection only listed 2 of 6 real jobs for
 # ~weeks, undetected) and §6 (audit env drifts silently from
 # protection).
 #
 # Diff logic lives in `.gitea/scripts/ci-required-drift.py`. The
 # Python file does YAML AST parsing + `needs:` graph walking per
 # `feedback_behavior_based_ast_gates` — NOT grep-by-name. That way
 # job renames or matrix-expansion-induced churn produce honest signal.
 #
 # IMPORTANT — TRANSITIONAL STATE: molecule-core's ci.yml does NOT yet
 # contain the `all-required` sentinel job (RFC §4 Phase 4 adds it).
 # Until Phase 4 lands the detector will hard-fail with exit 3 on the
 # missing sentinel. That's intentional: a red workflow on a 5-min cron
 # is louder than a silent issue and forces Phase 4 to land soon.
 name: ci-required-drift
 # IMPORTANT — Gitea 1.22.6 parser quirk per
 # `feedback_gitea_workflow_dispatch_inputs_unsupported`: do NOT add an
 # `inputs:` block here, even though stock GitHub Actions allows it.
 # Gitea 1.22.6 flattens `workflow_dispatch.inputs.X` into a sibling of
 # the `on:` event keys and rejects the entire workflow as
 # "unknown on type". The whole file then registers for ZERO events
 # (no schedule, no dispatch). When Gitea ≥ 1.23 lands fleet-wide,
 # this constraint can be revisited.
 on:
  schedule:
    # Hourly at :17 — offset from :00 to spread load away from the
    # peak when N cron workflows fire on the hour-boundary, per
    # RFC §4 cadence ("off-zero").
    - cron: '17 * * * *'
  workflow_dispatch:
 # Read protection + read CI YAML + write issue. No write on contents.
 permissions:
  contents: read
  issues: write
 # Serialise — two simultaneous drift runs would duel on the issue
 # create/update path. The audit is idempotent, but parallel POSTs
 # can produce duplicate comments before the title-search dedup wins.
 concurrency:
  group: ci-required-drift
  cancel-in-progress: false
 jobs:
  drift:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - name: Check out repo (we read the YAML files locally)
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      - name: Set up Python (PyYAML for AST parsing)
        # Avoid a system-pip install on the runner; setup-python pins
        # a hermetic interpreter + cache. PyYAML is small enough that
        # the install is sub-2s — no need to cache wheels.
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5.6.0
        with:
          python-version: '3.12'
      - name: Install PyYAML
        run: python -m pip install --quiet 'PyYAML==6.0.2'
      - name: Run drift detector
        env:
          # GITEA_TOKEN reads protection + writes issues. molecule-core
          # uses `SOP_TIER_CHECK_TOKEN` as the org-level secret name for
          # read-only Gitea API access from CI (set by audit-force-merge
          # and sop-tier-check too). Falls back to the auto-injected
          # GITHUB_TOKEN if the org-level secret isn't set
          # (transitional repos).
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
          GITEA_HOST: git.moleculesai.app
          REPO: ${{ github.repository }}
          # Branches whose protection we compare against. molecule-core
          # currently has main protected; staging protection is
          # forthcoming. Keep this list in sync if a new long-lived
          # branch gets protected (e.g. release/* if introduced later).
          BRANCHES: 'main staging'
          # The sentinel job's name inside ci.yml. If the aggregator
          # is ever renamed, update this too (the drift detector
          # currently treats `all-required` as the source of "what
          # the sentinel claims to require").
          SENTINEL_JOB: 'all-required'
          # Path to the audit workflow whose REQUIRED_CHECKS env we
          # cross-check against protection (RFC §6).
          AUDIT_WORKFLOW_PATH: '.gitea/workflows/audit-force-merge.yml'
          # Path to the CI workflow with the sentinel + the jobs.
          CI_WORKFLOW_PATH: '.gitea/workflows/ci.yml'
          # Issue label applied on file/update. `tier:high` exists in
          # the molecule-core label set (verified 2026-05-11, label id 9).
          DRIFT_LABEL: 'tier:high'
        run: python3 .gitea/scripts/ci-required-drift.py
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@ -0,0 +1,453 @@
 # Ported from .github/workflows/ci.yml on 2026-05-11 per RFC internal#219 §1.
 # continue-on-error: true on every job; follow-up PR will flip required after
 # surfaced bugs are fixed (per RFC §1 — "surface broken workflows without
 # blocking"). The four-surface migration audit
 # (feedback_gitea_actions_migration_audit_pattern) was performed against this
 # port:
 #
 #   1. YAML — dropped `merge_group` trigger (no Gitea merge queue); no
 #      `workflow_dispatch.inputs` to drop (Gitea 1.22.6 rejects those —
 #      feedback_gitea_workflow_dispatch_inputs_unsupported); no `environment:`
 #      blocks; kept `runs-on: ubuntu-latest` (Gitea runner pool advertises
 #      this label per agent_labels in action_runner table). Workflow-level
 #      env.GITHUB_SERVER_URL set as belt-and-suspenders against runner
 #      defaults (feedback_act_runner_github_server_url).
 #
 #   2. Cache — `actions/upload-artifact@v3.2.2` was already pinned to v3 for
 #      Gitea act_runner v0.6 compatibility (a comment in the original called
 #      this out). v4+ is incompatible with Gitea 1.22.x. No `actions/cache`
 #      usage to audit. `actions/setup-python@v6` `cache: pip` is left in
 #      place — works against Gitea's built-in cache server when runner.cache
 #      is configured (currently is, /opt/molecule/runners/config.yaml).
 #
 #   3. Token — workflow uses no custom dispatch tokens. The auto-injected
 #      `GITHUB_TOKEN` (which Gitea aliases to a runner-scoped token) is
 #      sufficient for `actions/checkout` against this same repo.
 #
 #   4. Docs — no docs/scripts reference github.com URLs that need swapping.
 #      The canvas-deploy-reminder step writes a `ghcr.io/...` image
 #      reference into the step summary text — that's documentation prose
 #      pointing at the ECR-mirrored canvas image and stays unchanged for
 #      this port (a separate cleanup if ghcr→ECR sweep is in scope).
 #
 # Cross-links:
 #   - RFC: internal#219 (CI/CD hard-gate hardening)
 #   - Reference port style: molecule-controlplane/.gitea/workflows/ci.yml
 #   - Bugs that may surface immediately and are tracked separately:
 #     internal#214 (Go-side vanity-import / go.sum drift, if any)
 #   - Phase 4 (this PR's follow-up): flip `continue-on-error: false` once
 #     surfaced defects are fixed, then add `all-required` aggregator
 #     sentinel (RFC §2) and PATCH branch protection (Phase 4 scope).
 name: CI
 on:
  push:
    branches: [main, staging]
  pull_request:
    branches: [main, staging]
  # `merge_group` (GitHub merge-queue trigger) dropped — Gitea has no merge
  # queue. The .github/ original retains it; this Gitea-side copy drops it.
 # Cancel in-progress CI runs when a new commit arrives on the same ref.
 # Stale runs queue up otherwise. PR refs and main/staging refs each get
 # their own group because github.ref differs.
 concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true
 env:
  # Belt-and-suspenders against the runner-default trap
  # (feedback_act_runner_github_server_url). Runners are configured with
  # this env via /opt/molecule/runners/config.yaml runner.envs, but pinning
  # at the workflow level protects against a runner regenerated without
  # the config file (feedback_act_runner_needs_config_file_env).
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  # Detect which paths changed so downstream jobs can skip when only
  # docs/markdown files were modified.
  changes:
    name: Detect changes
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after the surfaced defects
    # (if any) are triaged.
    continue-on-error: true
    outputs:
      platform: ${{ steps.check.outputs.platform }}
      canvas: ${{ steps.check.outputs.canvas }}
      python: ${{ steps.check.outputs.python }}
      scripts: ${{ steps.check.outputs.scripts }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
      - id: check
        run: |
          # For PR events: diff against the base branch (not HEAD~1 of the branch,
          # which may be unrelated after force-pushes). When a push updates a PR,
          # both pull_request and push events fire — prefer the PR base so that
          # the diff is always computed against the actual merge base, not the
          # previous SHA on the branch which may be on a different history line.
          BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
          # GITHUB_BASE_REF is set for PR events (the base branch name).
          # For pull_request events we use the stored base.sha; for push events
          # (or when base.sha is unavailable) fall back to github.event.before.
          if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
            BASE="${{ github.event.pull_request.base.sha }}"
          fi
          # Fallback: if BASE is empty or all zeros (new branch), run everything
          if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
            echo "platform=true" >> "$GITHUB_OUTPUT"
            echo "canvas=true" >> "$GITHUB_OUTPUT"
            echo "python=true" >> "$GITHUB_OUTPUT"
            echo "scripts=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          # Both .github/workflows/ci.yml AND .gitea/workflows/ci.yml count
          # as "this workflow changed" — either edit should force-run every
          # downstream job. The Gitea port follows the same shape as the
          # GitHub original so behavior matches when triggered on either
          # platform.
          DIFF=$(git diff --name-only "$BASE" HEAD 2>/dev/null || echo ".gitea/workflows/ci.yml")
          echo "platform=$(echo "$DIFF" | grep -qE '^workspace-server/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
          echo "canvas=$(echo "$DIFF" | grep -qE '^canvas/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
          echo "python=$(echo "$DIFF" | grep -qE '^workspace/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
          echo "scripts=$(echo "$DIFF" | grep -qE '^tests/e2e/|^scripts/|^infra/scripts/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
  # Platform (Go) — Go build/vet/test/lint + coverage gates. The always-run
  # + per-step gating shape preserves the GitHub-side required-check name
  # contract (so when this Gitea port becomes a required check in Phase 4,
  # the name match works on PRs that don't touch workspace-server/).
  platform-build:
    name: Platform (Go)
    needs: changes
    runs-on: ubuntu-latest
    continue-on-error: true
    defaults:
      run:
        working-directory: workspace-server
    steps:
      - if: needs.changes.outputs.platform != 'true'
        working-directory: .
        run: echo "No platform/** changes — skipping real build steps; this job always runs to satisfy the required-check name on branch protection."
      - if: needs.changes.outputs.platform == 'true'
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - if: needs.changes.outputs.platform == 'true'
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
        with:
          go-version: 'stable'
      - if: needs.changes.outputs.platform == 'true'
        run: go mod download
      - if: needs.changes.outputs.platform == 'true'
        run: go build ./cmd/server
      # CLI (molecli) moved to standalone repo: git.moleculesai.app/molecule-ai/molecule-cli
      - if: needs.changes.outputs.platform == 'true'
        run: go vet ./... || true
      - if: needs.changes.outputs.platform == 'true'
        name: Run golangci-lint
        run: golangci-lint run --timeout 3m ./... || true
      - if: needs.changes.outputs.platform == 'true'
        name: Run tests with race detection and coverage
        run: go test -race -coverprofile=coverage.out ./...
      - if: needs.changes.outputs.platform == 'true'
        name: Per-file coverage report
        # Advisory — lists every source file with its coverage so reviewers
        # can see at-a-glance where gaps are. Sorted ascending so the worst
        # offenders float to the top. Does NOT fail the build; the hard
        # gate is the threshold check below. (#1823)
        run: |
          echo "=== Per-file coverage (worst first) ==="
          go tool cover -func=coverage.out \
            | grep -v '^total:' \
            | awk '{file=$1; sub(/:[0-9][0-9.]*:.*/, "", file); pct=$NF; gsub(/%/,"",pct); s[file]+=pct; c[file]++}
                   END {for (f in s) printf "%6.1f%%  %s\n", s[f]/c[f], f}' \
            | sort -n
      - if: needs.changes.outputs.platform == 'true'
        name: Check coverage thresholds
        # Enforces two gates from #1823 Layer 1:
        #   1. Total floor (25% — ratchet plan in COVERAGE_FLOOR.md).
        #   2. Per-file floor — non-test .go files in security-critical
        #      paths with coverage <10% fail the build, UNLESS the file
        #      path is listed in .coverage-allowlist.txt (acknowledged
        #      historical debt with a tracking issue + expiry).
        run: |
          set -e
          TOTAL_FLOOR=25
          # Security-critical paths where a 0%-coverage file is a real risk.
          CRITICAL_PATHS=(
            "internal/handlers/tokens"
            "internal/handlers/workspace_provision"
            "internal/handlers/a2a_proxy"
            "internal/handlers/registry"
            "internal/handlers/secrets"
            "internal/middleware/wsauth"
            "internal/crypto"
          )
          TOTAL=$(go tool cover -func=coverage.out | grep '^total:' | awk '{print $3}' | sed 's/%//')
          echo "Total coverage: ${TOTAL}%"
          if awk "BEGIN{exit !($TOTAL < $TOTAL_FLOOR)}"; then
            echo "::error::Total coverage ${TOTAL}% is below the ${TOTAL_FLOOR}% floor. See COVERAGE_FLOOR.md for ratchet plan."
            exit 1
          fi
          # Aggregate per-file coverage → /tmp/perfile.txt: "<fullpath> <pct>"
          go tool cover -func=coverage.out \
            | grep -v '^total:' \
            | awk '{file=$1; sub(/:[0-9][0-9.]*:.*/, "", file); pct=$NF; gsub(/%/,"",pct); s[file]+=pct; c[file]++}
                   END {for (f in s) printf "%s %.1f\n", f, s[f]/c[f]}' \
            > /tmp/perfile.txt
          # Build allowlist — paths relative to workspace-server, one per line.
          # Lines starting with # are comments.
          ALLOWLIST=""
          if [ -f ../.coverage-allowlist.txt ]; then
            ALLOWLIST=$(grep -vE '^(#|[[:space:]]*$)' ../.coverage-allowlist.txt || true)
          fi
          FAILED=0
          WARNED=0
          for path in "${CRITICAL_PATHS[@]}"; do
            while read -r file pct; do
              [[ "$file" == *_test.go ]] && continue
              [[ "$file" == *"$path"* ]] || continue
              awk "BEGIN{exit !($pct < 10)}" || continue
              # Strip the package-import prefix so we can match .coverage-allowlist.txt
              # entries written as paths relative to workspace-server/.
              # Handle both module paths: platform/workspace-server/... and platform/...
              rel=$(echo "$file" | sed 's|^github.com/molecule-ai/molecule-monorepo/platform/workspace-server/||; s|^github.com/molecule-ai/molecule-monorepo/platform/||')
              if echo "$ALLOWLIST" | grep -qxF "$rel"; then
                echo "::warning file=workspace-server/$rel::Critical file at ${pct}% coverage (allowlisted, #1823) — fix before expiry."
                WARNED=$((WARNED+1))
              else
                echo "::error file=workspace-server/$rel::Critical file at ${pct}% coverage — must be >=10% (target 80%). See #1823. To acknowledge as known debt, add this path to .coverage-allowlist.txt."
                FAILED=$((FAILED+1))
              fi
            done < /tmp/perfile.txt
          done
          echo ""
          echo "Critical-path check: $FAILED new failures, $WARNED allowlisted warnings."
          if [ "$FAILED" -gt 0 ]; then
            echo ""
            echo "$FAILED security-critical file(s) have <10% test coverage and are"
            echo "NOT in the allowlist. These paths handle auth, tokens, secrets, or"
            echo "workspace provisioning — a 0% file here is the exact gap that let"
            echo "CWE-22, CWE-78, KI-005 slip through in past incidents. Either:"
            echo "  (a) add tests to raise coverage above 10%, or"
            echo "  (b) add the path to .coverage-allowlist.txt with an expiry date"
            echo "      and a tracking issue reference."
            exit 1
          fi
  # Canvas (Next.js) — required check, always runs. Same always-run +
  # per-step gating shape as platform-build. The two-job-sharing-name
  # pattern attempted in PR #2321 doesn't satisfy branch protection
  # (SKIPPED siblings count as not-passed regardless of SUCCESS
  # siblings — verified empirically on PR #2314).
  canvas-build:
    name: Canvas (Next.js)
    needs: changes
    runs-on: ubuntu-latest
    continue-on-error: true
    defaults:
      run:
        working-directory: canvas
    steps:
      - if: needs.changes.outputs.canvas != 'true'
        working-directory: .
        run: echo "No canvas/** changes — skipping real build steps; this job always runs to satisfy the required-check name on branch protection."
      - if: needs.changes.outputs.canvas == 'true'
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - if: needs.changes.outputs.canvas == 'true'
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: '22'
      - if: needs.changes.outputs.canvas == 'true'
        run: rm -f package-lock.json && npm install
      - if: needs.changes.outputs.canvas == 'true'
        run: npm run build
      - if: needs.changes.outputs.canvas == 'true'
        name: Run tests with coverage
        # Coverage instrumentation is configured in canvas/vitest.config.ts
        # (provider: v8, reporters: text + html + json-summary). Step 2 of
        # #1815 — wires coverage into CI so we get a baseline visible on
        # every PR. No threshold gate yet; thresholds dial in (Step 3, also
        # tracked in #1815) after the team sees what current coverage is.
        run: npx vitest run --coverage
      - name: Upload coverage summary as artifact
        if: needs.changes.outputs.canvas == 'true' && always()
        # Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
        # the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
        # implement, surfacing as `GHESNotSupportedError: @actions/artifact
        # v2.0.0+, upload-artifact@v4+ and download-artifact@v4+ are not
        # currently supported on GHES`. Drop this pin when Gitea ships
        # the v4 protocol (tracked: post-Gitea-1.23 followup).
        uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
        with:
          name: canvas-coverage-${{ github.run_id }}
          path: canvas/coverage/
          retention-days: 7
          if-no-files-found: warn
  # Shellcheck (E2E scripts) — required check, always runs.
  shellcheck:
    name: Shellcheck (E2E scripts)
    needs: changes
    runs-on: ubuntu-latest
    continue-on-error: true
    steps:
      - if: needs.changes.outputs.scripts != 'true'
        run: echo "No tests/e2e/ or infra/scripts/ changes — skipping real shellcheck; this job always runs to satisfy the required-check name on branch protection."
      - if: needs.changes.outputs.scripts == 'true'
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - if: needs.changes.outputs.scripts == 'true'
        name: Run shellcheck on tests/e2e/*.sh and infra/scripts/*.sh
        # shellcheck is pre-installed on ubuntu-latest runners (via apt).
        # infra/scripts/ is included because setup.sh + nuke.sh gate the
        # README quickstart — a shellcheck regression there silently breaks
        # new-user onboarding. scripts/ is intentionally excluded until its
        # pre-existing SC3040/SC3043 warnings are cleaned up.
        run: |
          find tests/e2e infra/scripts -type f -name '*.sh' -print0 \
            | xargs -0 shellcheck --severity=warning
      - if: needs.changes.outputs.scripts == 'true'
        name: Lint cleanup-trap hygiene (RFC #2873)
        run: bash tests/e2e/lint_cleanup_traps.sh
      - if: needs.changes.outputs.scripts == 'true'
        name: Run E2E bash unit tests (no live infra)
        run: |
          bash tests/e2e/test_model_slug.sh
  canvas-deploy-reminder:
    name: Canvas Deploy Reminder
    runs-on: ubuntu-latest
    continue-on-error: true
    needs: [changes, canvas-build]
    # Only fires on direct pushes to main (i.e. after staging→main promotion).
    if: needs.changes.outputs.canvas == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main'
    steps:
      - name: Write deploy reminder to step summary
        env:
          COMMIT_SHA: ${{ github.sha }}
          # github.server_url resolves via the workflow-level env override
          # to the Gitea instance, so the RUN_URL points at the Gitea run
          # page (not github.com). See feedback_act_runner_github_server_url.
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: |
          # Write body to a temp file — avoids backtick escaping in shell.
          cat > /tmp/deploy-reminder.md << 'BODY'
          ## Canvas build passed — deploy required
          The `publish-canvas-image` workflow is now building a fresh Docker image
          (`ghcr.io/molecule-ai/canvas:latest`) in the background.
          Once it completes (~3–5 min), apply on the host machine with:
          ```bash
          cd <runner-workspace>
          git pull origin main
          docker compose pull canvas && docker compose up -d canvas
          ```
          If you need to rebuild from local source instead (e.g. testing unreleased
          changes or a new `NEXT_PUBLIC_*` URL), use:
          ```bash
          docker compose build canvas && docker compose up -d canvas
          ```
          BODY
          printf '\n> Posted automatically by CI · commit `%s` · [build log](%s)\n' \
            "$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-reminder.md
          # Gitea has no commit-comments API; write to GITHUB_STEP_SUMMARY,
          # which both GitHub Actions and Gitea Actions render as the
          # workflow run's summary page. (#75 / PR-D)
          cat /tmp/deploy-reminder.md >> "$GITHUB_STEP_SUMMARY"
  # Python Lint & Test — required check, always runs.
  python-lint:
    name: Python Lint & Test
    needs: changes
    runs-on: ubuntu-latest
    continue-on-error: true
    env:
      WORKSPACE_ID: test
    defaults:
      run:
        working-directory: workspace
    steps:
      - if: needs.changes.outputs.python != 'true'
        working-directory: .
        run: echo "No workspace/** changes — skipping real lint+test; this job always runs to satisfy the required-check name on branch protection."
      - if: needs.changes.outputs.python == 'true'
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - if: needs.changes.outputs.python == 'true'
        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: '3.11'
          cache: pip
          cache-dependency-path: workspace/requirements.txt
      - if: needs.changes.outputs.python == 'true'
        run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov sqlalchemy>=2.0.0
      # Coverage flags + fail-under floor moved into workspace/pytest.ini
      # (issue #1817) so local `pytest` and CI use identical config.
      - if: needs.changes.outputs.python == 'true'
        run: python -m pytest --tb=short
      - if: needs.changes.outputs.python == 'true'
        name: Per-file critical-path coverage (MCP / inbox / auth)
        # MCP-critical Python files have a per-file floor on top of the
        # 86% total floor in pytest.ini. See issue #2790 for full rationale.
        run: |
          set -e
          PER_FILE_FLOOR=75
          CRITICAL_FILES=(
            "a2a_mcp_server.py"
            "mcp_cli.py"
            "a2a_tools.py"
            "a2a_tools_inbox.py"
            "inbox.py"
            "platform_auth.py"
          )
          # pytest already wrote .coverage; emit a JSON view scoped to
          # the critical files so jq/python can read the per-file pct
          # without parsing tabular text.
          INCLUDES=$(printf '*%s,' "${CRITICAL_FILES[@]}")
          INCLUDES="${INCLUDES%,}"
          python -m coverage json -o /tmp/critical-cov.json --include="$INCLUDES"
          FAILED=0
          for f in "${CRITICAL_FILES[@]}"; do
            pct=$(jq -r --arg f "$f" '.files | to_entries | map(select(.key == $f)) | .[0].value.summary.percent_covered // "MISSING"' /tmp/critical-cov.json)
            if [ "$pct" = "MISSING" ]; then
              echo "::error file=workspace/$f::No coverage data — file may have moved or test exclusion mis-set."
              FAILED=$((FAILED+1))
              continue
            fi
            echo "$f: ${pct}%"
            if awk "BEGIN{exit !($pct < $PER_FILE_FLOOR)}"; then
              echo "::error file=workspace/$f::${pct}% < ${PER_FILE_FLOOR}% per-file floor (MCP critical path). See COVERAGE_FLOOR.md."
              FAILED=$((FAILED+1))
            fi
          done
          if [ "$FAILED" -gt 0 ]; then
            echo ""
            echo "$FAILED MCP critical-path file(s) below the ${PER_FILE_FLOOR}% per-file floor."
            echo "These paths handle multi-tenant routing, auth tokens, and inbox dispatch."
            echo "A coverage drop here is the same risk shape as Go-side tokens/secrets files"
            echo "dropping below 10% (see COVERAGE_FLOOR.md). Either:"
            echo "  (a) add tests to raise coverage back above ${PER_FILE_FLOOR}%, or"
            echo "  (b) if this is unavoidable historical debt, file an issue and propose"
            echo "      adjusting the floor with rationale in COVERAGE_FLOOR.md."
            exit 1
          fi
--- a/.gitea/workflows/continuous-synth-e2e.yml
+++ b/.gitea/workflows/continuous-synth-e2e.yml
@ -0,0 +1,255 @@
 name: Continuous synthetic E2E (staging)
 # Ported from .github/workflows/continuous-synth-e2e.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #
 # Hard gate (#2342): cron-driven full-lifecycle E2E that catches
 # regressions visible only at runtime — schema drift, deployment-pipeline
 # gaps, vendor outages, env-var rotations, DNS / CF / Railway side-effects.
 #
 # Why this gate exists:
 #   PR-time CI catches code-level regressions but not deployment-time or
 #   integration-time ones. Today's empirical data:
 #     • #2345 (A2A v0.2 silent drop) — passed all unit tests, broke at
 #       JSON-RPC parse layer between sender and receiver. Visible only
 #       to a sender exercising the full path.
 #     • RFC #2312 chat upload — landed on staging-branch but never
 #       reached staging tenants because publish-workspace-server-image
 #       was main-only. Caught by manual dogfooding hours after deploy.
 #   Both would have surfaced within 15-20 min of regression if a
 #   continuous synth-E2E was running.
 #
 # Cadence: every 20 min (3x/hour). The script is conservatively
 # bounded at 10 min wall-clock; even on degraded staging it should
 # finish before the next firing. cron-overlap is guarded by the
 # concurrency group below.
 #
 # Cost: ~3 runs/hour × 5-10 min × $0.008/min GHA = ~$0.50-$1/day.
 # Plus a fresh tenant provisioned + torn down each run (Railway +
 # AWS pennies). Negligible.
 #
 # Failure handling: when the run fails, the workflow exits non-zero
 # and GitHub's standard email/notification path fires. Operators
 # can subscribe to this workflow's failure channel for paging-grade
 # alerting.
 on:
  schedule:
    # Every 10 minutes, on :02 :12 :22 :32 :42 :52. Three constraints:
    #   1. Stay off the top-of-hour. GitHub Actions scheduler drops
    #      :00 firings under high load (own docs:
    #      https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule).
    #      Prior history: cron was '0,20,40' (2026-05-02) — only :00
    #      ever survived. Bumped to '10,30,50' (2026-05-03) on the
    #      theory that further-from-:00 wins. Empirically 2026-05-04
    #      that ALSO dropped to ~60 min effective cadence (only ~1
    #      schedule fire per hour — see molecule-core#2726). Detection
    #      latency was claimed 20 min, actual 60 min.
    #   2. Avoid colliding with the existing :15 sweep-cf-orphans
    #      and :45 sweep-cf-tunnels — both hit the CF API and we
    #      don't want to fight for rate-limit tokens.
    #   3. Avoid the :30 heavy slot (staging-smoke /30, sweep-aws-
    #      secrets, sweep-stale-e2e-orgs every :15) — multiple
    #      overlapping cron registrations on the same minute is part
    #      of what GH drops under load.
    # Solution: bump fires-per-hour 3 → 6 AND keep all slots in clean
    # lanes (1-3 min away from any other cron). Even with empirically-
    # observed ~67% GH drop ratio, 6 attempts/hour yields ~2 effective
    # fires = ~30 min cadence; closer to the 20-min target than the
    # current shape and provides a real degradation alarm if drops
    # get worse.
    - cron: '2,12,22,32,42,52 * * * *'
 permissions:
  contents: read
  # No issue-write here — failures surface as red runs in the workflow
  # history. If you want auto-issue-on-fail, add a follow-up step that
  # uses gh issue create gated on `if: failure()`. Keeping the surface
  # minimal until that's actually wanted.
 # Serialize so two firings can never overlap. Cron firing every 20 min
 # but scripts conservatively bounded at 10 min — overlap shouldn't
 # happen in steady state, but if a run hangs we don't want N more
 # stacking up.
 concurrency:
  group: continuous-synth-e2e
  cancel-in-progress: false
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  synth:
    name: Synthetic E2E against staging
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    # Bumped from 12 → 20 (2026-05-04). Tenant user-data install phase
    # (apt-get update + install docker.io/jq/awscli/caddy + snap install
    # ssm-agent) runs from raw Ubuntu on every boot — none of it is
    # pre-baked into the tenant AMI. Empirical fetch_secrets/ok timing
    # across today's canaries: 51s → 82s → 143s → 625s. apt-mirror tail
    # latency drives the boot-to-fetch_secrets phase from ~1min to >10min.
    # A 12min budget leaves only ~2min for the workspace (which needs
    # ~3.5min for claude-code cold boot) on slow-apt days, blowing the
    # budget. 20min absorbs the worst tenant tail so the workspace probe
    # gets the full ~7min it needs even on a slow apt day. Real fix:
    # pre-bake caddy + ssm-agent into the tenant AMI (controlplane#TBD).
    timeout-minutes: 20
    env:
      # claude-code default: cold-start ~5 min (comparable to langgraph),
      # but uses MiniMax-M2.7-highspeed via the template's third-party-
      # Anthropic-compat path (workspace-configs-templates/claude-code-
      # default/config.yaml:64-69). MiniMax is ~5-10x cheaper than
      # gpt-4.1-mini per token AND avoids the recurring OpenAI quota-
      # exhaustion class that took the canary down 2026-05-03 (#265).
      # Operators can pick langgraph / hermes via workflow_dispatch
      # when they specifically need to exercise the OpenAI or SDK-
      # native paths.
      E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }}
      # Pin the canary to a specific MiniMax model rather than relying
      # on the per-runtime default ("sonnet" → routes to direct
      # Anthropic, defeats the cost saving). Operators can override
      # via workflow_dispatch by setting a different E2E_MODEL_SLUG
      # input if they need to exercise a specific model. M2.7-highspeed
      # is "Token Plan only" but cheap-per-token and fast.
      E2E_MODEL_SLUG: ${{ github.event.inputs.model_slug || 'MiniMax-M2.7-highspeed' }}
      # Bound to 10 min so a stuck provision fails the run instead of
      # holding up the next cron firing. 15-min default in the script
      # is for the on-PR full lifecycle where we have more headroom.
      E2E_PROVISION_TIMEOUT_SECS: '600'
      # Slug suffix — namespaced "synth-" so these runs are
      # distinguishable from PR-driven runs in CP admin.
      E2E_RUN_ID: synth-${{ github.run_id }}
      # Forced false for cron; respected for manual dispatch
      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '' }}
      MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      # MiniMax key is the canary's PRIMARY auth path. claude-code
      # template's `minimax` provider routes ANTHROPIC_BASE_URL to
      # api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot.
      # tests/e2e/test_staging_full_saas.sh branches SECRETS_JSON on
      # which key is present — MiniMax wins when set.
      E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
      # Direct-Anthropic alternative for operators who don't want to
      # set up a MiniMax account (priority below MiniMax — first
      # non-empty wins in test_staging_full_saas.sh's secrets-injection
      # block). See #2578 PR comment for the rationale.
      E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
      # OpenAI fallback — kept wired so operators can dispatch with
      # E2E_RUNTIME=langgraph or =hermes and still have a working
      # canary path. The script picks the right blob shape based on
      # which key is non-empty.
      E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Verify required secrets present
        run: |
          # Hard-fail on missing secret REGARDLESS of trigger. Previously
          # this step soft-skipped on workflow_dispatch via `exit 0`, but
          # `exit 0` only ends the STEP — subsequent steps still ran with
          # the empty secret, the synth script fell through to the wrong
          # SECRETS_JSON branch, and the canary failed 5 min later with a
          # confusing "Agent error (Exception)" instead of the clean
          # "secret missing" message at the top. Caught 2026-05-04 by
          # dispatched run 25296530706: claude-code + missing MINIMAX
          # silently used OpenAI keys but kept model=MiniMax-M2.7, then
          # the workspace 401'd against MiniMax once it tried to call.
          # Fix: exit 1 in both cron and dispatch paths. Operators who
          # want to verify a YAML change without setting up the secret
          # can read the verify-secrets step's stderr — the failure is
          # itself the verification signal.
          if [ -z "${MOLECULE_ADMIN_TOKEN:-}" ]; then
            echo "::error::CP_STAGING_ADMIN_API_TOKEN secret missing — synth E2E cannot run"
            echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
            exit 1
          fi
          # LLM-key requirement is per-runtime: claude-code accepts
          # EITHER MiniMax OR direct-Anthropic (whichever is set first),
          # langgraph + hermes use OpenAI (MOLECULE_STAGING_OPENAI_API_KEY).
          case "${E2E_RUNTIME}" in
            claude-code)
              if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
                required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
                required_secret_value="${E2E_MINIMAX_API_KEY}"
              elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
                required_secret_name="MOLECULE_STAGING_ANTHROPIC_API_KEY"
                required_secret_value="${E2E_ANTHROPIC_API_KEY}"
              else
                required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY or MOLECULE_STAGING_ANTHROPIC_API_KEY"
                required_secret_value=""
              fi
              ;;
            langgraph|hermes)
              required_secret_name="MOLECULE_STAGING_OPENAI_API_KEY"
              required_secret_value="${E2E_OPENAI_API_KEY:-}"
              ;;
            *)
              echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
              required_secret_name=""
              required_secret_value="present"
              ;;
          esac
          if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
            echo "::error::${required_secret_name} secret missing — runtime=${E2E_RUNTIME} cannot authenticate against its LLM provider"
            echo "::error::Set it at Settings → Secrets and Variables → Actions, OR dispatch with a different runtime"
            exit 1
          fi
      - name: Install required tools
        run: |
          # The script depends on jq + curl (already on ubuntu-latest)
          # and python3 (likewise). Verify they're all present so we
          # fail fast on a runner image regression rather than mid-script.
          for cmd in jq curl python3; do
            command -v "$cmd" >/dev/null 2>&1 || {
              echo "::error::required tool '$cmd' not on PATH — runner image regression?"
              exit 1
            }
          done
      - name: Run synthetic E2E
        # The script handles its own teardown via EXIT trap; even on
        # failure (timeout, assertion), the org is deprovisioned and
        # leaks are reported. Exit code propagates from the script.
        run: |
          bash tests/e2e/test_staging_full_saas.sh
      - name: Failure summary
        # Runs only on failure. Adds a job summary so the workflow run
        # page shows a quick "what happened" instead of forcing readers
        # to scroll through script output.
        if: failure()
        run: |
          {
            echo "## Continuous synth E2E failed"
            echo ""
            echo "**Run ID:** ${{ github.run_id }}"
            echo "**Trigger:** ${{ github.event_name }}"
            echo "**Runtime:** ${E2E_RUNTIME}"
            echo "**Slug:** synth-${{ github.run_id }}"
            echo ""
            echo "### What this means"
            echo ""
            echo "Staging just regressed on a path that previously worked. Likely classes:"
            echo "- Schema mismatch between sender and receiver (#2345 class)"
            echo "- Deployment-pipeline gap (RFC #2312 / staging-tenant-image-stale class)"
            echo "- Vendor outage (Cloudflare, Railway, AWS, GHCR)"
            echo "- Staging-CP env var rotation"
            echo ""
            echo "### Next steps"
            echo ""
            echo "1. Check the script output above for the assertion that failed"
            echo "2. If it's a vendor outage, no action needed — next firing in ~20 min"
            echo "3. If it's a code regression, find the causing PR via \`git log\` against last green run and revert/fix"
            echo "4. Keep an eye on the next 1-2 firings — flake vs persistent fail differs in priority"
          } >> "$GITHUB_STEP_SUMMARY"
--- a/.gitea/workflows/e2e-api.yml
+++ b/.gitea/workflows/e2e-api.yml
@ -0,0 +1,333 @@
 name: E2E API Smoke Test
 # Ported from .github/workflows/e2e-api.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #
 # Extracted from ci.yml so workflow-level concurrency can protect this job
 # from run-level cancellation (issue #458).
 #
 # Trigger model (revised 2026-04-29):
 #
 # Always FIRES on push/pull_request to staging+main. Real work is gated
 # per-step on `needs.detect-changes.outputs.api` — when paths under
 # `workspace-server/`, `tests/e2e/`, or this workflow file haven't
 # changed, the no-op step alone runs and emits SUCCESS for the
 # `E2E API Smoke Test` check, satisfying branch protection without
 # spending CI cycles. See the in-job comment on the `e2e-api` job for
 # why this is one job (not two-jobs-sharing-name) and the 2026-04-29
 # PR #2264 incident that drove the consolidation.
 #
 # Parallel-safety (Class B Hongming-owned CICD red sweep, 2026-05-08)
 # -------------------------------------------------------------------
 # Same substrate hazard as PR #98 (handlers-postgres-integration). Our
 # Gitea act_runner runs with `container.network: host` (operator host
 # `/opt/molecule/runners/config.yaml`), which means:
 #
 #   * Two concurrent runs both try to bind their `-p 15432:5432` /
 #     `-p 16379:6379` host ports — the second postgres/redis FATALs
 #     with `Address in use` and `docker run` returns exit 125 with
 #     `Conflict. The container name "/molecule-ci-postgres" is already
 #     in use by container ...`. Verified in run a7/2727 on 2026-05-07.
 #   * The fixed container names `molecule-ci-postgres` / `-redis` (the
 #     pre-fix shape) collide on name AS WELL AS port. The cleanup-with-
 #     `docker rm -f` at the start of the second job KILLS the first
 #     job's still-running postgres/redis.
 #
 # Fix shape (mirrors PR #98's bridge-net pattern, adapted because
 # platform-server is a Go binary on the host, not a containerised
 # step):
 #
 #   1. Unique container names per run:
 #         pg-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
 #         redis-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
 #      `${RUN_ID}-${RUN_ATTEMPT}` is unique even across reruns of the
 #      same run_id.
 #   2. Ephemeral host port per run (`-p 0:5432`), then read the actual
 #      bound port via `docker port` and export DATABASE_URL/REDIS_URL
 #      pointing at it. No fixed host-port → no port collision.
 #   3. `127.0.0.1` (NOT `localhost`) in URLs — IPv6 first-resolve was
 #      the original flake fixed in #92 and the script's still IPv6-
 #      enabled.
 #   4. `if: always()` cleanup so containers don't leak when test steps
 #      fail.
 #
 # Issue #94 items #2 + #3 (also fixed here):
 #   * Pre-pull `alpine:latest` so the platform-server's provisioner
 #     (`internal/handlers/container_files.go`) can stand up its
 #     ephemeral token-write helper without a daemon.io round-trip.
 #   * Create `molecule-core-net` bridge network if missing so the
 #     provisioner's container.HostConfig {NetworkMode: ...} attach
 #     succeeds.
 # Item #1 (timeouts) — evidence on recent runs (77/3191, ae/4270, 0e/
 # 2318) shows Postgres ready in 3s, Redis in 1s, Platform in 1s when
 # they DO come up. Timeouts are not the bottleneck; not bumped.
 #
 # Item explicitly NOT fixed here: failing test `Status back online`
 # fails because the platform's langgraph workspace template image
 # (ghcr.io/molecule-ai/workspace-template-langgraph:latest) returns
 # 403 Forbidden post-2026-05-06 GitHub org suspension. That is a
 # template-registry resolution issue (ADR-002 / local-build mode) and
 # belongs in a separate change that touches workspace-server, not
 # this workflow file.
 on:
  push:
    branches: [main, staging]
  pull_request:
    branches: [main, staging]
 concurrency:
  # Per-SHA grouping (changed 2026-04-28 from per-ref). Per-ref had the
  # same auto-promote-staging brittleness as e2e-staging-canvas — back-
  # to-back staging pushes share refs/heads/staging, so the older push's
  # queued run gets cancelled when a newer push lands. Auto-promote-
  # staging then sees `completed/cancelled` for the older SHA and stays
  # put; the newer SHA's gates may eventually save the day, but if the
  # newer push gets cancelled too, we deadlock.
  #
  # See e2e-staging-canvas.yml's identical concurrency block for the full
  # rationale and the 2026-04-28 incident reference.
  group: e2e-api-${{ github.event.pull_request.head.sha || github.sha }}
  cancel-in-progress: false
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    outputs:
      api: ${{ steps.decide.outputs.api }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
      - id: decide
        # Inline replacement for dorny/paths-filter — same pattern PR#372's
        # ci.yml port used. Diffs against the PR base or push BEFORE SHA,
        # then matches against the api-relevant path set.
        run: |
          BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
          if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
            BASE="${{ github.event.pull_request.base.sha }}"
          fi
          if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
            echo "api=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          if ! git cat-file -e "$BASE" 2>/dev/null; then
            git fetch --depth=1 origin "$BASE" 2>/dev/null || true
          fi
          if ! git cat-file -e "$BASE" 2>/dev/null; then
            echo "api=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          CHANGED=$(git diff --name-only "$BASE" HEAD)
          if echo "$CHANGED" | grep -qE '^(workspace-server/|tests/e2e/|\.gitea/workflows/e2e-api\.yml$)'; then
            echo "api=true" >> "$GITHUB_OUTPUT"
          else
            echo "api=false" >> "$GITHUB_OUTPUT"
          fi
  # ONE job (no job-level `if:`) that always runs and reports under the
  # required-check name `E2E API Smoke Test`. Real work is gated per-step
  # on `needs.detect-changes.outputs.api`. Reason: GitHub registers a
  # check run for every job that matches `name:`, and a job-level
  # `if: false` produces a SKIPPED check run. Branch protection treats
  # all check runs with a matching context name on the latest commit as a
  # SET — any SKIPPED in the set fails the required-check eval, even with
  # SUCCESS siblings. Verified 2026-04-29 on PR #2264 (staging→main):
  # 4 check runs (2 SKIPPED + 2 SUCCESS) at the head SHA blocked
  # promotion despite all real work succeeding. Collapsing to a single
  # always-running job with conditional steps emits exactly one SUCCESS
  # check run regardless of paths filter — branch-protection-clean.
  e2e-api:
    needs: detect-changes
    name: E2E API Smoke Test
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    timeout-minutes: 15
    env:
      # Unique per-run container names so concurrent runs on the host-
      # network act_runner don't collide on name OR port.
      # `${RUN_ID}-${RUN_ATTEMPT}` stays unique across reruns of the
      # same run_id. PORT is set later (after docker port lookup) since
      # we let Docker assign an ephemeral host port.
      PG_CONTAINER: pg-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
      REDIS_CONTAINER: redis-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
      PORT: "8080"
    steps:
      - name: No-op pass (paths filter excluded this commit)
        if: needs.detect-changes.outputs.api != 'true'
        run: |
          echo "No workspace-server / tests/e2e / workflow changes — E2E API gate satisfied without running tests."
          echo "::notice::E2E API Smoke Test no-op pass (paths filter excluded this commit)."
      - if: needs.detect-changes.outputs.api == 'true'
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - if: needs.detect-changes.outputs.api == 'true'
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
        with:
          go-version: 'stable'
          cache: true
          cache-dependency-path: workspace-server/go.sum
      - name: Pre-pull alpine + ensure provisioner network (Issue #94 items #2 + #3)
        if: needs.detect-changes.outputs.api == 'true'
        run: |
          # Provisioner uses alpine:latest for ephemeral token-write
          # containers (workspace-server/internal/handlers/container_files.go).
          # Pre-pull so the first provision in test_api.sh doesn't race
          # the daemon's pull cache. Idempotent — `docker pull` is a no-op
          # when the image is already present.
          docker pull alpine:latest >/dev/null
          # Provisioner attaches workspace containers to
          # molecule-core-net (workspace-server/internal/provisioner/
          # provisioner.go::DefaultNetwork). The bridge already exists on
          # the operator host's docker daemon — `network create` is
          # idempotent via `|| true`.
          docker network create molecule-core-net >/dev/null 2>&1 || true
          echo "alpine:latest pre-pulled; molecule-core-net ensured."
      - name: Start Postgres (docker)
        if: needs.detect-changes.outputs.api == 'true'
        run: |
          # Defensive cleanup — only matches THIS run's container name,
          # so it cannot kill a sibling run's postgres. (Pre-fix the
          # name was static and this rm hit other runs' containers.)
          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
          # `-p 0:5432` requests an ephemeral host port; we read it back
          # below and export DATABASE_URL.
          docker run -d --name "$PG_CONTAINER" \
            -e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule \
            -p 0:5432 postgres:16 >/dev/null
          # Resolve the host-side port assignment. `docker port` prints
          # `0.0.0.0:NNNN` (and on host-net runners may also print an
          # IPv6 line — take the first IPv4 line).
          PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
          if [ -z "$PG_PORT" ]; then
            # Fallback: any first line. Some Docker versions print only
            # one line.
            PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | head -1 | awk -F: '{print $NF}')
          fi
          if [ -z "$PG_PORT" ]; then
            echo "::error::Could not resolve host port for $PG_CONTAINER"
            docker port "$PG_CONTAINER" 5432/tcp || true
            docker logs "$PG_CONTAINER" || true
            exit 1
          fi
          # 127.0.0.1 (NOT localhost) — IPv6 first-resolve flake (#92).
          echo "PG_PORT=${PG_PORT}" >> "$GITHUB_ENV"
          echo "DATABASE_URL=postgres://dev:dev@127.0.0.1:${PG_PORT}/molecule?sslmode=disable" >> "$GITHUB_ENV"
          echo "Postgres host port: ${PG_PORT}"
          for i in $(seq 1 30); do
            if docker exec "$PG_CONTAINER" pg_isready -U dev >/dev/null 2>&1; then
              echo "Postgres ready after ${i}s"
              exit 0
            fi
            sleep 1
          done
          echo "::error::Postgres did not become ready in 30s"
          docker logs "$PG_CONTAINER" || true
          exit 1
      - name: Start Redis (docker)
        if: needs.detect-changes.outputs.api == 'true'
        run: |
          docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
          docker run -d --name "$REDIS_CONTAINER" -p 0:6379 redis:7 >/dev/null
          REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
          if [ -z "$REDIS_PORT" ]; then
            REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | head -1 | awk -F: '{print $NF}')
          fi
          if [ -z "$REDIS_PORT" ]; then
            echo "::error::Could not resolve host port for $REDIS_CONTAINER"
            docker port "$REDIS_CONTAINER" 6379/tcp || true
            docker logs "$REDIS_CONTAINER" || true
            exit 1
          fi
          echo "REDIS_PORT=${REDIS_PORT}" >> "$GITHUB_ENV"
          echo "REDIS_URL=redis://127.0.0.1:${REDIS_PORT}" >> "$GITHUB_ENV"
          echo "Redis host port: ${REDIS_PORT}"
          for i in $(seq 1 15); do
            if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then
              echo "Redis ready after ${i}s"
              exit 0
            fi
            sleep 1
          done
          echo "::error::Redis did not become ready in 15s"
          docker logs "$REDIS_CONTAINER" || true
          exit 1
      - name: Build platform
        if: needs.detect-changes.outputs.api == 'true'
        working-directory: workspace-server
        run: go build -o platform-server ./cmd/server
      - name: Start platform (background)
        if: needs.detect-changes.outputs.api == 'true'
        working-directory: workspace-server
        run: |
          # DATABASE_URL + REDIS_URL exported by the start-postgres /
          # start-redis steps point at this run's per-run host ports.
          ./platform-server > platform.log 2>&1 &
          echo $! > platform.pid
      - name: Wait for /health
        if: needs.detect-changes.outputs.api == 'true'
        run: |
          for i in $(seq 1 30); do
            if curl -sf http://127.0.0.1:8080/health > /dev/null; then
              echo "Platform up after ${i}s"
              exit 0
            fi
            sleep 1
          done
          echo "::error::Platform did not become healthy in 30s"
          cat workspace-server/platform.log || true
          exit 1
      - name: Assert migrations applied
        if: needs.detect-changes.outputs.api == 'true'
        run: |
          tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc "SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'")
          if [ "$tables" != "1" ]; then
            echo "::error::Migrations did not apply"
            cat workspace-server/platform.log || true
            exit 1
          fi
          echo "Migrations OK"
      - name: Run E2E API tests
        if: needs.detect-changes.outputs.api == 'true'
        run: bash tests/e2e/test_api.sh
      - name: Run notify-with-attachments E2E
        if: needs.detect-changes.outputs.api == 'true'
        run: bash tests/e2e/test_notify_attachments_e2e.sh
      - name: Run priority-runtimes E2E (claude-code + hermes — skips when keys absent)
        if: needs.detect-changes.outputs.api == 'true'
        run: bash tests/e2e/test_priority_runtimes_e2e.sh
      - name: Run poll-mode + since_id cursor E2E (#2339)
        if: needs.detect-changes.outputs.api == 'true'
        run: bash tests/e2e/test_poll_mode_e2e.sh
      - name: Run poll-mode chat upload E2E (RFC #2891)
        if: needs.detect-changes.outputs.api == 'true'
        run: bash tests/e2e/test_poll_mode_chat_upload_e2e.sh
      - name: Dump platform log on failure
        if: failure() && needs.detect-changes.outputs.api == 'true'
        run: cat workspace-server/platform.log || true
      - name: Stop platform
        if: always() && needs.detect-changes.outputs.api == 'true'
        run: |
          if [ -f workspace-server/platform.pid ]; then
            kill "$(cat workspace-server/platform.pid)" 2>/dev/null || true
          fi
      - name: Stop service containers
        # always() so containers don't leak when test steps fail. The
        # cleanup is best-effort: if the container is already gone
        # (e.g. concurrent rerun race), don't fail the job.
        if: always() && needs.detect-changes.outputs.api == 'true'
        run: |
          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
          docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
--- a/.gitea/workflows/e2e-staging-canvas.yml
+++ b/.gitea/workflows/e2e-staging-canvas.yml
@ -0,0 +1,250 @@
 name: E2E Staging Canvas (Playwright)
 # Ported from .github/workflows/e2e-staging-canvas.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #
 # Playwright test suite that provisions a fresh staging org per run and
 # verifies every workspace-panel tab renders without crashing. Complements
 # e2e-staging-saas.yml (which tests the API shape) by exercising the
 # actual browser + canvas bundle against live staging.
 #
 # Triggers: push to main/staging or PR touching canvas sources + this workflow,
 # manual dispatch, and weekly cron to catch browser/runtime drift even
 # when canvas is quiet.
 # Added staging to push/pull_request branches so the auto-promote gate
 # check (--event push --branch staging) can see a completed run for this
 # workflow — mirrors what PR #1891 does for e2e-api.yml.
 on:
  # Trigger model (revised 2026-04-29):
  #
  # Always fires on push/pull_request; real work is gated per-step on
  # `needs.detect-changes.outputs.canvas`. When canvas/ paths haven't
  # changed, the no-op step alone runs and emits SUCCESS for the
  # `Canvas tabs E2E` check, satisfying branch protection without
  # spending CI cycles. See e2e-api.yml for the rationale on why this
  # is a single job rather than two-jobs-sharing-name.
  push:
    branches: [main]
  pull_request:
    branches: [main]
  schedule:
    # Weekly on Sunday 08:00 UTC — catches Chrome / Playwright / Next.js
    # release-note-shaped regressions that don't ride in with a PR.
    - cron: '0 8 * * 0'
 concurrency:
  # Per-SHA grouping (changed 2026-04-28 from a single global group). The
  # global group made auto-promote-staging brittle: when a staging push
  # queued behind an in-flight run and a third entrant (a PR run, a
  # follow-on push) entered the group, the staging push got cancelled —
  # leaving auto-promote-staging looking at `completed/cancelled` for a
  # required gate and refusing to advance main. Observed 2026-04-28
  # 23:51-23:53 on staging tip 3f99fede.
  #
  # The original intent of the global group was to throttle parallel
  # E2E provisions (each spins a fresh EC2). At our scale that throttle
  # isn't worth the correctness cost — fresh-org-per-run isolates the
  # state, and the cost of two parallel runs (~$0.001/min × 10min × 2)
  # is rounding error vs. the cost of a stuck pipeline.
  #
  # Per-SHA still dedupes accidental double-triggers for the SAME SHA.
  # It does NOT cancel obsolete-PR-version runs on force-push; that
  # wasted CI is acceptable given the alternative is losing staging-tip
  # data that auto-promote-staging needs.
  group: e2e-staging-canvas-${{ github.event.pull_request.head.sha || github.sha }}
  cancel-in-progress: false
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    outputs:
      canvas: ${{ steps.decide.outputs.canvas }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
      - id: decide
        # Inline replacement for dorny/paths-filter — see e2e-api.yml.
        # Cron triggers always run real work (no diff context).
        run: |
          if [ "${{ github.event_name }}" = "schedule" ]; then
            echo "canvas=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
          if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
            BASE="${{ github.event.pull_request.base.sha }}"
          fi
          if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
            echo "canvas=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          if ! git cat-file -e "$BASE" 2>/dev/null; then
            git fetch --depth=1 origin "$BASE" 2>/dev/null || true
          fi
          if ! git cat-file -e "$BASE" 2>/dev/null; then
            echo "canvas=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          CHANGED=$(git diff --name-only "$BASE" HEAD)
          if echo "$CHANGED" | grep -qE '^(canvas/|\.gitea/workflows/e2e-staging-canvas\.yml$)'; then
            echo "canvas=true" >> "$GITHUB_OUTPUT"
          else
            echo "canvas=false" >> "$GITHUB_OUTPUT"
          fi
  # ONE job (no job-level `if:`) that always runs and reports under the
  # required-check name `Canvas tabs E2E`. Real work is gated per-step on
  # `needs.detect-changes.outputs.canvas`. See e2e-api.yml for the full
  # rationale — same path-filter check-name parity issue blocked PR #2264
  # (staging→main) on 2026-04-29 because branch protection treats matching-
  # name check runs as a SET, and any SKIPPED member fails the eval.
  playwright:
    needs: detect-changes
    name: Canvas tabs E2E
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    timeout-minutes: 40
    env:
      CANVAS_E2E_STAGING: '1'
      MOLECULE_CP_URL: https://staging-api.moleculesai.app
      # 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
      # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
      # internal#322 — see this PR for the cross-workflow sweep.
      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
    defaults:
      run:
        working-directory: canvas
    steps:
      - name: No-op pass (paths filter excluded this commit)
        if: needs.detect-changes.outputs.canvas != 'true'
        working-directory: .
        run: |
          echo "No canvas / workflow changes — E2E Staging Canvas gate satisfied without running tests."
          echo "::notice::E2E Staging Canvas no-op pass (paths filter excluded this commit)."
      - if: needs.detect-changes.outputs.canvas == 'true'
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Verify admin token present
        if: needs.detect-changes.outputs.canvas == 'true'
        run: |
          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
            echo "::error::Missing CP_STAGING_ADMIN_API_TOKEN"
            exit 2
          fi
      - name: Set up Node
        if: needs.detect-changes.outputs.canvas == 'true'
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: '20'
          cache: 'npm'
          cache-dependency-path: canvas/package-lock.json
      - name: Install canvas deps
        if: needs.detect-changes.outputs.canvas == 'true'
        run: npm ci
      - name: Install Playwright browsers
        if: needs.detect-changes.outputs.canvas == 'true'
        run: npx playwright install --with-deps chromium
      - name: Run staging canvas E2E
        if: needs.detect-changes.outputs.canvas == 'true'
        run: npx playwright test --config=playwright.staging.config.ts
      - name: Upload Playwright report on failure
        if: failure() && needs.detect-changes.outputs.canvas == 'true'
        # Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
        # the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
        # implement (see ci.yml upload step for the canonical error
        # cite). Drop this pin when Gitea ships the v4 protocol.
        uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
        with:
          name: playwright-report-staging
          path: canvas/playwright-report-staging/
          retention-days: 14
      - name: Upload screenshots on failure
        if: failure() && needs.detect-changes.outputs.canvas == 'true'
        # Pinned to v3 for Gitea act_runner v0.6 compatibility (see above).
        uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
        with:
          name: playwright-screenshots
          path: canvas/test-results/
          retention-days: 14
      # Safety-net teardown — fires only when Playwright's globalTeardown
      # didn't (worker crash, runner cancel). Reads the slug from
      # canvas/.playwright-staging-state.json (written by staging-setup
      # as its first action, before any CP call) and deletes only that
      # slug.
      #
      # Earlier versions of this step pattern-swept `e2e-canvas-<today>-*`
      # orgs to compensate for setup-crash-before-state-file-write. That
      # over-aggressive cleanup raced concurrent canvas-E2E runs and
      # poisoned each other's tenants — observed 2026-04-30 when three
      # real-test runs killed each other mid-test, surfacing as
      # `getaddrinfo ENOTFOUND` once CP had cleaned up the just-deleted
      # DNS record. Pattern-sweep removed; setup now writes the state
      # file before any CP work, so the slug is always recoverable.
      - name: Teardown safety net
        if: always() && needs.detect-changes.outputs.canvas == 'true'
        env:
          ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
        run: |
          set +e
          STATE_FILE=".playwright-staging-state.json"
          if [ ! -f "$STATE_FILE" ]; then
            echo "::notice::No state file at canvas/$STATE_FILE — Playwright globalTeardown handled it (or setup never ran)."
            exit 0
          fi
          slug=$(python3 -c "import json; print(json.load(open('$STATE_FILE')).get('slug',''))")
          if [ -z "$slug" ]; then
            echo "::warning::State file present but slug missing; nothing to clean up."
            exit 0
          fi
          echo "Deleting orphan tenant: $slug"
          # Verify HTTP 2xx instead of `>/dev/null || true` swallowing
          # failures. A 5xx or timeout previously looked identical to
          # success, leaving the tenant alive for up to ~45 min until
          # sweep-stale-e2e-orgs caught it. Surface failures as
          # workflow warnings naming the slug. Don't `exit 1` — a single
          # cleanup miss shouldn't fail-flag the canvas test when the
          # actual smoke check passed; the sweeper is the safety net.
          # See molecule-controlplane#420.
          # Tempfile-routed -w + set +e/-e prevents curl-exit-code
          # pollution of the captured status (lint-curl-status-capture.yml).
          set +e
          curl -sS -o /tmp/canvas-cleanup.out -w "%{http_code}" \
            -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
            -H "Authorization: Bearer $ADMIN_TOKEN" \
            -H "Content-Type: application/json" \
            -d "{\"confirm\":\"$slug\"}" >/tmp/canvas-cleanup.code
          set -e
          code=$(cat /tmp/canvas-cleanup.code 2>/dev/null || echo "000")
          if [ "$code" = "200" ] || [ "$code" = "204" ]; then
            echo "[teardown] deleted $slug (HTTP $code)"
          else
            echo "::warning::canvas teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/canvas-cleanup.out 2>/dev/null)"
          fi
          exit 0
--- a/.gitea/workflows/e2e-staging-external.yml
+++ b/.gitea/workflows/e2e-staging-external.yml
@ -0,0 +1,192 @@
 name: E2E Staging External Runtime
 # Ported from .github/workflows/e2e-staging-external.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #
 # Regression for the four/five workspaces.status=awaiting_agent transitions
 # that silently failed in production for five days before migration 046
 # extended the workspace_status enum (see
 # workspace-server/migrations/046_workspace_status_awaiting_agent.up.sql).
 #
 # Why this is its own workflow (not folded into e2e-staging-saas.yml):
 #   - The full-saas harness defaults to runtime=hermes, never exercises
 #     external-runtime. Adding an `external` parameter to that script
 #     would force every push to staging through both lifecycles in
 #     series, doubling the EC2 cold-start budget.
 #   - The external lifecycle has unique timing (REMOTE_LIVENESS_STALE_AFTER
 #     window, 90s default + sweep interval), which we wait through
 #     deliberately. Folding it into hermes would make the long path
 #     even longer.
 #   - It can run in parallel with the hermes E2E since both create
 #     fresh tenant orgs with distinct slug prefixes (`e2e-ext-...` vs
 #     `e2e-...`).
 #
 # Triggers:
 #   - Push to staging when any source affecting external runtime,
 #     hibernation, or the migration set changes.
 #   - PR review for the same set.
 #   - Manual workflow_dispatch.
 #   - Daily cron at 07:30 UTC (catches drift on quiet days; staggered
 #     30 min after e2e-staging-saas.yml's 07:00 UTC cron).
 #
 # Concurrency: serialized so two staging pushes don't fight for the
 # same EC2 quota window. cancel-in-progress=false so a half-rolled
 # tenant always finishes its teardown.
 on:
  push:
    branches: [main]
    paths:
      - 'workspace-server/internal/handlers/workspace.go'
      - 'workspace-server/internal/handlers/registry.go'
      - 'workspace-server/internal/handlers/workspace_restart.go'
      - 'workspace-server/internal/registry/healthsweep.go'
      - 'workspace-server/internal/registry/liveness.go'
      - 'workspace-server/migrations/**'
      - 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
      - 'tests/e2e/test_staging_external_runtime.sh'
      - '.gitea/workflows/e2e-staging-external.yml'
  pull_request:
    branches: [main]
    paths:
      - 'workspace-server/internal/handlers/workspace.go'
      - 'workspace-server/internal/handlers/registry.go'
      - 'workspace-server/internal/handlers/workspace_restart.go'
      - 'workspace-server/internal/registry/healthsweep.go'
      - 'workspace-server/internal/registry/liveness.go'
      - 'workspace-server/migrations/**'
      - 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
      - 'tests/e2e/test_staging_external_runtime.sh'
      - '.gitea/workflows/e2e-staging-external.yml'
  schedule:
    - cron: '30 7 * * *'
 concurrency:
  group: e2e-staging-external
  cancel-in-progress: false
 permissions:
  contents: read
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  e2e-staging-external:
    name: E2E Staging External Runtime
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    timeout-minutes: 25
    env:
      MOLECULE_CP_URL: https://staging-api.moleculesai.app
      # 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
      # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
      # internal#322 — see this PR for the cross-workflow sweep.
      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
      E2E_STALE_WAIT_SECS: ${{ github.event.inputs.stale_wait_secs || '180' }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Verify admin token present
        run: |
          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
            # Schedule + push triggers must hard-fail when the token is
            # missing — silent skip would mask infra rot. Manual dispatch
            # gets the same hard-fail; an operator running this on a fork
            # without secrets configured needs to know up-front.
            echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
            exit 2
          fi
          echo "Admin token present ✓"
      - name: CP staging health preflight
        run: |
          code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
          if [ "$code" != "200" ]; then
            echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
            exit 1
          fi
          echo "Staging CP healthy ✓"
      - name: Run external-runtime E2E
        id: e2e
        run: bash tests/e2e/test_staging_external_runtime.sh
      # Mirror the e2e-staging-saas.yml safety net: if the runner is
      # cancelled (e.g. concurrent staging push), the test script's
      # EXIT trap may not fire, so we sweep e2e-ext-* slugs scoped to
      # *this* run id.
      - name: Teardown safety net (runs on cancel/failure)
        if: always()
        env:
          ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
        run: |
          set +e
          orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
            -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
            | python3 -c "
          import json, sys, os, datetime
          run_id = os.environ.get('GITHUB_RUN_ID', '')
          d = json.load(sys.stdin)
          # Scope STRICTLY to this run id (e2e-ext-YYYYMMDD-<runid>-...)
          # so concurrent runs and unrelated dev probes are not touched.
          # Sweep today AND yesterday so a midnight-crossing run still
          # cleans up its own slug.
          today = datetime.date.today()
          yesterday = today - datetime.timedelta(days=1)
          dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
          if not run_id:
              # Without a run id we cannot scope safely; bail rather
              # than risk deleting unrelated tenants.
              sys.exit(0)
          prefixes = tuple(f'e2e-ext-{d}-{run_id}-' for d in dates)
          for o in d.get('orgs', []):
              s = o.get('slug', '')
              if s.startswith(prefixes) and o.get('status') != 'purged':
                  print(s)
          " 2>/dev/null)
          if [ -n "$orgs" ]; then
            echo "Safety-net sweep: deleting leftover orgs:"
            echo "$orgs"
            # Per-slug verified DELETE — see molecule-controlplane#420.
            # `>/dev/null 2>&1` previously hid every failure; surface
            # non-2xx as workflow warnings so the run page names what
            # leaked. Sweeper catches the rest within ~45 min.
            leaks=()
            for slug in $orgs; do
              # Tempfile-routed -w + set +e/-e prevents curl-exit-code
              # pollution of the captured status (lint-curl-status-capture.yml).
              set +e
              curl -sS -o /tmp/external-cleanup.out -w "%{http_code}" \
                -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
                -H "Authorization: Bearer $ADMIN_TOKEN" \
                -H "Content-Type: application/json" \
                -d "{\"confirm\":\"$slug\"}" >/tmp/external-cleanup.code
              set -e
              code=$(cat /tmp/external-cleanup.code 2>/dev/null || echo "000")
              if [ "$code" = "200" ] || [ "$code" = "204" ]; then
                echo "[teardown] deleted $slug (HTTP $code)"
              else
                echo "::warning::external teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/external-cleanup.out 2>/dev/null)"
                leaks+=("$slug")
              fi
            done
            if [ ${#leaks[@]} -gt 0 ]; then
              echo "::warning::external teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
            fi
          else
            echo "Safety-net sweep: no leftover orgs to clean."
          fi
--- a/.gitea/workflows/e2e-staging-saas.yml
+++ b/.gitea/workflows/e2e-staging-saas.yml
@ -0,0 +1,254 @@
 name: E2E Staging SaaS (full lifecycle)
 # Ported from .github/workflows/e2e-staging-saas.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #
 # Dedicated workflow that provisions a fresh staging org per run, exercises
 # the full workspace lifecycle (register → heartbeat → A2A → delegation →
 # HMA memory → activity → peers), then tears down and asserts leak-free.
 #
 # Why a separate workflow (not folded into ci.yml):
 #   - The run takes ~25-35 min (EC2 boot + cloudflared DNS + provision sweeps +
 #     agent bootstrap), way too slow for every PR.
 #   - Needs its own concurrency group so two pushes don't fight over the
 #     same staging org slug prefix.
 #   - Has its own required secrets (session cookie, admin token) that most
 #     PRs don't need to read.
 #
 # Triggers:
 #   - Push to main (regression guard)
 #   - workflow_dispatch (manual re-run from UI)
 #   - Nightly cron (catches drift even when no pushes land)
 #   - Changes to any provisioning-critical file under PR review (opt-in
 #     via the same paths watcher that e2e-api.yml uses)
 on:
  # Trunk-based (Phase 3 of internal#81): main is the only branch.
  # Previously this fired on staging push too because staging was a
  # superset of main and ran the gate ahead of auto-promote; with no
  # staging branch, main is where E2E gates the deploy.
  push:
    branches: [main]
    paths:
      - 'workspace-server/internal/handlers/registry.go'
      - 'workspace-server/internal/handlers/workspace_provision.go'
      - 'workspace-server/internal/handlers/a2a_proxy.go'
      - 'workspace-server/internal/middleware/**'
      - 'workspace-server/internal/provisioner/**'
      - 'tests/e2e/test_staging_full_saas.sh'
      - '.gitea/workflows/e2e-staging-saas.yml'
  pull_request:
    branches: [main]
    paths:
      - 'workspace-server/internal/handlers/registry.go'
      - 'workspace-server/internal/handlers/workspace_provision.go'
      - 'workspace-server/internal/handlers/a2a_proxy.go'
      - 'workspace-server/internal/middleware/**'
      - 'workspace-server/internal/provisioner/**'
      - 'tests/e2e/test_staging_full_saas.sh'
      - '.gitea/workflows/e2e-staging-saas.yml'
  schedule:
    # 07:00 UTC every day — catches AMI drift, WorkOS cert rotation,
    # Cloudflare API regressions, etc. even on quiet days.
    - cron: '0 7 * * *'
 # Serialize: staging has a finite per-hour org creation quota. Two pushes
 # landing in quick succession should queue, not race. `cancel-in-progress:
 # false` mirrors e2e-api.yml — GitHub would otherwise cancel the running
 # teardown step and leave orphan EC2s.
 concurrency:
  group: e2e-staging-saas
  cancel-in-progress: false
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  e2e-staging-saas:
    name: E2E Staging SaaS
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    timeout-minutes: 45
    permissions:
      contents: read
    env:
      MOLECULE_CP_URL: https://staging-api.moleculesai.app
      # Single admin-bearer secret drives provision + tenant-token
      # retrieval + teardown. Configure in
      # Settings → Secrets and variables → Actions → Repository secrets.
      # 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
      # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
      # internal#322 — see this PR for the cross-workflow sweep.
      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      # MiniMax is the PRIMARY LLM auth path post-2026-05-04. Switched
      # from hermes+OpenAI default after #2578 (the staging OpenAI key
      # account went over quota and stayed dead for 36+ hours, taking
      # the full-lifecycle E2E red on every provisioning-critical push).
      # claude-code template's `minimax` provider routes
      # ANTHROPIC_BASE_URL to api.minimax.io/anthropic and reads
      # MINIMAX_API_KEY at boot — separate billing account so an
      # OpenAI quota collapse no longer wedges the gate. Mirrors the
      # staging-smoke.yml + continuous-synth-e2e.yml migrations.
      E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
      # Direct-Anthropic alternative for operators who don't want to
      # set up a MiniMax account (priority below MiniMax — first
      # non-empty wins in test_staging_full_saas.sh's secrets-injection
      # block). See #2578 PR comment for the rationale.
      E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
      # OpenAI fallback — kept wired so an operator-dispatched run with
      # E2E_RUNTIME=hermes or =langgraph via workflow_dispatch can still
      # exercise the OpenAI path.
      E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
      E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }}
      # Pin the model when running on the default claude-code path —
      # the per-runtime default ("sonnet") routes to direct Anthropic
      # and defeats the cost saving. Operators can override via the
      # workflow_dispatch flow (no input wired here yet — runtime
      # override is enough for ad-hoc).
      E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'langgraph' && 'openai:gpt-4o' || 'MiniMax-M2.7-highspeed' }}
      E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Verify admin token present
        run: |
          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
            echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
            exit 2
          fi
          echo "Admin token present ✓"
      - name: Verify LLM key present
        run: |
          # Per-runtime key check — claude-code uses MiniMax; hermes /
          # langgraph (operator-dispatched only) use OpenAI. Hard-fail
          # rather than soft-skip per #2578's lesson — empty key
          # silently falls through to the wrong SECRETS_JSON branch and
          # produces a confusing auth error 5 min later instead of the
          # clean "secret missing" message at the top.
          case "${E2E_RUNTIME}" in
            claude-code)
              # Either MiniMax OR direct-Anthropic works — first
              # non-empty wins in the test script's secrets-injection
              # priority chain.
              if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
                required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
                required_secret_value="${E2E_MINIMAX_API_KEY}"
              elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
                required_secret_name="MOLECULE_STAGING_ANTHROPIC_API_KEY"
                required_secret_value="${E2E_ANTHROPIC_API_KEY}"
              else
                required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY or MOLECULE_STAGING_ANTHROPIC_API_KEY"
                required_secret_value=""
              fi
              ;;
            langgraph|hermes)
              required_secret_name="MOLECULE_STAGING_OPENAI_API_KEY"
              required_secret_value="${E2E_OPENAI_API_KEY:-}"
              ;;
            *)
              echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
              required_secret_name=""
              required_secret_value="present"
              ;;
          esac
          if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
            echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — workspaces will fail at boot with 'No provider API key found'"
            exit 2
          fi
          echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"
      - name: CP staging health preflight
        run: |
          code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
          if [ "$code" != "200" ]; then
            echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
            exit 1
          fi
          echo "Staging CP healthy ✓"
      - name: Run full-lifecycle E2E
        id: e2e
        run: bash tests/e2e/test_staging_full_saas.sh
      # Belt-and-braces teardown: the test script itself installs a trap
      # for EXIT/INT/TERM, but if the GH runner itself is cancelled (e.g.
      # someone pushes a new commit and workflow concurrency is set to
      # cancel), the trap may not fire. This `always()` step runs even on
      # cancellation and attempts the delete a second time. The admin
      # DELETE endpoint is idempotent so double-invoking is safe.
      - name: Teardown safety net (runs on cancel/failure)
        if: always()
        env:
          ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
        run: |
          # Best-effort: find any e2e-YYYYMMDD-* orgs matching this run and
          # nuke them. Catches the case where the script died before
          # exporting its slug.
          set +e
          orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
            -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
            | python3 -c "
          import json, sys, os, datetime
          run_id = os.environ.get('GITHUB_RUN_ID', '')
          d = json.load(sys.stdin)
          # ONLY sweep slugs from *this* CI run. Previously the filter was
          # f'e2e-{today}-' which stomped on parallel CI runs AND any manual
          # E2E probes a dev was running against staging (incident 2026-04-21
          # 15:02Z: this workflow's safety net deleted an unrelated manual
          # run's tenant 1s after it hit 'running').
          # Sweep both today AND yesterday's UTC dates so a run that crosses
          # midnight still matches its own slug — see the 2026-04-26→27
          # canvas-safety-net incident for the same bug class.
          today = datetime.date.today()
          yesterday = today - datetime.timedelta(days=1)
          dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
          if run_id:
              prefixes = tuple(f'e2e-{d}-{run_id}-' for d in dates)
          else:
              prefixes = tuple(f'e2e-{d}-' for d in dates)
          candidates = [o['slug'] for o in d.get('orgs', [])
                        if any(o.get('slug','').startswith(p) for p in prefixes)
                        and o.get('instance_status') not in ('purged',)]
          print('\n'.join(candidates))
          " 2>/dev/null)
          # Per-slug verified DELETE (was `>/dev/null || true` — see
          # molecule-controlplane#420). Surface non-2xx as a workflow
          # warning naming the leaked slug; don't exit 1 (sweeper is
          # the safety net within ~45 min).
          leaks=()
          for slug in $orgs; do
            echo "Safety-net teardown: $slug"
            # Tempfile-routed -w + set +e/-e prevents curl-exit-code
            # pollution of the captured status (lint-curl-status-capture.yml).
            set +e
            curl -sS -o /tmp/saas-cleanup.out -w "%{http_code}" \
              -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
              -H "Authorization: Bearer $ADMIN_TOKEN" \
              -H "Content-Type: application/json" \
              -d "{\"confirm\":\"$slug\"}" >/tmp/saas-cleanup.code
            set -e
            code=$(cat /tmp/saas-cleanup.code 2>/dev/null || echo "000")
            if [ "$code" = "200" ] || [ "$code" = "204" ]; then
              echo "[teardown] deleted $slug (HTTP $code)"
            else
              echo "::warning::saas teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/saas-cleanup.out 2>/dev/null)"
              leaks+=("$slug")
            fi
          done
          if [ ${#leaks[@]} -gt 0 ]; then
            echo "::warning::saas teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
          fi
          exit 0
--- a/.gitea/workflows/e2e-staging-sanity.yml
+++ b/.gitea/workflows/e2e-staging-sanity.yml
@ -0,0 +1,166 @@
 name: E2E Staging Sanity (leak-detection self-check)
 # Ported from .github/workflows/e2e-staging-sanity.yml on 2026-05-11 per
 # RFC internal#219 §1 sweep.
 #
 # Differences from the GitHub version:
 #   - Dropped `workflow_dispatch:` (Gitea 1.22.6 finicky on bare dispatch).
 #   - `actions/github-script@v9` issue-open block replaced with curl
 #     calls to the Gitea REST API (/api/v1/repos/.../issues|comments).
 #   - Workflow-level env.GITHUB_SERVER_URL set.
 #   - `continue-on-error: true` on the job (RFC §1 contract).
 #
 # Periodic assertion that the teardown safety nets in e2e-staging-saas
 # and staging-smoke (formerly canary-staging) actually work. Runs the
 # E2E harness with E2E_INTENTIONAL_FAILURE=1, which poisons the tenant
 # admin token after the org is provisioned. The workspace-provision
 # step then fails, the script exits non-zero, and the EXIT trap +
 # workflow always()-step must still tear down cleanly.
 on:
  schedule:
    - cron: '0 6 * * 1'
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 concurrency:
  group: e2e-staging-sanity
  cancel-in-progress: false
 permissions:
  issues: write
  contents: read
 jobs:
  sanity:
    name: Intentional-failure teardown sanity
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    timeout-minutes: 20
    env:
      MOLECULE_CP_URL: https://staging-api.moleculesai.app
      # 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
      # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
      # internal#322 — see this PR for the cross-workflow sweep.
      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      E2E_MODE: smoke
      E2E_RUNTIME: hermes
      E2E_RUN_ID: "sanity-${{ github.run_id }}"
      E2E_INTENTIONAL_FAILURE: "1"
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Verify admin token present
        run: |
          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
            echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
            exit 2
          fi
      # Inverted assertion: the run MUST fail. If it passes, the
      # E2E_INTENTIONAL_FAILURE path is broken.
      - name: Run harness — expecting exit !=0
        id: harness
        run: |
          set +e
          bash tests/e2e/test_staging_full_saas.sh
          rc=$?
          echo "harness_rc=$rc" >> "$GITHUB_OUTPUT"
          if [ "$rc" = "1" ]; then
            echo "OK Harness failed as expected (rc=1); teardown trap ran, leak-check passed"
            exit 0
          elif [ "$rc" = "0" ]; then
            echo "::error::Harness succeeded under E2E_INTENTIONAL_FAILURE=1 — the poisoning path is broken"
            exit 1
          elif [ "$rc" = "4" ]; then
            echo "::error::LEAK DETECTED (rc=4) — teardown failed to clean up the org. Safety net broken."
            exit 4
          else
            echo "::error::Unexpected rc=$rc — neither clean-failure nor leak. Investigate harness."
            exit 1
          fi
      - name: Open issue if safety net is broken (Gitea API)
        if: failure()
        env:
          GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
          RUN_ID: ${{ github.run_id }}
        run: |
          set -euo pipefail
          API="${SERVER_URL%/}/api/v1"
          TITLE="E2E teardown safety net broken"
          RUN_URL="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
          BODY_JSON=$(jq -nc --arg t "$TITLE" --arg run "$RUN_URL" '
            {title: $t,
             body: ("The weekly sanity run (E2E_INTENTIONAL_FAILURE=1) did not exit as expected. This means one of:\n  - poisoning did not actually cause failure (test harness regression), OR\n  - teardown left an orphan org (leak detection caught a real bug)\n\nRun: " + $run + "\n\nThis is higher priority than a canary failure — the whole E2E safety net cannot be trusted until this is resolved.")}')
          EXISTING=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
            "${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
            | jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number' | head -1)
          if [ -n "$EXISTING" ]; then
            curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
              "${API}/repos/${REPO}/issues/${EXISTING}/comments" \
              -d "$(jq -nc --arg run "$RUN_URL" '{body: ("Still broken. " + $run)}')" >/dev/null
            echo "Commented on existing issue #${EXISTING}"
          else
            curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
              "${API}/repos/${REPO}/issues" -d "$BODY_JSON" >/dev/null
            echo "Filed new issue"
          fi
      # Belt-and-braces: if teardown left anything behind, nuke it here
      # so we don't bleed staging quota.
      - name: Teardown safety net
        if: always()
        env:
          ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
        run: |
          set +e
          orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
            -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
            | python3 -c "
          import json, sys
          d = json.load(sys.stdin)
          today = __import__('datetime').date.today().strftime('%Y%m%d')
          # Match both the new e2e-smoke- prefix (post-2026-05-11 rename)
          # and the legacy e2e-canary- prefix for one rollout cycle so
          # any in-flight org provisioned under the old prefix on an
          # older runner checkout still gets cleaned up. Remove the
          # canary fallback after one week of no-old-prefix observations.
          prefixes = (f'e2e-smoke-{today}-sanity-', f'e2e-canary-{today}-sanity-')
          candidates = [o['slug'] for o in d.get('orgs', [])
                        if any(o.get('slug','').startswith(p) for p in prefixes)
                        and o.get('status') not in ('purged',)]
          print('\n'.join(candidates))
          " 2>/dev/null)
          leaks=()
          for slug in $orgs; do
            # Tempfile-routed -w + set +e/-e prevents curl-exit-code
            # pollution of the captured status (lint-curl-status-capture.yml).
            set +e
            curl -sS -o /tmp/sanity-cleanup.out -w "%{http_code}" \
              -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
              -H "Authorization: Bearer $ADMIN_TOKEN" \
              -H "Content-Type: application/json" \
              -d "{\"confirm\":\"$slug\"}" >/tmp/sanity-cleanup.code
            set -e
            code=$(cat /tmp/sanity-cleanup.code 2>/dev/null || echo "000")
            if [ "$code" = "200" ] || [ "$code" = "204" ]; then
              echo "[teardown] deleted $slug (HTTP $code)"
            else
              echo "::warning::sanity teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/sanity-cleanup.out 2>/dev/null)"
              leaks+=("$slug")
            fi
          done
          if [ ${#leaks[@]} -gt 0 ]; then
            echo "::warning::sanity teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
          fi
          exit 0
--- a/.gitea/workflows/gate-check-v3.yml
+++ b/.gitea/workflows/gate-check-v3.yml
@ -0,0 +1,91 @@
 # gate-check-v3 — automated PR gate detector
 #
 # Runs on every open PR (push/synchronize) and hourly via cron.
 # Posts a structured [gate-check-v3] STATUS: comment on the PR.
 #
 # Inputs:
 #   PR_NUMBER  — set via ${{ github.event.pull_request.number }} from the trigger
 #   POST_COMMENT — "true" to post/update comment on PR
 #
 # Gating logic (MVP signals 1,2,3,6):
 #   1. Author-aware agent-tag comment scan
 #   2. REQUEST_CHANGES reviews state machine
 #   3. Staleness detection (SOP-12: review.commit_id != PR.head_sha + >1 working day)
 #   6. CI required-checks awareness
 #
 # Exit code: 0=CLEAR, 1=BLOCKED, 2=ERROR
 name: gate-check-v3
 on:
  pull_request_target:
    types: [opened, edited, synchronize, reopened]
  schedule:
    # Hourly: refresh all open PRs
    - cron: '8 * * * *'
  workflow_dispatch:
    inputs:
      pr_number:
        description: 'PR number to check (omit for all open PRs)'
        required: false
        type: string
      post_comment:
        description: 'Post comment on PR'
        required: false
        type: string
        default: 'true'
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  gate-check:
    runs-on: ubuntu-latest
    continue-on-error: true  # Never block on our own detector failing
    steps:
      - name: Check out base branch (for the script)
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          ref: ${{ github.event.pull_request.base.sha || github.ref_name }}
      - name: Run gate-check-v3 (single PR mode)
        if: github.event_name == 'pull_request_target' || github.event.inputs.pr_number != ''
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ github.event.pull_request.number || github.event.inputs.pr_number }}
          POST_COMMENT: ${{ github.event.inputs.post_comment || 'true' }}
        run: |
          set -euo pipefail
          python3 tools/gate-check-v3/gate_check.py \
            --repo "${{ github.repository }}" \
            --pr "$PR_NUMBER" \
            $([ "$POST_COMMENT" = "true" ] && echo "--post-comment")
          echo "verdict=$?" >> "$GITHUB_OUTPUT"
      - name: Run gate-check-v3 (all open PRs — cron mode)
        if: github.event_name == 'schedule'
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          # Fetch all open PRs and run gate-check on each
          pr_numbers=$(python3 -c "
            import urllib.request, json, os
            token = os.environ['GITEA_TOKEN']
            req = urllib.request.Request(
                'https://git.moleculesai.app/api/v1/repos/${{ github.repository }}/pulls?state=open&limit=100',
                headers={'Authorization': f'token {token}', 'Accept': 'application/json'}
            )
            with urllib.request.urlopen(req) as r:
                prs = json.loads(r.read())
            for pr in prs:
                print(pr['number'])
          ")
          for pr in $pr_numbers; do
            echo "Checking PR #$pr..."
            python3 tools/gate-check-v3/gate_check.py \
              --repo "${{ github.repository }}" \
              --pr "$pr" \
              --post-comment \
              || true
          done
--- a/.gitea/workflows/handlers-postgres-integration.yml
+++ b/.gitea/workflows/handlers-postgres-integration.yml
@ -0,0 +1,282 @@
 name: Handlers Postgres Integration
 # Ported from .github/workflows/handlers-postgres-integration.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #
 # Real-Postgres integration tests for workspace-server/internal/handlers/.
 # Triggered on every PR/push that touches the handlers package.
 #
 # Why this workflow exists
 # ------------------------
 # Strict-sqlmock unit tests pin which SQL statements fire — they're fast
 # and let us iterate without a DB. But sqlmock CANNOT detect bugs that
 # depend on the row state AFTER the SQL runs. The result_preview-lost
 # bug shipped to staging in PR #2854 because every unit test was
 # satisfied with "an UPDATE statement fired" — none verified the row's
 # preview field actually landed. The local-postgres E2E that retrofit
 # self-review caught it took 2 minutes to set up and would have caught
 # the bug at PR-time.
 #
 # Why this workflow does NOT use `services: postgres:` (Class B fix)
 # ------------------------------------------------------------------
 # Our act_runner config has `container.network: host` (operator host
 # /opt/molecule/runners/config.yaml), which act_runner applies to BOTH
 # the job container AND every service container. With host-net, two
 # concurrent runs of this workflow both try to bind 0.0.0.0:5432 — the
 # second postgres FATALs with `could not create any TCP/IP sockets:
 # Address in use`, and Docker auto-removes it (act_runner sets
 # AutoRemove:true on service containers). By the time the migrations
 # step runs `psql`, the postgres container is gone, hence
 # `Connection refused` then `failed to remove container: No such
 # container` at cleanup time.
 #
 # Per-job `container.network` override is silently ignored by
 # act_runner — `--network and --net in the options will be ignored.`
 # appears in the runner log. Documented constraint.
 #
 # So we sidestep `services:` entirely. The job container still uses
 # host-net (inherited from runner config; required for cache server
 # discovery on the bridge IP 172.18.0.17:42631). We launch a sibling
 # postgres on the existing `molecule-core-net` bridge with a
 # UNIQUE name per run — `pg-handlers-${RUN_ID}-${RUN_ATTEMPT}` — and
 # read its bridge IP via `docker inspect`. A host-net job container
 # can reach a bridge-net container directly via the bridge IP (verified
 # manually on operator host 2026-05-08).
 #
 # Trade-offs vs. the original `services:` shape:
 #   + No host-port collision; N parallel runs share the bridge cleanly
 #   + `if: always()` cleanup runs even on test-step failure
 #   - One more step in the workflow (+~3 lines)
 #   - Requires `molecule-core-net` to exist on the operator host
 #     (it does; declared in docker-compose.yml + docker-compose.infra.yml)
 #
 # Class B Hongming-owned CICD red sweep, 2026-05-08.
 #
 # Cost: ~30s job (postgres pull from cache + go build + 4 tests).
 on:
  push:
    branches: [main, staging]
  pull_request:
    branches: [main, staging]
 concurrency:
  group: handlers-pg-integ-${{ github.event.pull_request.head.sha || github.sha }}
  cancel-in-progress: false
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  detect-changes:
    name: detect-changes
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    outputs:
      handlers: ${{ steps.filter.outputs.handlers }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
      - id: filter
        # Inline replacement for dorny/paths-filter — see e2e-api.yml.
        run: |
          BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
          if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
            BASE="${{ github.event.pull_request.base.sha }}"
          fi
          if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
            echo "handlers=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          if ! git cat-file -e "$BASE" 2>/dev/null; then
            git fetch --depth=1 origin "$BASE" 2>/dev/null || true
          fi
          if ! git cat-file -e "$BASE" 2>/dev/null; then
            echo "handlers=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          CHANGED=$(git diff --name-only "$BASE" HEAD)
          if echo "$CHANGED" | grep -qE '^(workspace-server/internal/handlers/|workspace-server/internal/wsauth/|workspace-server/migrations/|\.gitea/workflows/handlers-postgres-integration\.yml$)'; then
            echo "handlers=true" >> "$GITHUB_OUTPUT"
          else
            echo "handlers=false" >> "$GITHUB_OUTPUT"
          fi
  # Single-job-with-per-step-if pattern: always runs to satisfy the
  # required-check name on branch protection; real work gates on the
  # paths filter. See ci.yml's Platform (Go) for the same shape.
  integration:
    name: Handlers Postgres Integration
    needs: detect-changes
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    env:
      # Unique name per run so concurrent jobs don't collide on the
      # bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
      # workflow_dispatch reruns of the same run_id.
      PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
      # Bridge network already exists on the operator host (declared
      # in docker-compose.yml + docker-compose.infra.yml).
      PG_NETWORK: molecule-core-net
    defaults:
      run:
        working-directory: workspace-server
    steps:
      - if: needs.detect-changes.outputs.handlers != 'true'
        working-directory: .
        run: echo "No handlers/migrations changes — skipping; this job always runs to satisfy the required-check name."
      - if: needs.detect-changes.outputs.handlers == 'true'
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - if: needs.detect-changes.outputs.handlers == 'true'
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
        with:
          go-version: 'stable'
      - if: needs.detect-changes.outputs.handlers == 'true'
        name: Start sibling Postgres on bridge network
        working-directory: .
        run: |
          # Sanity: the bridge network must exist on the operator host.
          # Hard-fail loud if it doesn't — easier to spot than a silent
          # auto-create that diverges from the rest of the stack.
          if ! docker network inspect "${PG_NETWORK}" >/dev/null 2>&1; then
            echo "::error::Bridge network '${PG_NETWORK}' missing on operator host. Re-run docker-compose.infra.yml or check ops handbook."
            exit 1
          fi
          # If a stale container with the same name exists (rerun on
          # the same run_id), wipe it first.
          docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
          docker run -d \
            --name "${PG_NAME}" \
            --network "${PG_NETWORK}" \
            --health-cmd "pg_isready -U postgres" \
            --health-interval 5s \
            --health-timeout 5s \
            --health-retries 10 \
            -e POSTGRES_PASSWORD=test \
            -e POSTGRES_DB=molecule \
            postgres:15-alpine >/dev/null
          # Read back the bridge IP. Always present immediately after
          # `docker run -d` for bridge networks.
          PG_HOST=$(docker inspect "${PG_NAME}" \
            --format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}")
          if [ -z "${PG_HOST}" ]; then
            echo "::error::Could not resolve PG_HOST for ${PG_NAME} on ${PG_NETWORK}"
            docker logs "${PG_NAME}" || true
            exit 1
          fi
          echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV"
          echo "INTEGRATION_DB_URL=postgres://postgres:test@${PG_HOST}:5432/molecule?sslmode=disable" >> "$GITHUB_ENV"
          echo "Started ${PG_NAME} at ${PG_HOST}:5432"
      - if: needs.detect-changes.outputs.handlers == 'true'
        name: Apply migrations to Postgres service
        env:
          PGPASSWORD: test
        run: |
          # Wait for postgres to actually accept connections. Docker's
          # health-cmd handles container-side readiness, but the wire
          # to the bridge IP is best-tested with pg_isready directly.
          for i in {1..15}; do
            if pg_isready -h "${PG_HOST}" -p 5432 -U postgres -q; then break; fi
            echo "waiting for postgres at ${PG_HOST}:5432..."; sleep 2
          done
          # Apply every .up.sql in lexicographic order with
          # ON_ERROR_STOP=0 — failing migrations are SKIPPED rather than
          # blocking the suite. This handles the current schema state
          # where a few historical migrations (e.g. 017_memories_fts_*)
          # depend on tables that were later renamed/dropped and so
          # cannot replay from scratch. The migrations that DO succeed
          # land their tables, which is sufficient for the integration
          # tests in handlers/.
          #
          # Why not maintain a curated allowlist: every new migration
          # touching a handlers/-tested table would have to update this
          # workflow. With apply-all-or-skip, a future migration that
          # adds a column to delegations runs automatically (its base
          # table 049_delegations.up.sql already succeeded above it in
          # the order). Operators only need to revisit this if the
          # migration chain becomes legitimately replayable end-to-end.
          #
          # Per-migration result is logged so a failed migration that
          # SHOULD have been replayable surfaces in the CI log instead
          # of silently failing.
          # Apply both *.sql (legacy, lives next to its module) and
          # *.up.sql (newer up/down convention) in a single
          # lexicographically-sorted pass. Excluding *.down.sql so the
          # newest-naming-convention pairs don't undo themselves mid-run.
          # Pre-#149-followup this loop only globbed *.up.sql, which
          # silently skipped 001_workspaces.sql + 009_activity_logs.sql
          # — fine while no integration test depended on those tables,
          # not fine once a cross-table atomicity test came in.
          set +e
          for migration in $(ls migrations/*.sql 2>/dev/null | grep -v '\.down\.sql$' | sort); do
            if psql -h "${PG_HOST}" -U postgres -d molecule -v ON_ERROR_STOP=1 \
                  -f "$migration" >/dev/null 2>&1; then
              echo "✓ $(basename "$migration")"
            else
              echo "⊘ $(basename "$migration") (skipped — see comment in workflow)"
            fi
          done
          set -e
          # Sanity: the delegations + workspaces + activity_logs tables
          # MUST exist for the integration tests to be meaningful. Hard-
          # fail if any didn't land — that would be a real regression we
          # want loud.
          for tbl in delegations workspaces activity_logs pending_uploads; do
            if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
                -c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
                | grep -q 1; then
              echo "::error::$tbl table missing after migration replay — handler integration tests would be meaningless"
              exit 1
            fi
            echo "✓ $tbl table present"
          done
      - if: needs.detect-changes.outputs.handlers == 'true'
        name: Run integration tests
        run: |
          # INTEGRATION_DB_URL is exported by the start-postgres step;
          # points at the per-run bridge IP, not 127.0.0.1, so concurrent
          # workflow runs don't fight over a host-net 5432 port.
          go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_"
      - if: failure() && needs.detect-changes.outputs.handlers == 'true'
        name: Diagnostic dump on failure
        env:
          PGPASSWORD: test
        run: |
          echo "::group::postgres container status"
          docker ps -a --filter "name=${PG_NAME}" --format '{{.Status}} {{.Names}}' || true
          docker logs "${PG_NAME}" 2>&1 | tail -50 || true
          echo "::endgroup::"
          echo "::group::delegations table state"
          psql -h "${PG_HOST}" -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
          echo "::endgroup::"
      - if: always() && needs.detect-changes.outputs.handlers == 'true'
        name: Stop sibling Postgres
        working-directory: .
        run: |
          # always() so containers don't leak when migrations or tests
          # fail. The cleanup is best-effort: if the container is
          # already gone (e.g. concurrent rerun race), don't fail the job.
          docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
          echo "Cleaned up ${PG_NAME}"
--- a/.gitea/workflows/harness-replays.yml
+++ b/.gitea/workflows/harness-replays.yml
@ -0,0 +1,296 @@
 name: Harness Replays
 # Ported from .github/workflows/harness-replays.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #
 # Boots tests/harness (production-shape compose topology with TenantGuard,
 # /cp/* proxy, canvas proxy, real production Dockerfile.tenant) and runs
 # every replay under tests/harness/replays/. Fails the PR if any replay
 # fails.
 #
 # Why this exists: 2026-04-30 we shipped #2398 which added /buildinfo as
 # a public route in router.go but forgot to add it to TenantGuard's
 # allowlist. The handler-level test in buildinfo_test.go constructed a
 # minimal gin engine without TenantGuard — green. The harness's
 # buildinfo-stale-image.sh replay would have caught it (cf-proxy doesn't
 # inject X-Molecule-Org-Id, so the curl path is identical to production's
 # redeploy verifier), but no one ran the harness pre-merge. The bug
 # shipped; the redeploy verifier silently soft-warned every tenant as
 # "unreachable" for ~1 day before being noticed.
 #
 # This gate makes "did you actually run the harness?" a CI invariant
 # instead of a memory-discipline thing.
 #
 # Trigger model — match e2e-api.yml: always FIRES on push/pull_request
 # to staging+main, real work is gated per-step on detect-changes output.
 # One job → one check run → branch-protection-clean (the SKIPPED-in-set
 # trap from PR #2264 is documented in e2e-api.yml's e2e-api job comment).
 on:
  push:
    branches: [main, staging]
    paths:
      - 'workspace-server/**'
      - 'canvas/**'
      - 'tests/harness/**'
      - '.gitea/workflows/harness-replays.yml'
  pull_request:
    branches: [main, staging]
    paths:
      - 'workspace-server/**'
      - 'canvas/**'
      - 'tests/harness/**'
      - '.gitea/workflows/harness-replays.yml'
 concurrency:
  # Per-SHA grouping. Per-ref kept hitting the auto-promote-staging
  # cancellation deadlock — see e2e-api.yml's concurrency block for
  # the 2026-04-28 incident that codified this pattern.
  group: harness-replays-${{ github.event.pull_request.head.sha || github.sha }}
  cancel-in-progress: false
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    outputs:
      run: ${{ steps.decide.outputs.run }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Fetch base branch tip for diff
        continue-on-error: true
        run: |
          # With the default fetch-depth: 1, actions/checkout only fetches the
          # PR head commit. The base commit is NOT in the local history, so
          # `git diff "$BASE" "$GITHUB_SHA"` fails. Fetch the base branch at
          # depth 1 — the base commit is the immediate parent of the PR head
          # on the base branch, so depth=1 is sufficient.
          #
          # Network: Gitea Actions runner (5.78.80.188) cannot reach the git
          # remote over HTTPS (confirmed: git fetch times out at ~15s). The runner
          # is on the same host as Gitea, but the container network namespace
          # cannot reach the Gitea HTTPS endpoint.
          #
          # Fallback: if the base commit does not exist locally, skip the diff
          # and set run=true (always run harness). This is safe: PRs where the
          # base is unavailable still run the harness (correct), PRs where the
          # base IS available get the correct path-based diff.
          #
          # Timeout: 20s. If the fetch completes, great. If it times out, the
          # step exits non-zero and we fall through to run=true.
          if timeout 20 git fetch origin "${{ github.event.pull_request.base.ref }}" --depth=1; then
            echo "::notice::base branch fetched successfully"
          else
            echo "::warning::git fetch origin ${{ github.event.pull_request.base.ref }} --depth=1 timed out"
            echo "::warning::Skipping diff — detect-changes will run the harness unconditionally."
          fi
      - id: decide
        continue-on-error: true
        run: |
          # workflow_dispatch: always run (manual trigger)
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            echo "run=true" >> "$GITHUB_OUTPUT"
            echo "debug=manual-trigger" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          # Determine the base commit to diff against.
          # For pull_request: use base.sha (the merge-base with main/staging).
          # For push: use github.event.before (the previous tip of the branch).
          # Fallback for new branches (all-zeros SHA): run everything.
          if [ "${{ github.event_name }}" = "pull_request" ] && \
             [ -n "${{ github.event.pull_request.base.sha }}" ]; then
            BASE="${{ github.event.pull_request.base.sha }}"
          elif [ -n "${{ github.event.before }}" ] && \
               ! echo "${{ github.event.before }}" | grep -qE '^0+$'; then
            BASE="${{ github.event.before }}"
          else
            # New branch or github.event.before unavailable — run everything.
            echo "run=true" >> "$GITHUB_OUTPUT"
            echo "debug=new-branch-fallback" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          # GitHub Actions and Gitea Actions both expose github.sha for HEAD.
          # git diff exits 1 when BASE is not in local history (e.g. shallow
          # checkout where the base commit was never fetched). Capture and
          # swallow that exit code — the empty diff means "run everything".
          # The runner network cannot reach the git remote (confirmed: git fetch
          # times out at ~15s), so a failed fetch is expected and we always fall
          # through to the unconditional run=true below.
          DIFF=$(git diff --name-only "$BASE" "${{ github.sha }}" 2>/dev/null) || true
          echo "debug=diff-base=$BASE diff-files=$DIFF" >> "$GITHUB_OUTPUT"
          if echo "$DIFF" | grep -qE '^workspace-server/|^canvas/|^tests/harness/|^.gitea/workflows/harness-replays\.yml$'; then
            echo "run=true" >> "$GITHUB_OUTPUT"
          else
            echo "run=false" >> "$GITHUB_OUTPUT"
          fi
  # ONE job that always runs. Real work is gated per-step on
  # detect-changes.outputs.run so an unrelated PR (e.g. doc-only
  # change to molecule-controlplane wired here later) emits the
  # required check without spending CI cycles. Single-job pattern
  # matches e2e-api.yml — see that workflow's comment for why a
  # job-level `if: false` would block branch protection via the
  # SKIPPED-in-set bug.
  harness-replays:
    needs: detect-changes
    name: Harness Replays
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    timeout-minutes: 30
    steps:
      - name: No-op pass (paths filter excluded this commit)
        if: needs.detect-changes.outputs.run != 'true'
        run: |
          echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
          echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."
          echo "::notice::Debug: ${{ needs.detect-changes.outputs.debug }}"
      - if: needs.detect-changes.outputs.run == 'true'
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      # Log what files were detected so future failures include the diff.
      - name: Log detected changes
        if: needs.detect-changes.outputs.run == 'true'
        run: |
          echo "::notice::detect-changes debug: ${{ needs.detect-changes.outputs.debug }}"
      # github-app-auth sibling-checkout removed 2026-05-07 (#157):
      # the plugin was dropped + Dockerfile.tenant no longer COPYs it.
      # Pre-clone manifest deps before docker compose builds the tenant
      # image (Task #173 followup — same pattern as
      # publish-workspace-server-image.yml's "Pre-clone manifest deps"
      # step).
      #
      # Why pre-clone here too: tests/harness/compose.yml builds tenant-alpha
      # and tenant-beta from workspace-server/Dockerfile.tenant with
      # context=../.. (repo root). That Dockerfile expects
      # .tenant-bundle-deps/{workspace-configs-templates,org-templates,plugins}
      # to be present at build context root (post-#173 it COPYs from there
      # instead of running an in-image clone — the in-image clone failed
      # with "could not read Username for https://git.moleculesai.app"
      # because there's no auth path inside the build sandbox).
      #
      # Without this step harness-replays fails before any replay runs,
      # with `failed to calculate checksum of ref ...
      # "/.tenant-bundle-deps/plugins": not found`. Caught by run #892
      # (main, 2026-05-07T20:28:53Z) and run #964 (staging — same
      # symptom, different root cause: staging still has the in-image
      # clone path, hits the auth error directly).
      #
      # 2026-05-08 sub-finding (#192): the clone step ALSO fails when
      # any referenced workspace-template repo is private and the
      # AUTO_SYNC_TOKEN bearer (devops-engineer persona) lacks read
      # access. Root cause: 5 of 9 workspace-template repos
      # (openclaw, codex, crewai, deepagents, gemini-cli) had been
      # marked private with no team grant. Resolution: flipped them
      # to public per `feedback_oss_first_repo_visibility_default`
      # (the OSS surface should be public). Layer-3 (customer-private +
      # marketplace third-party repos) tracked separately in
      # internal#102.
      #
      # Token shape matches publish-workspace-server-image.yml: AUTO_SYNC_TOKEN
      # is the devops-engineer persona PAT, NOT the founder PAT (per
      # `feedback_per_agent_gitea_identity_default`). clone-manifest.sh
      # embeds it as basic-auth for the duration of the clones and strips
      # .git directories — the token never enters the resulting image.
      - name: Pre-clone manifest deps
        if: needs.detect-changes.outputs.run == 'true'
        env:
          MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
        run: |
          set -euo pipefail
          if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
            echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
            exit 1
          fi
          mkdir -p .tenant-bundle-deps
          bash scripts/clone-manifest.sh \
            manifest.json \
            .tenant-bundle-deps/workspace-configs-templates \
            .tenant-bundle-deps/org-templates \
            .tenant-bundle-deps/plugins
          # Sanity-check counts so a silent partial clone fails fast
          # instead of producing a half-empty image.
          ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
          org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
          plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
          echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
      - name: Install Python deps for replays
        # peer-discovery-404 (and future replays) eval Python against the
        # running tenant — importing workspace/a2a_client.py pulls in
        # httpx. tests/harness/requirements.txt holds just the HTTP-client
        # surface to keep CI install fast (~3s) vs the full
        # workspace/requirements.txt (~30s).
        if: needs.detect-changes.outputs.run == 'true'
        run: pip install -r tests/harness/requirements.txt
      - name: Run all replays against the harness
        # run-all-replays.sh: boot via up.sh → seed via seed.sh → run
        # every replays/*.sh → tear down via down.sh on EXIT (trap).
        # Non-zero exit on any replay failure.
        #
        # KEEP_UP=1: without this, the script's trap-on-EXIT tears
        # down containers immediately on failure, leaving the dump
        # step below with nothing to dump (verified on PR #2410's
        # first run — tenant became unhealthy, trap fired, dump
        # step saw empty containers). Keeping them up lets the
        # failure path collect tenant/cp-stub/cf-proxy logs. The
        # always-run "Force teardown" step does the actual cleanup.
        if: needs.detect-changes.outputs.run == 'true'
        working-directory: tests/harness
        env:
          KEEP_UP: "1"
        run: ./run-all-replays.sh
      - name: Dump compose logs on failure
        # SECRETS_ENCRYPTION_KEY: docker compose validates the entire compose
        # file even for read-only `logs` calls. up.sh generates a per-run key
        # and exports it to its OWN shell — this step runs in a fresh shell
        # that wouldn't see it, so without a placeholder the validate step
        # errors before logs print (verified against PR #2492's first run:
        # "required variable SECRETS_ENCRYPTION_KEY is missing a value").
        # A placeholder is fine — we're only reading log streams, not booting.
        if: failure() && needs.detect-changes.outputs.run == 'true'
        working-directory: tests/harness
        env:
          SECRETS_ENCRYPTION_KEY: dump-logs-placeholder
        run: |
          echo "=== docker compose ps ==="
          docker compose -f compose.yml ps || true
          echo "=== tenant-alpha logs ==="
          docker compose -f compose.yml logs tenant-alpha || true
          echo "=== tenant-beta logs ==="
          docker compose -f compose.yml logs tenant-beta || true
          echo "=== cp-stub logs ==="
          docker compose -f compose.yml logs cp-stub || true
          echo "=== cf-proxy logs ==="
          docker compose -f compose.yml logs cf-proxy || true
          echo "=== postgres-alpha logs (last 100) ==="
          docker compose -f compose.yml logs --tail 100 postgres-alpha || true
          echo "=== postgres-beta logs (last 100) ==="
          docker compose -f compose.yml logs --tail 100 postgres-beta || true
      - name: Force teardown
        # We pass KEEP_UP=1 to run-all-replays.sh so the dump step
        # above sees real containers — that means we own teardown
        # explicitly here. Always run.
        if: always() && needs.detect-changes.outputs.run == 'true'
        working-directory: tests/harness
        run: ./down.sh || true
--- a/.gitea/workflows/lint-curl-status-capture.yml
+++ b/.gitea/workflows/lint-curl-status-capture.yml
@ -0,0 +1,104 @@
 name: Lint curl status-code capture
 # Ported from .github/workflows/lint-curl-status-capture.yml on 2026-05-11
 # per RFC internal#219 §1 sweep.
 #
 # Differences from the GitHub version:
 #   - on.paths and the lint scanner target .gitea/workflows/**.yml (the
 #     active Gitea workflow directory) instead of .github/workflows/**.yml
 #     (which the rest of this sweep is emptying out).
 #   - Self-skip path updated to the .gitea/ version of this file.
 #   - Dropped `merge_group:` trigger.
 #   - Workflow-level env.GITHUB_SERVER_URL set per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on the job (RFC §1 contract).
 #
 # Pins the workflow-bash anti-pattern that produced "HTTP 000000" on the
 # 2026-05-04 redeploy-tenants-on-main run for sha 2b862f6:
 #
 #   HTTP_CODE=$(curl ... -w '%{http_code}' ... || echo "000")
 #
 # When curl exits non-zero (connection reset -> 56, --fail-with-body 4xx/5xx
 # -> 22), the `-w '%{http_code}'` already wrote a status to stdout — usually
 # "000" for connection failures or the actual code for HTTP errors. The
 # `|| echo "000"` then fires AND appends ANOTHER "000" to the captured
 # stdout, producing values like "000000" or "409000" that fail string
 # comparisons against "200" while looking superficially right.
 #
 # Same class of bug the synth-E2E §7c gate hit twice (PRs #2779/#2783 +
 # #2797). Memory: feedback_curl_status_capture_pollution.md.
 on:
  pull_request:
    paths: ['.gitea/workflows/**']
  push:
    branches: [main, staging]
    paths: ['.gitea/workflows/**']
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  scan:
    name: Scan workflows for curl status-capture pollution
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Find curl ... -w '%{http_code}' ... || echo "000" subshells
        run: |
          set -uo pipefail
          # Multi-line aware: look for `$(curl ... -w '%{http_code}' ... || echo "000")`
          # subshell where the entire command-substitution wraps a curl that
          # ends with `|| echo "000"`. Must distinguish from the SAFE shape
          # `$(cat tempfile 2>/dev/null || echo "000")` — `cat` with a missing
          # tempfile produces empty stdout, no pollution.
          python3 <<'PY'
          import os, re, sys, glob
          BAD_FILES = []
          # Match the buggy substitution across newlines: $(curl ... -w '%{http_code}' ... || echo "000")
          # The `\\n` is the bash line-continuation that lets curl flags span lines.
          # We collapse continuation lines first, then look for the single-line bad pattern.
          PATTERN = re.compile(
              r'\$\(\s*curl\b[^)]*-w\s*[\'"]%\{http_code\}[\'"][^)]*\|\|\s*echo\s+"000"\s*\)',
              re.DOTALL,
          )
          # Self-skip: this lint workflow contains the literal anti-pattern in
          # its own docstring — that's intentional, not a bug.
          SELF = ".gitea/workflows/lint-curl-status-capture.yml"
          for f in sorted(glob.glob(".gitea/workflows/*.yml")):
              if f == SELF:
                  continue
              with open(f) as fh:
                  content = fh.read()
              # Collapse bash line-continuations (\\\n + leading whitespace)
              # into a single logical line so the regex can see the full
              # curl invocation as one chunk.
              flat = re.sub(r'\\\s*\n\s*', ' ', content)
              for m in PATTERN.finditer(flat):
                  BAD_FILES.append((f, m.group(0)[:120]))
          if not BAD_FILES:
              print("OK No curl-status-capture pollution patterns detected")
              sys.exit(0)
          print(f"::error::Found {len(BAD_FILES)} curl-status-capture pollution site(s):")
          for f, snippet in BAD_FILES:
              print(f"::error file={f}::Curl status-capture pollution: '|| echo \"000\"' inside a $(curl ... -w '%{{http_code}}' ...) subshell. On non-2xx or connection failure, curl's -w writes a status, then exits non-zero, then the || echo appends another '000' — producing 'HTTP 000000' or '409000' that fails comparisons silently. Fix: route -w into a tempfile so the exit code can't pollute stdout. See memory feedback_curl_status_capture_pollution.md.")
              print(f"   matched: {snippet}...")
          print()
          print("Fix template:")
          print('  set +e')
          print('  curl ... -w \'%{http_code}\' >code.txt 2>/dev/null')
          print('  set -e')
          print('  HTTP_CODE=$(cat code.txt 2>/dev/null)')
          print('  [ -z "$HTTP_CODE" ] && HTTP_CODE="000"')
          sys.exit(1)
          PY
--- a/.gitea/workflows/main-red-watchdog.yml
+++ b/.gitea/workflows/main-red-watchdog.yml
@ -0,0 +1,94 @@
 # main-red-watchdog — hourly sentinel for post-merge CI red on `main`.
 #
 # RFC: hongming "main NEVER goes red" directive, Option C of the four-
 # option ladder (B = auto-revert is explicitly rejected per
 # `feedback_no_such_thing_as_flakes` + `feedback_fix_root_not_symptom`).
 # Tracking issue: molecule-core#420.
 #
 # What it does:
 #   1. GET branches/main → HEAD SHA
 #   2. GET commits/{SHA}/status → combined status
 #   3. If combined is `failure` (or any individual status is `failure`):
 #      open or PATCH an idempotent `[main-red] {repo}: {SHA[:10]}` issue
 #      with each failed context + target_url + description.
 #   4. If combined is `success` and a prior `[main-red] ...` issue exists,
 #      close it with a "main returned to green at SHA ..." comment.
 #   5. Emit a Loki-shaped JSON line via `logger -t main-red-watchdog` for
 #      `reference_obs_stack_phase1` ingestion via Vector.
 #
 # What it does NOT do:
 #   - Auto-revert anything. Option B is rejected by directive.
 #   - Mutate branch protection. (See AGENTS.md boundaries.)
 #   - Fail the workflow on red. The issue IS the alarm — failing the
 #     watchdog would create a silent-loop where a flake in the watchdog
 #     itself hides actual main-red signal. Exit 0 unless api() raises
 #     ApiError (transient Gitea outage → fail loudly per
 #     `feedback_api_helper_must_raise_not_return_dict`).
 #
 # Pattern source: molecule-controlplane `0adf2098`'s ci-required-drift.yml
 # (just merged 2026-05-11). Same shape (cron + dispatch + sidecar Python +
 # idempotent-by-title issue), simpler scope (1 source, not 3).
 name: main-red-watchdog
 # IMPORTANT — Gitea 1.22.6 parser quirk per
 # `feedback_gitea_workflow_dispatch_inputs_unsupported`: do NOT add an
 # `inputs:` block here. Gitea 1.22.6 rejects the whole workflow as
 # "unknown on type" when `workflow_dispatch.inputs.X` is present. Revisit
 # when Gitea ≥ 1.23 is fleet-wide.
 on:
  schedule:
    # Hourly at :05 — task spec calls for "off-zero" (`5 * * * *`),
    # offset from :17 (ci-required-drift) and :00 (peak cron load).
    - cron: '5 * * * *'
  workflow_dispatch:
 # Read commit status + branch ref + issues; write issues (open/PATCH/close).
 permissions:
  contents: read
  issues: write
 # Workflow-scoped serialisation — two simultaneous runs would race on the
 # `[main-red] {SHA}` open/PATCH path. Idempotent by title, but parallel
 # POSTs can produce duplicates before the title search dedup wins.
 concurrency:
  group: main-red-watchdog
  cancel-in-progress: false
 jobs:
  watchdog:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - name: Check out repo (script lives at .gitea/scripts/)
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      - name: Set up Python (stdlib only — no PyYAML needed here)
        # The script uses stdlib urllib + json. No PyYAML required (CP's
        # drift detector needs it for AST parsing; we don't). Pin to the
        # same 3.12 hermetic interpreter CP uses so the test/runtime
        # versions stay aligned across watchdog suites.
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5.6.0
        with:
          python-version: '3.12'
      - name: Run main-red watchdog
        env:
          # GITEA_TOKEN reads commit status + writes issues. Falls back
          # to the auto-injected GITHUB_TOKEN if the org-level secret
          # isn't set (transitional repos), matching the same pattern
          # used by deploy-pipeline.yml + ci-required-drift.yml.
          GITEA_TOKEN: ${{ secrets.GITEA_TOKEN || secrets.GITHUB_TOKEN }}
          GITEA_HOST: git.moleculesai.app
          REPO: ${{ github.repository }}
          # Branch under watch. `main` per directive; staging not
          # included here — staging green is a separate gate
          # (`feedback_staging_e2e_merge_gate`).
          WATCH_BRANCH: 'main'
          # Issue label applied on file/open. `tier:high` exists in the
          # molecule-core label set (verified 2026-05-11, label id 9).
          # Rationale for high: main red blocks the promotion train and
          # poisons every PR's auto-rebase base; treat as a fire even
          # if intermittent.
          RED_LABEL: 'tier:high'
        run: python3 .gitea/scripts/main-red-watchdog.py
--- a/.gitea/workflows/publish-canvas-image.yml
+++ b/.gitea/workflows/publish-canvas-image.yml
@ -0,0 +1,138 @@
 name: publish-canvas-image
 # Ported from .github/workflows/publish-canvas-image.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #   - **Open question for review**: this workflow pushes the canvas
 #     image to `ghcr.io`. GHCR was retired during the 2026-05-06
 #     Gitea migration in favor of ECR (per staging-verify.yml header
 #     notes). The image may not be consumable post-migration. Two
 #     options for follow-up: (a) retarget to
 #     `153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas`,
 #     or (b) retire this workflow entirely and route canvas deploys
 #     via the operator-host build path. tier:low + continue-on-error
 #     means failed pushes do not block PRs.
 #
 # Builds and pushes the canvas Docker image to GHCR whenever a commit lands
 # on main that touches canvas code. Previously canvas changes were visible in
 # CI (npm run build passed) but the live container was never updated —
 # operators had to manually run `docker compose build canvas` each time.
 #
 # Mirror of publish-platform-image.yml, adapted for the Next.js canvas layer.
 # See that workflow for inline notes on macOS Keychain isolation and QEMU.
 on:
  push:
    branches: [main]
    paths:
      # Only rebuild when canvas source changes — saves GHA minutes on
      # platform-only / docs-only / MCP-only merges.
      - 'canvas/**'
      - '.gitea/workflows/publish-canvas-image.yml'
  # NOTE (Gitea port): the original GitHub workflow had a
  # `workflow_dispatch:` manual trigger for the
  # non-canvas-merge-but-need-fresh-image scenario. Dropped in the
  # Gitea port (1.22.6 parser-finicky). Manual rebuilds require
  # pushing an empty commit to canvas/ or running the operator-host
  # build directly.
 permissions:
  contents: read
  packages: write  # required to push to ghcr.io/${{ github.repository_owner }}/*
 env:
  IMAGE_NAME: ghcr.io/molecule-ai/canvas
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  build-and-push:
    name: Build & push canvas image
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Log in to GHCR
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
      # Health check: verify Docker daemon is accessible before attempting any
      # build steps. This fails loudly at step 1 when the runner's docker.sock
      # is inaccessible rather than silently continuing to the build step
      # where docker build fails deep in ECR auth with a cryptic error.
      - name: Verify Docker daemon access
        run: |
          set -euo pipefail
          echo "::group::Docker daemon health check"
          docker info 2>&1 | head -5 || {
            echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
            echo "::error::Check: (1) daemon running, (2) runner user in docker group, (3) sock perms 660+"
            exit 1
          }
          echo "Docker daemon OK"
          echo "::endgroup::"
      - name: Compute tags
        id: tags
        shell: bash
        run: |
          echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
      - name: Resolve build args
        id: build_args
        # Priority: workflow_dispatch input > repo secret > hardcoded default.
        # NEXT_PUBLIC_* env vars are baked into the JS bundle at build time by
        # Next.js — they cannot be changed at runtime without a full rebuild.
        # For local docker-compose deployments the defaults (localhost:8080)
        # work as-is; production deployments should set CANVAS_PLATFORM_URL
        # and CANVAS_WS_URL as repository secrets.
        #
        # Inputs are passed via env vars (not direct ${{ }} interpolation) to
        # prevent shell injection from workflow_dispatch string inputs.
        shell: bash
        env:
          INPUT_PLATFORM_URL: ${{ github.event.inputs.platform_url }}
          SECRET_PLATFORM_URL: ${{ secrets.CANVAS_PLATFORM_URL }}
          INPUT_WS_URL: ${{ github.event.inputs.ws_url }}
          SECRET_WS_URL: ${{ secrets.CANVAS_WS_URL }}
        run: |
          PLATFORM_URL="${INPUT_PLATFORM_URL:-${SECRET_PLATFORM_URL:-http://localhost:8080}}"
          WS_URL="${INPUT_WS_URL:-${SECRET_WS_URL:-ws://localhost:8080/ws}}"
          echo "platform_url=${PLATFORM_URL}" >> "$GITHUB_OUTPUT"
          echo "ws_url=${WS_URL}" >> "$GITHUB_OUTPUT"
      - name: Build & push canvas image to GHCR
        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: ./canvas
          file: ./canvas/Dockerfile
          platforms: linux/amd64
          push: true
          build-args: |
            NEXT_PUBLIC_PLATFORM_URL=${{ steps.build_args.outputs.platform_url }}
            NEXT_PUBLIC_WS_URL=${{ steps.build_args.outputs.ws_url }}
          tags: |
            ${{ env.IMAGE_NAME }}:latest
            ${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
          labels: |
            org.opencontainers.image.source=https://github.com/${{ github.repository }}
            org.opencontainers.image.revision=${{ github.sha }}
            org.opencontainers.image.description=Molecule AI canvas (Next.js 15 + React Flow)
--- a/.gitea/workflows/publish-runtime-autobump.yml
+++ b/.gitea/workflows/publish-runtime-autobump.yml
@ -0,0 +1,108 @@
 name: publish-runtime-autobump
 # Auto-bump-on-workspace-edit half of the publish pipeline.
 #
 # Why this file exists (issue #351):
 #   Gitea Actions does not correctly disambiguate `paths:` from `tags:`
 #   when both are bundled under a single `on.push` key. The result is
 #   that tag pushes get filtered out and `publish-runtime.yml` never
 #   fires — `action_run` rows: 0. This was unnoticed pre-2026-05-11
 #   because PYPI_TOKEN was absent (publishes would have failed anyway).
 #
 #   Split design:
 #     - publish-runtime.yml         : on.push.tags only        (the publisher)
 #     - publish-runtime-autobump.yml: on.push.branches+paths   (this file — the version-bumper)
 #
 #   This file computes the next version from PyPI's latest, pushes a
 #   `runtime-v$VERSION` tag, and exits. The tag push then triggers
 #   publish-runtime.yml via its tags-only trigger.
 #
 # Concurrency: shares the `publish-runtime` group with publish-runtime.yml
 # so concurrent workspace pushes serialize at the bump step. Without
 # this, two pushes minutes apart could both read PyPI latest=0.1.129
 # and try to tag 0.1.130 simultaneously, only one of which would land.
 on:
  push:
    branches:
      - main
      - staging
    paths:
      - "workspace/**"
 permissions:
  contents: write  # required to push tags back
 concurrency:
  group: publish-runtime
  cancel-in-progress: false
 jobs:
  autobump-and-tag:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          # Shallow clone — depth 1 is enough for the workspace-diff check.
          # Tags needed for the collision check below are fetched explicitly
          # in the next step, bypassing the runner-network timeout that
          # full-history fetch triggers on Gitea Actions runners
          # (runbooks/gitea-operational-quirks.md §runner-network-isolation).
          fetch-depth: 1
      - name: Fetch tags for collision check
        # fetch-depth: 1 gets only the most recent commit's refs, not the
        # tag that points at it. Do a targeted tag fetch so git tag --list
        # below can detect collision with prior manual pushes.
        run: git fetch origin --tags --depth=1
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: "3.11"
      - name: Compute next version from PyPI latest
        id: bump
        run: |
          set -eu
          LATEST=$(curl -fsS --retry 3 https://pypi.org/pypi/molecule-ai-workspace-runtime/json \
            | python -c "import sys,json; print(json.load(sys.stdin)['info']['version'])")
          MAJOR=$(echo "$LATEST" | cut -d. -f1)
          MINOR=$(echo "$LATEST" | cut -d. -f2)
          PATCH=$(echo "$LATEST" | cut -d. -f3)
          VERSION="${MAJOR}.${MINOR}.$((PATCH+1))"
          echo "PyPI latest=$LATEST -> next=$VERSION"
          if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+$'; then
            echo "::error::computed version $VERSION does not match PEP 440 X.Y.Z"
            exit 1
          fi
          if git tag --list | grep -qx "runtime-v$VERSION"; then
            echo "::error::tag runtime-v$VERSION already exists in this repo. Manual intervention required (PyPI and Gitea tag history are out of sync)."
            exit 1
          fi
          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
      - name: Push runtime-v$VERSION tag
        env:
          DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
          VERSION: ${{ steps.bump.outputs.version }}
          GITEA_URL: https://git.moleculesai.app
        run: |
          set -eu
          if [ -z "$DISPATCH_TOKEN" ]; then
            echo "::error::DISPATCH_TOKEN secret is not set — needed to push the tag back to molecule-core."
            exit 1
          fi
          git config user.name  "publish-runtime autobump"
          git config user.email "publish-runtime@moleculesai.app"
          git tag -a "runtime-v$VERSION" \
            -m "Auto-bump on workspace/** edit on $GITHUB_REF" \
            -m "Triggered by: $GITHUB_REF @ $GITHUB_SHA" \
            -m "publish-runtime.yml will pick up this tag and upload to PyPI"
          # Push via DISPATCH_TOKEN (a Gitea PAT). Using the bot identity
          # ensures the resulting tag-push event is dispatched to
          # publish-runtime.yml; act_runner's default GITHUB_TOKEN cannot
          # trigger downstream workflows.
          git remote set-url origin "${GITEA_URL#https://}"
          git remote set-url origin "https://x-access-token:${DISPATCH_TOKEN}@${GITEA_URL#https://}/molecule-ai/molecule-core.git"
          git push origin "runtime-v$VERSION"
          echo "✓ pushed runtime-v$VERSION — publish-runtime.yml should fire next"
--- a/.gitea/workflows/publish-runtime.yml
+++ b/.gitea/workflows/publish-runtime.yml
@ -0,0 +1,339 @@
 name: publish-runtime
 # Gitea Actions port of .github/workflows/publish-runtime.yml.
 #
 # Ported 2026-05-10 (issue #206). Key differences from the GitHub version:
 #   - Gitea Actions reads .gitea/workflows/, not .github/workflows/
 #   - Dropped `environment: pypi-publish` — Gitea Actions does not support
 #     named environments or OIDC trusted publishers
 #   - Replaced `pypa/gh-action-pypi-publish@release/v1` (OIDC) with
 #     `twine upload` using PYPI_TOKEN secret — same mechanism as a local
 #     `python -m twine upload` with a PyPI token
 #   - Replaced `github.ref_name` (GitHub-only) with `${GITHUB_REF#refs/tags/}`
 #     — Gitea Actions exposes github.ref (the full ref) but not ref_name
 #   - Dropped `merge_group` trigger (Gitea has no merge queue)
 #
 # 2026-05-10 (issue #348): originally restored `staging`/`main` branch +
 # `workspace/**` path-filter trigger in PR #349.
 #
 # 2026-05-11 (issue #351): REVERTED the branches+paths trigger from THIS
 # file. Bundling `paths` with `tags` under a single `on.push` key caused
 # Gitea Actions to never dispatch the workflow for tag-push events (0
 # runs in `action_run` for workflow_id='publish-runtime.yml' since the
 # port, including the runtime-v1.0.0 tag — which is why PyPI is still at
 # 0.1.129 despite a v1.0.0 Gitea tag existing).
 #
 # The auto-bump-on-workspace-edit trigger now lives in
 # `.gitea/workflows/publish-runtime-autobump.yml`. That file computes the
 # next version from PyPI's latest and pushes a `runtime-v$VERSION` tag,
 # which THIS file then picks up via the tags-only trigger below.
 #
 # This decoupling means Gitea's path-vs-tag evaluator never has to
 # disambiguate — each file has a single unambiguous trigger shape.
 #
 # PyPI publishing: requires PYPI_TOKEN repository secret (or org-level secret).
 # Set via: repo Settings → Actions → Variables and Secrets → New Secret.
 # The token should be a PyPI API token scoped to molecule-ai-workspace-runtime.
 #
 # The DISPATCH_TOKEN cascade (git push to template repos) is unchanged —
 # it uses the Gitea API directly and was already Gitea-compatible.
 on:
  push:
    tags:
      - "runtime-v*"
  workflow_dispatch:
  # 2026-05-11 (root cause of #351 / 0 runs ever):
  # Gitea 1.22.6's workflow parser rejects `workflow_dispatch.inputs.version`
  # with "unknown on type" — it mis-treats the inputs sub-keys as top-level
  # `on:` event types. Log line:
  #   actions/workflows.go:DetectWorkflows() [W] ignore invalid workflow
  #   "publish-runtime.yml": unknown on type: map["version": {...}]
  # That `[W] ignore invalid workflow` is silent UX — the workflow never
  # registers, so it never fires for ANY event (push.tags included).
  # Removing the inputs block restores parsing. Manual dispatch from the
  # Gitea UI now triggers the PyPI auto-bump fallback in `Derive version`
  # below (no `inputs.version` to read).
 permissions:
  contents: read
 # Serialize publishes so two concurrent tag pushes don't both compute
 # "latest+1" and race on PyPI upload. The second one waits.
 concurrency:
  group: publish-runtime
  cancel-in-progress: false
 jobs:
  publish:
    runs-on: ubuntu-latest
    outputs:
      version: ${{ steps.version.outputs.version }}
      wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: "3.11"
          cache: pip
      - name: Derive version (tag or PyPI auto-bump)
        id: version
        run: |
          if echo "$GITHUB_REF" | grep -q "^refs/tags/runtime-v"; then
            # Tag is `runtime-vX.Y.Z` — strip the prefix.
            VERSION="${GITHUB_REF#refs/tags/runtime-v}"
          else
            # workflow_dispatch path (no inputs supported on Gitea 1.22.6) or
            # any other non-tag trigger: derive from PyPI latest + patch bump.
            LATEST=$(curl -fsS --retry 3 https://pypi.org/pypi/molecule-ai-workspace-runtime/json \
              | python -c "import sys,json; print(json.load(sys.stdin)['info']['version'])")
            MAJOR=$(echo "$LATEST" | cut -d. -f1)
            MINOR=$(echo "$LATEST" | cut -d. -f2)
            PATCH=$(echo "$LATEST" | cut -d. -f3)
            VERSION="${MAJOR}.${MINOR}.$((PATCH+1))"
            echo "Auto-bumped from PyPI latest $LATEST -> $VERSION"
          fi
          if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+(\.dev[0-9]+|rc[0-9]+|a[0-9]+|b[0-9]+|\.post[0-9]+)?$'; then
            echo "::error::version $VERSION does not match PEP 440"
            exit 1
          fi
          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
          echo "Publishing molecule-ai-workspace-runtime $VERSION"
      - name: Install build tooling
        run: pip install build twine
      - name: Build package from workspace/
        run: |
          python scripts/build_runtime_package.py \
            --version "${{ steps.version.outputs.version }}" \
            --out "${{ runner.temp }}/runtime-build"
      - name: Build wheel + sdist
        working-directory: ${{ runner.temp }}/runtime-build
        run: python -m build
      - name: Capture wheel SHA256 for cascade content-verification
        id: wheel_hash
        working-directory: ${{ runner.temp }}/runtime-build
        run: |
          set -eu
          WHEEL=$(ls dist/*.whl 2>/dev/null | head -1)
          if [ -z "$WHEEL" ]; then
            echo "::error::No .whl in dist/ — \`python -m build\` must have failed silently"
            exit 1
          fi
          HASH=$(sha256sum "$WHEEL" | awk '{print $1}')
          echo "wheel_sha256=${HASH}" >> "$GITHUB_OUTPUT"
          echo "Local wheel SHA256 (pre-upload): ${HASH}"
          echo "Wheel filename: $(basename "$WHEEL")"
      - name: Verify package contents (sanity)
        working-directory: ${{ runner.temp }}/runtime-build
        run: |
          python -m twine check dist/*
          python -m venv /tmp/smoke
          /tmp/smoke/bin/pip install --quiet dist/*.whl
          /tmp/smoke/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
      - name: Publish to PyPI
        # working-directory matches the preceding Build/Verify steps. Without
        # this, twine runs from the default workspace checkout dir where
        # `dist/` doesn't exist and fails with:
        #   ERROR InvalidDistribution: Cannot find file (or expand pattern): 'dist/*'
        # Caught on the first-ever successful dispatch of this workflow
        # (run 5097, 2026-05-11 02:08Z) — every other step in the publish
        # job already had this working-directory; Publish was missing it.
        working-directory: ${{ runner.temp }}/runtime-build
        env:
          # PYPI_TOKEN: repository secret scoped to molecule-ai-workspace-runtime.
          # Set via: Settings → Actions → Variables and Secrets → New Secret.
          # Format: pypi-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
        run: |
          if [ -z "$PYPI_TOKEN" ]; then
            echo "::error::PYPI_TOKEN secret is not set — set it at Settings → Actions → Variables and Secrets → New Secret."
            echo "::error::Required format: pypi-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
            exit 1
          fi
          python -m twine upload \
            --repository pypi \
            --username __token__ \
            --password "$PYPI_TOKEN" \
            dist/*
  cascade:
    needs: publish
    runs-on: ubuntu-latest
    steps:
      - name: Wait for PyPI to propagate the new version
        env:
          RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
          EXPECTED_SHA256: ${{ needs.publish.outputs.wheel_sha256 }}
        run: |
          set -eu
          if [ -z "$EXPECTED_SHA256" ]; then
            echo "::error::publish job did not expose wheel_sha256 — cannot verify wheel content. Refusing to fan out cascade."
            exit 1
          fi
          python -m venv /tmp/propagation-probe
          PROBE=/tmp/propagation-probe/bin
          $PROBE/pip install --upgrade --quiet pip
          for i in $(seq 1 30); do
            if $PROBE/pip install \
                  --quiet \
                  --no-cache-dir \
                  --force-reinstall \
                  --no-deps \
                  "molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
                  >/dev/null 2>&1; then
              INSTALLED=$($PROBE/pip show molecule-ai-workspace-runtime 2>/dev/null \
                          | awk -F': ' '/^Version:/{print $2}')
              if [ "$INSTALLED" = "$RUNTIME_VERSION" ]; then
                echo "✓ PyPI resolved $RUNTIME_VERSION (install check)"
                break
              fi
            fi
            if [ $i -eq 30 ]; then
              echo "::error::pip install --no-cache-dir molecule-ai-workspace-runtime==${RUNTIME_VERSION} never resolved within ~5 min."
              echo "::error::Refusing to fan out cascade against a potentially stale PyPI index."
              exit 1
            fi
            echo "  [$i/30] waiting for PyPI to propagate ${RUNTIME_VERSION}..."
            sleep 4
          done
          # Stage (b): download wheel + SHA256 compare against what we built.
          # Catches Fastly stale-content serving old bytes under a new version URL.
          #
          # Caught run 5196 (first-ever successful publish, 2026-05-11): the
          # previous one-liner `HASH=$(pip download ... && sha256sum ...)`
          # captured pip's stdout (`Collecting molecule-ai-workspace-runtime
          # ==X.Y.Z`) into HASH, then the SHA comparison failed against the
          # leaked `Collecting...` string. `2>/dev/null` silences stderr but
          # NOT stdout; pip writes its progress to stdout by default.
          # Fix: split into two steps, silence pip's stdout explicitly, capture
          # only sha256sum's output into HASH.
          python -m pip download \
            --no-deps \
            --no-cache-dir \
            --dest /tmp/wheel-probe \
            --quiet \
            "molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
            >/dev/null 2>&1
          HASH=$(sha256sum /tmp/wheel-probe/*.whl | awk '{print $1}')
          if [ "$HASH" != "$EXPECTED_SHA256" ]; then
            echo "::error::PyPI propagated $RUNTIME_VERSION but wheel content SHA256 mismatch."
            echo "::error::Expected: $EXPECTED_SHA256"
            echo "::error::Got:      $HASH"
            echo "::error::Fastly may be serving stale content. Refusing to fan out cascade."
            exit 1
          fi
          echo "✓ PyPI CDN verified (SHA256 match)"
      - name: Fan out via push to .runtime-version
        env:
          # Gitea PAT with write:repository scope on the 8 cascade-active
          # template repos. Used for git push to each template repo's main
          # branch, which trips their `on: push: branches: [main]` trigger
          # on publish-image.yml.
          DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
          RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
        run: |
          set +e   # don't abort on a single repo failure — collect them all
          if [ -z "$DISPATCH_TOKEN" ]; then
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
              echo "::warning::DISPATCH_TOKEN secret not set — skipping cascade."
              echo "::warning::set it at Settings → Actions → Variables and Secrets → New Secret."
              exit 0
            fi
            echo "::error::DISPATCH_TOKEN secret missing — cascade cannot fan out."
            echo "::error::PyPI was published, but the 8 template repos will NOT pick up the new version."
            exit 1
          fi
          VERSION="$RUNTIME_VERSION"
          if [ -z "$VERSION" ]; then
            echo "::error::publish job did not expose a version output"
            exit 1
          fi
          GITEA_URL="${GITEA_URL:-https://git.moleculesai.app}"
          TEMPLATES="claude-code hermes openclaw codex langgraph crewai autogen deepagents gemini-cli"
          FAILED=""
          SKIPPED=""
          git config --global user.name  "publish-runtime cascade"
          git config --global user.email "publish-runtime@moleculesai.app"
          WORKDIR="$(mktemp -d)"
          for tpl in $TEMPLATES; do
            REPO="molecule-ai/molecule-ai-workspace-template-$tpl"
            CLONE="$WORKDIR/$tpl"
            HTTP=$(curl -sS -o /dev/null -w "%{http_code}" \
              -H "Authorization: token $DISPATCH_TOKEN" \
              "$GITEA_URL/api/v1/repos/$REPO/contents/.github/workflows/publish-image.yml")
            if [ "$HTTP" = "404" ]; then
              echo "↷ $tpl has no publish-image.yml — soft-skip"
              SKIPPED="$SKIPPED $tpl"
              continue
            fi
            attempt=0
            success=false
            while [ $attempt -lt 3 ]; do
              attempt=$((attempt + 1))
              rm -rf "$CLONE"
              if ! git clone --depth=1 \
                  "https://x-access-token:${DISPATCH_TOKEN}@${GITEA_URL#https://}/$REPO.git" \
                  "$CLONE" >/tmp/clone.log 2>&1; then
                echo "::warning::clone $tpl attempt $attempt failed: $(tail -n3 /tmp/clone.log)"
                sleep 2
                continue
              fi
              cd "$CLONE"
              echo "$VERSION" > .runtime-version
              if git diff --quiet -- .runtime-version; then
                echo "✓ $tpl already at $VERSION — no commit needed"
                success=true
                cd - >/dev/null
                break
              fi
              git add .runtime-version
              git commit -m "chore: pin runtime to $VERSION (publish-runtime cascade)" \
                -m "Co-Authored-By: publish-runtime cascade <publish-runtime@moleculesai.app>" \
                >/dev/null
              if git push origin HEAD:main >/tmp/push.log 2>&1; then
                echo "✓ $tpl pushed $VERSION on attempt $attempt"
                success=true
                cd - >/dev/null
                break
              fi
              echo "::warning::push $tpl attempt $attempt failed, pull-rebasing"
              git pull --rebase origin main >/tmp/rebase.log 2>&1 || true
              cd - >/dev/null
            done
            if [ "$success" != "true" ]; then
              FAILED="$FAILED $tpl"
            fi
          done
          rm -rf "$WORKDIR"
          if [ -n "$FAILED" ]; then
            echo "::error::Cascade incomplete after 3 retries each. Failed:$FAILED"
            exit 1
          fi
          if [ -n "$SKIPPED" ]; then
            echo "Cascade complete: pinned $VERSION. Soft-skipped (no publish-image.yml):$SKIPPED"
          else
            echo "Cascade complete: $VERSION pinned across all manifest workspace_templates."
          fi
--- a/.gitea/workflows/publish-workspace-server-image.yml
+++ b/.gitea/workflows/publish-workspace-server-image.yml
@ -0,0 +1,174 @@
 name: publish-workspace-server-image
 # Gitea Actions port of .github/workflows/publish-workspace-server-image.yml.
 #
 # Ported 2026-05-10 (issue #228). Key differences from the GitHub version:
 #   - Gitea Actions reads .gitea/workflows/, not .github/workflows/
 #   - Dropped `environment:` declarations — Gitea Actions does not support
 #     named environments (used by GitHub OIDC token gates)
 #   - Replaced `github.ref_name` (GitHub-only) with `${GITHUB_REF#refs/heads/}`
 #     — Gitea Actions exposes GITHUB_REF in the same format as GitHub Actions
 #   - docker/setup-buildx-action and aws-actions/configure-aws-credentials are
 #     GitHub Marketplace actions; they are installed by Gitea Actions runners and
 #     work identically here
 #   - All other variables (GITHUB_SHA, GITHUB_REPOSITORY, GITHUB_OUTPUT,
 #     secrets.*) use the same syntax as GitHub Actions
 #
 # Image tags produced:
 #   :staging-<sha> — per-commit digest, stable for canary verify
 #   :staging-latest — tracks most recent build on this branch
 #
 # ECR target: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/*
 # Required secrets: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AUTO_SYNC_TOKEN
 on:
  push:
    branches: [main]
    paths:
      - 'workspace-server/**'
      - 'canvas/**'
      - 'manifest.json'
      - 'scripts/**'
      - '.gitea/workflows/publish-workspace-server-image.yml'
  workflow_dispatch:
 # Serialize per-branch so two rapid staging pushes don't race the same
 # :staging-latest tag retag. Allow staging and main to run in parallel
 # (different GITHUB_REF → different concurrency group) since they
 # produce different :staging-<sha> tags and last-write-wins on
 # :staging-latest is acceptable across branches.
 #
 # cancel-in-progress: false → in-flight builds finish; the next push's
 # build queues. This avoids a partially-pushed image.
 concurrency:
  group: publish-workspace-server-image-${{ github.ref }}
  cancel-in-progress: false
 permissions:
  contents: read
  packages: write
 env:
  IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
  TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
 jobs:
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      # Health check: verify Docker daemon is accessible before attempting any
      # build steps. This fails loudly at step 1 when the runner's docker.sock
      # is inaccessible (e.g. permission change, daemon restart, or group-membership
      # drift) rather than silently continuing to step 2 where `docker build`
      # fails deep in the process with a cryptic ECR auth error that doesn't
      # surface the root cause.  Also reports the daemon version so operator
      # can correlate with runner host logs.
      - name: Verify Docker daemon access
        run: |
          set -euo pipefail
          echo "::group::Docker daemon health check"
          docker info 2>&1 | head -5 || {
            echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
            echo "::error::Check: (1) daemon is running, (2) runner user is in docker group, (3) sock permissions are 660+"
            exit 1
          }
          echo "Docker daemon OK"
          echo "::endgroup::"
      # Pre-clone manifest deps before docker build.
      #
      # Why: workspace-template-* repos on Gitea are private. The pre-fix
      # Dockerfile.tenant ran `git clone` inside an in-image stage with no
      # auth path — every CI build failed. We clone in the trusted CI
      # context where AUTO_SYNC_TOKEN is available and Dockerfile.tenant
      # just COPYs from .tenant-bundle-deps/.
      #
      # Token: AUTO_SYNC_TOKEN is the devops-engineer persona PAT.
      # clone-manifest.sh embeds it as basic-auth for the clones, then
      # strips .git dirs — the token never enters the image.
      - name: Pre-clone manifest deps
        env:
          MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
        run: |
          set -euo pipefail
          if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
            echo "::error::AUTO_SYNC_TOKEN secret is empty"
            exit 1
          fi
          mkdir -p .tenant-bundle-deps
          bash scripts/clone-manifest.sh \
            manifest.json \
            .tenant-bundle-deps/workspace-configs-templates \
            .tenant-bundle-deps/org-templates \
            .tenant-bundle-deps/plugins
          ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
          org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
          plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
          echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
      - name: Compute tags
        id: tags
        run: |
          echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
      # Build + push platform image (inline ECR auth — mirrors the operator-host
      # approach; credentials come from GITHUB_SECRET_AWS_ACCESS_KEY_ID /
      # GITHUB_SECRET_AWS_SECRET_ACCESS_KEY in Gitea Actions).
      - name: Build & push platform image to ECR (staging-<sha> + staging-latest)
        env:
          IMAGE_NAME: ${{ env.IMAGE_NAME }}
          TAG_SHA: staging-${{ steps.tags.outputs.sha }}
          TAG_LATEST: staging-latest
          GIT_SHA: ${{ github.sha }}
          REPO: ${{ github.repository }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: us-east-2
        run: |
          set -euo pipefail
          ECR_REGISTRY="${IMAGE_NAME%%/*}"
          aws ecr get-login-password --region us-east-2 | \
            docker login --username AWS --password-stdin "${ECR_REGISTRY}"
          docker build \
            --file ./workspace-server/Dockerfile \
            --build-arg GIT_SHA="${GIT_SHA}" \
            --label "org.opencontainers.image.source=https://github.com/${REPO}" \
            --label "org.opencontainers.image.revision=${GIT_SHA}" \
            --label "org.opencontainers.image.description=Molecule AI platform — pending canary verify" \
            --tag "${IMAGE_NAME}:${TAG_SHA}" \
            --tag "${IMAGE_NAME}:${TAG_LATEST}" \
            .
          docker push "${IMAGE_NAME}:${TAG_SHA}"
          docker push "${IMAGE_NAME}:${TAG_LATEST}"
      # Build + push tenant image (Go platform + Next.js canvas in one image).
      - name: Build & push tenant image to ECR (staging-<sha> + staging-latest)
        env:
          TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
          TAG_SHA: staging-${{ steps.tags.outputs.sha }}
          TAG_LATEST: staging-latest
          GIT_SHA: ${{ github.sha }}
          REPO: ${{ github.repository }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: us-east-2
        run: |
          set -euo pipefail
          ECR_REGISTRY="${TENANT_IMAGE_NAME%%/*}"
          aws ecr get-login-password --region us-east-2 | \
            docker login --username AWS --password-stdin "${ECR_REGISTRY}"
          docker build \
            --file ./workspace-server/Dockerfile.tenant \
            --build-arg NEXT_PUBLIC_PLATFORM_URL= \
            --build-arg GIT_SHA="${GIT_SHA}" \
            --label "org.opencontainers.image.source=https://github.com/${REPO}" \
            --label "org.opencontainers.image.revision=${GIT_SHA}" \
            --label "org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify" \
            --tag "${TENANT_IMAGE_NAME}:${TAG_SHA}" \
            --tag "${TENANT_IMAGE_NAME}:${TAG_LATEST}" \
            .
          docker push "${TENANT_IMAGE_NAME}:${TAG_SHA}"
          docker push "${TENANT_IMAGE_NAME}:${TAG_LATEST}"
--- a/.gitea/workflows/railway-pin-audit.yml
+++ b/.gitea/workflows/railway-pin-audit.yml
@ -0,0 +1,181 @@
 name: Railway pin audit (drift detection)
 # Ported from .github/workflows/railway-pin-audit.yml on 2026-05-11 per
 # RFC internal#219 §1 sweep.
 #
 # Differences from the GitHub version:
 #   - Dropped `workflow_dispatch:` (Gitea 1.22.6 trigger handling).
 #     Manual runs go via cron-trigger bump or push the workflow file
 #     itself.
 #   - `actions/github-script@v9` blocks (which call github.rest.* — a
 #     GitHub-specific JS API) replaced with curl calls against the
 #     Gitea REST API (/api/v1/repos/.../issues, .../labels,
 #     .../comments). Same behaviour: open issue on drift, comment on
 #     repeat-drift, close on clean run.
 #   - Workflow-level env.GITHUB_SERVER_URL set so the curl calls can
 #     derive `git.moleculesai.app` from the runner env (with
 #     hard-coded fallback inside the steps).
 #   - `continue-on-error: true` on the job (RFC §1 contract).
 #
 # Daily audit of Railway env vars for drift-prone image-tag pins —
 # automation-cadence layer over the detection script + regression test
 # shipped in PR #2168 (#2001 closure).
 #
 # Background: on 2026-04-24 a stale `:staging-a14cf86` SHA pin in CP's
 # TENANT_IMAGE caused 3+ hours of E2E failure with the appearance that
 # "every fix didn't propagate" — really the tenant image was so old it
 # didn't read the env vars those fixes produced.
 #
 # Cadence: once a day, 13:00 UTC (06:00 PT).
 #
 # Secret hardening: per feedback_schedule_vs_dispatch_secrets_hardening,
 # the schedule trigger HARD-FAILS on missing RAILWAY_AUDIT_TOKEN.
 on:
  schedule:
    - cron: '0 13 * * *'
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 concurrency:
  group: railway-pin-audit
  cancel-in-progress: false
 permissions:
  issues: write
  contents: read
 jobs:
  audit:
    name: Audit Railway env vars for drift-prone pins
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Verify RAILWAY_AUDIT_TOKEN present
        env:
          RAILWAY_AUDIT_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
        id: secret_check
        run: |
          set -euo pipefail
          if [ -n "${RAILWAY_AUDIT_TOKEN:-}" ]; then
            echo "have_secret=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "have_secret=false" >> "$GITHUB_OUTPUT"
          echo "::error::RAILWAY_AUDIT_TOKEN secret missing — schedule trigger requires it. Provision the token (read-only \`variables\` scope on the molecule-platform Railway project) and store as repo secret RAILWAY_AUDIT_TOKEN."
          exit 1
      - name: Install Railway CLI
        if: steps.secret_check.outputs.have_secret == 'true'
        run: |
          set -euo pipefail
          curl -fsSL https://railway.com/install.sh | sh
          echo "$HOME/.railway/bin" >> "$GITHUB_PATH"
      - name: Verify Railway CLI authenticated
        if: steps.secret_check.outputs.have_secret == 'true'
        env:
          RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
        run: |
          set -euo pipefail
          if ! railway whoami >/dev/null 2>&1; then
            echo "::error::Railway CLI failed to authenticate with RAILWAY_AUDIT_TOKEN — token may be revoked or scoped incorrectly"
            exit 2
          fi
      - name: Link molecule-platform project
        if: steps.secret_check.outputs.have_secret == 'true'
        env:
          RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
        run: |
          set -euo pipefail
          railway link --project 7ccc8c68-61f4-42ab-9be5-586eeee11768
      - name: Run drift audit
        if: steps.secret_check.outputs.have_secret == 'true'
        id: audit
        env:
          RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
        run: |
          set +e
          bash scripts/ops/audit-railway-sha-pins.sh 2>&1 | tee /tmp/audit.log
          rc=${PIPESTATUS[0]}
          echo "rc=$rc" >> "$GITHUB_OUTPUT"
          # Capture the audit log for the issue body.
          {
            echo 'log<<AUDIT_EOF'
            cat /tmp/audit.log
            echo 'AUDIT_EOF'
          } >> "$GITHUB_OUTPUT"
          case "$rc" in
            0) exit 0 ;;
            1) echo "::warning::Drift-prone pin(s) detected — issue will be filed"; exit 1 ;;
            2) echo "::error::Railway CLI auth/link failed mid-script — token or project ID drift"; exit 2 ;;
            *) echo "::error::Unexpected audit rc=$rc"; exit 1 ;;
          esac
      - name: Open / update drift issue (Gitea API)
        if: failure() && steps.audit.outputs.rc == '1'
        env:
          GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          AUDIT_LOG: ${{ steps.audit.outputs.log }}
          SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
          RUN_ID: ${{ github.run_id }}
        run: |
          set -euo pipefail
          API="${SERVER_URL%/}/api/v1"
          TITLE="Railway env-var drift detected"
          RUN_URL="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
          BODY=$(jq -nc --arg t "$TITLE" --arg log "${AUDIT_LOG:-(log unavailable)}" --arg run "$RUN_URL" '
            {body: ("Daily Railway pin audit found drift-prone image-tag pins in the molecule-platform Railway project.\n\n**What this means:** an env var (likely on `controlplane`) is pinned to a SHA-shaped or semver tag instead of a floating tag. Same pattern that caused the 2026-04-24 TENANT_IMAGE incident — fix-PRs land but the running service does not pick them up.\n\n**Recovery:** open the Railway dashboard, replace the flagged value with a floating tag (:staging-latest, :main) unless the pin is intentional and documented in the ops runbook.\n\n**Audit output:**\n\n```\n" + $log + "\n```\n\nRun: " + $run + "\n\nCloses automatically when a subsequent daily run reports clean.")}')
          # Look for existing open drift issue with the title.
          EXISTING=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
            "${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
            | jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number' | head -1)
          if [ -n "$EXISTING" ]; then
            COMMENT_BODY=$(jq -nc --arg log "${AUDIT_LOG:-(log unavailable)}" --arg run "$RUN_URL" \
              '{body: ("Still drifting. " + $run + "\n\n```\n" + $log + "\n```")}')
            curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
              "${API}/repos/${REPO}/issues/${EXISTING}/comments" -d "$COMMENT_BODY" >/dev/null
            echo "Commented on existing issue #${EXISTING}"
          else
            CREATE_BODY=$(echo "$BODY" | jq --arg t "$TITLE" '. + {title: $t, labels: []}')
            NUM=$(curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
              "${API}/repos/${REPO}/issues" -d "$CREATE_BODY" | jq -r .number)
            echo "Filed issue #${NUM}"
          fi
      - name: Close stale drift issue on clean run (Gitea API)
        if: success() && steps.audit.outputs.rc == '0'
        env:
          GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
          RUN_ID: ${{ github.run_id }}
        run: |
          set -euo pipefail
          API="${SERVER_URL%/}/api/v1"
          TITLE="Railway env-var drift detected"
          RUN_URL="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
          NUMS=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
            "${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
            | jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number')
          for N in $NUMS; do
            curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
              "${API}/repos/${REPO}/issues/${N}/comments" \
              -d "$(jq -nc --arg run "$RUN_URL" '{body: ("Daily audit clean — drift resolved. " + $run)}')" >/dev/null
            curl -fsS -X PATCH -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
              "${API}/repos/${REPO}/issues/${N}" -d '{"state":"closed"}' >/dev/null
            echo "Closed #${N}"
          done
--- a/.gitea/workflows/redeploy-tenants-on-main.yml
+++ b/.gitea/workflows/redeploy-tenants-on-main.yml
@ -0,0 +1,375 @@
 name: redeploy-tenants-on-main
 # Ported from .github/workflows/redeploy-tenants-on-main.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #   - **Gitea workflow_run trigger limitation**: Gitea 1.22.6's support
 #     for the `workflow_run` event is partial. If this never fires on a
 #     real publish-workspace-server-image completion, the follow-up
 #     triage PR should replace the trigger with a push-with-paths-filter
 #     on .gitea/workflows/publish-workspace-server-image.yml. Until
 #     then continue-on-error+dead-workflow doesn't break anything.
 #
 # Auto-refresh prod tenant EC2s after every main merge.
 #
 # Why this workflow exists: publish-workspace-server-image builds and
 # pushes a new platform-tenant :<sha> to ECR on every merge to main,
 # but running tenants pulled their image once at boot and never re-pull.
 # Users see stale code indefinitely.
 #
 # This workflow closes the gap by calling the control-plane admin
 # endpoint that performs a canary-first, batched, health-gated rolling
 # redeploy across every live tenant. Implemented in molecule-ai/
 # molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
 # (feat/tenant-auto-redeploy, landing alongside this workflow).
 #
 # Registry: ECR (153263036946.dkr.ecr.us-east-2.amazonaws.com/
 # molecule-ai/platform-tenant). GHCR was retired 2026-05-07 during the
 # Gitea suspension migration. The staging-verify.yml promote step now
 # uses the same redeploy-fleet endpoint (fixes the silent-GHCR gap).
 #
 # Runtime ordering:
 #   1. publish-workspace-server-image completes → new :staging-<sha> in ECR.
 #   2. This workflow fires via workflow_run, calls redeploy-fleet with
 #      target_tag=staging-<sha>. No CDN propagation wait needed —
 #      ECR image manifest is consistent immediately after push.
 #   3. Calls redeploy-fleet with canary_slug (if set) and a soak
 #      period. Canary proves the image boots; batches follow.
 #   4. Any failure aborts the rollout and leaves older tenants on the
 #      prior image — safer default than half-and-half state.
 #
 # Rollback path: re-run this workflow with a specific SHA pinned via
 # the workflow_dispatch input. That calls redeploy-fleet with
 # target_tag=<sha>, re-pulling the older image on every tenant.
 on:
  workflow_run:
    workflows: ['publish-workspace-server-image']
    types: [completed]
    branches: [main]
 permissions:
  contents: read
  # No write scopes needed — the workflow hits an external CP endpoint,
  # not the GitHub API.
 # Serialize redeploys so two rapid main pushes' redeploys don't overlap
 # and cause confusing per-tenant SSM state. Without this, GitHub's
 # implicit workflow_run queueing would *probably* serialize them, but
 # the explicit block makes the invariant defensible. Mirrors the
 # concurrency block on redeploy-tenants-on-staging.yml for shape parity.
 #
 # cancel-in-progress: false → aborting a half-rolled-out fleet would
 # leave tenants stuck on whatever image they happened to be on when
 # cancelled. Better to finish the in-flight rollout before starting
 # the next one.
 concurrency:
  group: redeploy-tenants-on-main
  cancel-in-progress: false
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  redeploy:
    # Skip the auto-trigger if publish-workspace-server-image didn't
    # actually succeed. workflow_run fires on any completion state; we
    # don't want to redeploy against a half-built image.
    # NOTE (Gitea port): workflow_dispatch trigger dropped; only the
    # workflow_run path remains.
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    timeout-minutes: 25
    steps:
      - name: Note on ECR propagation
        # ECR image manifests are consistent immediately after push — no
        # CDN cache to wait for. The old GHCR-based workflow had a 30s
        # sleep to avoid race conditions; ECR makes that unnecessary.
        run: echo "ECR image available immediately after push — proceeding."
      - name: Compute target tag
        id: tag
        # Resolution order:
        #   1. Operator-supplied input (workflow_dispatch with explicit
        #      tag) → used verbatim. Lets ops pin `latest` for emergency
        #      rollback to last canary-verified digest, or pin a specific
        #      `staging-<sha>` to roll back to a known-good build.
        #   2. Default → `staging-<short_head_sha>`. The just-published
        #      digest. Bypasses the `:latest` retag path that's currently
        #      dead (staging-verify soft-skips without canary fleet, so
        #      the only thing retagging `:latest` today is the manual
        #      promote-latest.yml — last run 2026-04-28). Auto-trigger
        #      from workflow_run uses workflow_run.head_sha; manual
        #      dispatch with no input falls through to github.sha.
        env:
          INPUT_TAG: ${{ inputs.target_tag }}
          HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
        run: |
          set -euo pipefail
          if [ -n "${INPUT_TAG:-}" ]; then
            echo "target_tag=$INPUT_TAG" >> "$GITHUB_OUTPUT"
            echo "Using operator-pinned tag: $INPUT_TAG"
          else
            SHORT="${HEAD_SHA:0:7}"
            echo "target_tag=staging-$SHORT" >> "$GITHUB_OUTPUT"
            echo "Using auto tag: staging-$SHORT (head_sha=$HEAD_SHA)"
          fi
      - name: Call CP redeploy-fleet
        # CP_ADMIN_API_TOKEN must be set as a repo/org secret on
        # molecule-ai/molecule-core, matching the staging/prod CP's
        # CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this
        # repo's secrets for CI.
        env:
          CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }}
          CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
          TARGET_TAG: ${{ steps.tag.outputs.target_tag }}
          CANARY_SLUG: ${{ inputs.canary_slug || 'hongming' }}
          SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
          BATCH_SIZE: ${{ inputs.batch_size || '3' }}
          DRY_RUN: ${{ inputs.dry_run || false }}
        run: |
          set -euo pipefail
          if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
            echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy"
            echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
            exit 1
          fi
          BODY=$(jq -nc \
            --arg tag "$TARGET_TAG" \
            --arg canary "$CANARY_SLUG" \
            --argjson soak "$SOAK_SECONDS" \
            --argjson batch "$BATCH_SIZE" \
            --argjson dry "$DRY_RUN" \
            '{
              target_tag: $tag,
              canary_slug: $canary,
              soak_seconds: $soak,
              batch_size: $batch,
              dry_run: $dry
            }')
          echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
          echo "  body: $BODY"
          HTTP_RESPONSE=$(mktemp)
          HTTP_CODE_FILE=$(mktemp)
          # Route -w into its own tempfile so curl's exit code (e.g. 56
          # on connection-reset, 22 on --fail-with-body 4xx/5xx) can't
          # pollute the captured stdout. The previous inline-substitution
          # shape produced "000000" on connection reset (curl wrote
          # "000" via -w, then the inline echo-fallback appended another
          # "000") — caught on the 2026-05-04 redeploy of sha 2b862f6.
          # set +e/-e keeps the non-zero curl exit from tripping the
          # outer pipeline. See lint-curl-status-capture.yml for the
          # CI gate that pins this fix shape.
          set +e
          curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
            -m 1200 \
            -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
            -H "Content-Type: application/json" \
            -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
            -d "$BODY" >"$HTTP_CODE_FILE"
          set -e
          # Stderr from curl (e.g. dial errors with -sS) goes to the runner
          # log so operators can see WHY a connection failed. Stdout is
          # captured to $HTTP_CODE_FILE because that's where -w writes.
          HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
          [ -z "$HTTP_CODE" ] && HTTP_CODE="000"
          echo "HTTP $HTTP_CODE"
          cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
          # Pretty-print per-tenant results in the job summary so
          # ops can see which tenants were redeployed without drilling
          # into the raw response.
          {
            echo "## Tenant redeploy fleet"
            echo ""
            echo "**Target tag:** \`$TARGET_TAG\`"
            echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)"
            echo "**Batch size:** $BATCH_SIZE"
            echo "**Dry run:** $DRY_RUN"
            echo "**HTTP:** $HTTP_CODE"
            echo ""
            echo "### Per-tenant result"
            echo ""
            echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
            echo '|------|-------|------------|------|---------|-------|'
            jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
          } >> "$GITHUB_STEP_SUMMARY"
          if [ "$HTTP_CODE" != "200" ]; then
            echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
            exit 1
          fi
          OK=$(jq -r '.ok' "$HTTP_RESPONSE")
          if [ "$OK" != "true" ]; then
            echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
            exit 1
          fi
          echo "::notice::Tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."
          # Stash the response for the verify step. $RUNNER_TEMP outlasts
          # the step boundary; $HTTP_RESPONSE doesn't.
          cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"
      - name: Verify each tenant /buildinfo matches published SHA
        # ROOT FIX FOR #2395.
        #
        # `redeploy-fleet`'s `ssm_status=Success` means "the SSM RPC
        # didn't error" — NOT "the new image is running on the tenant."
        # `:latest` lives in the local Docker daemon's image cache; if
        # the SSM document does `docker compose up -d` without an
        # explicit `docker pull`, the daemon serves the previously-
        # cached digest and the container restarts on stale code.
        # 2026-04-30 incident: hongmingwang's tenant reported
        # ssm_status=Success at 17:00:53Z but kept serving pre-501a42d7
        # chat_files for 30+ min — the lazy-heal fix never reached the
        # user despite green deploy + green redeploy.
        #
        # This step closes the gap by curling each tenant's /buildinfo
        # endpoint (added in workspace-server/internal/buildinfo +
        # /Dockerfile* GIT_SHA build-arg, this PR) and comparing the
        # returned git_sha to the SHA the workflow expects. Mismatches
        # fail the workflow, which is what `ok=true` should have
        # guaranteed all along.
        #
        # When the redeploy was triggered by workflow_dispatch with a
        # specific tag (target_tag != "latest"), the expected SHA may
        # not equal ${{ github.sha }} — in that case we resolve via
        # GHCR's manifest. For workflow_run (default :latest) the
        # workflow_run.head_sha is the SHA that just published.
        env:
          EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
          TARGET_TAG: ${{ steps.tag.outputs.target_tag }}
          # Tenant subdomain template — slugs from the response are
          # appended. Production CP issues `<slug>.moleculesai.app`;
          # staging CP issues `<slug>.staging.moleculesai.app`. This
          # workflow runs on main → prod CP → no `staging.` infix.
          TENANT_DOMAIN: 'moleculesai.app'
        run: |
          set -euo pipefail
          EXPECTED_SHORT="${EXPECTED_SHA:0:7}"
          if [ "$TARGET_TAG" != "latest" ] \
             && [ "$TARGET_TAG" != "$EXPECTED_SHA" ] \
             && [ "$TARGET_TAG" != "staging-$EXPECTED_SHORT" ]; then
            # workflow_dispatch with a pinned tag that isn't the head
            # SHA — operator is rolling back / pinning. Skip the
            # verification because we don't have the expected SHA in
            # this context (would need to crane-inspect the GHCR
            # manifest, which is a follow-up). Failing-open here is
            # safe: the operator chose the tag deliberately.
            #
            # `staging-<short_head_sha>` IS verified — it's the new
            # auto-trigger default (see Compute target tag step) and
            # the digest under that tag SHOULD match EXPECTED_SHA.
            echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
            exit 0
          fi
          RESP="$RUNNER_TEMP/redeploy-response.json"
          if [ ! -s "$RESP" ]; then
            echo "::error::redeploy-response.json missing or empty — verify step ran without a response to read"
            exit 1
          fi
          # Pull only successfully-redeployed tenants. Any tenant that
          # halted the rollout already failed the previous step, so we
          # don't double-count them here.
          mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
          if [ ${#SLUGS[@]} -eq 0 ]; then
            echo "::warning::No tenants reported healthz_ok — nothing to verify"
            exit 0
          fi
          echo "Verifying ${#SLUGS[@]} tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."
          # Two distinct failure modes — STALE (the #2395 bug class, hard-fail)
          # vs UNREACHABLE (teardown race, soft-warn). See the staging variant's
          # comment for the full rationale; same logic applies on prod even
          # though prod has fewer ephemeral tenants — the asymmetry would be a
          # gratuitous fork.
          STALE_COUNT=0
          UNREACHABLE_COUNT=0
          STALE_LINES=()
          UNREACHABLE_LINES=()
          for slug in "${SLUGS[@]}"; do
            URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
            # 30s total: tenant just SSM-restarted, may still be coming
            # up. Retry-on-empty rather than retry-on-status — we want
            # to fail fast on "responded with wrong SHA", not "still
            # warming up".
            BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
            ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
            if [ -z "$ACTUAL_SHA" ]; then
              UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
              UNREACHABLE_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ⚠ unreachable (likely teardown race) |")
              continue
            fi
            if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
              echo "  $slug: ${ACTUAL_SHA:0:7} ✓"
            else
              STALE_COUNT=$((STALE_COUNT + 1))
              STALE_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
            fi
          done
          {
            echo ""
            echo "### Per-tenant /buildinfo verification"
            echo ""
            echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
            echo ""
            if [ $STALE_COUNT -gt 0 ]; then
              echo "**${STALE_COUNT} STALE tenant(s) — these did NOT pick up the new image despite ssm_status=Success:**"
              echo ""
              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
              echo "|------|----------------------|----------|--------|"
              for line in "${STALE_LINES[@]}"; do echo "$line"; done
              echo ""
            fi
            if [ $UNREACHABLE_COUNT -gt 0 ]; then
              echo "**${UNREACHABLE_COUNT} unreachable tenant(s) — likely teardown race (soft-warn, not failing):**"
              echo ""
              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
              echo "|------|----------------------|----------|--------|"
              for line in "${UNREACHABLE_LINES[@]}"; do echo "$line"; done
              echo ""
            fi
            if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
              echo "All ${#SLUGS[@]} tenants returned matching SHA. ✓"
            fi
          } >> "$GITHUB_STEP_SUMMARY"
          if [ $UNREACHABLE_COUNT -gt 0 ]; then
            echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
          fi
          # Belt-and-suspenders sanity floor: same logic as the staging
          # variant — see that file's comment for the full rationale.
          # Floor only applies when fleet >= 4; below that, staging-verify
          # is the actual gate.
          TOTAL_VERIFIED=${#SLUGS[@]}
          if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
            echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
            exit 1
          fi
          if [ $STALE_COUNT -gt 0 ]; then
            echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
            exit 1
          fi
          echo "::notice::Tenant fleet redeploy complete — all reachable tenants on ${EXPECTED_SHA:0:7} (${UNREACHABLE_COUNT} unreachable, soft-warned)."
--- a/.gitea/workflows/redeploy-tenants-on-staging.yml
+++ b/.gitea/workflows/redeploy-tenants-on-staging.yml
@ -0,0 +1,356 @@
 name: redeploy-tenants-on-staging
 # Ported from .github/workflows/redeploy-tenants-on-staging.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #   - **Gitea workflow_run trigger limitation**: Gitea 1.22.6's support
 #     for the `workflow_run` event is partial. If this never fires on a
 #     real publish-workspace-server-image completion, the follow-up
 #     triage PR should replace the trigger with a push-with-paths-filter
 #     on .gitea/workflows/publish-workspace-server-image.yml. Until
 #     then continue-on-error+dead-workflow doesn't break anything.
 #
 # Auto-refresh staging tenant EC2s after every staging-branch merge.
 #
 # Mirror of redeploy-tenants-on-main.yml, with the staging-CP host and
 # the :staging-latest tag. Sister workflow exists for prod (rolls
 # :latest after staging-verify). Both share the same shape — just
 # different CP_URL + target_tag + admin token secret.
 #
 # Why this workflow exists: publish-workspace-server-image now builds
 # on every staging-branch push (PR #2335), pushing
 # platform-tenant:staging-latest to GHCR. Existing tenants pulled
 # their image once at boot and never re-pull, so the new image just
 # sits unused until the tenant is reprovisioned.
 #
 # This workflow closes the gap by calling staging-CP's
 # /cp/admin/tenants/redeploy-fleet, which performs a canary-first,
 # batched, health-gated SSM redeploy across every live staging tenant.
 # Same endpoint shape as prod CP — only the host differs.
 #
 # Runtime ordering:
 #   1. publish-workspace-server-image completes on staging branch →
 #      new :staging-latest in GHCR.
 #   2. This workflow fires via workflow_run, waits 30s for GHCR's CDN
 #      to propagate the new tag.
 #   3. Calls redeploy-fleet with no canary (staging IS canary; we don't
 #      need a sub-canary inside it). Soak still applies to the first
 #      tenant in case of bad-deploy detection.
 #   4. Any failure aborts the rollout and leaves older tenants on the
 #      prior image — safer default than half-and-half state.
 #
 # Rollback path: re-run with workflow_dispatch + target_tag=staging-<sha>
 # of a known-good build.
 on:
  workflow_run:
    workflows: ['publish-workspace-server-image']
    types: [completed]
    branches: [main]
 permissions:
  contents: read
  # No write scopes needed — the workflow hits an external CP endpoint,
  # not the GitHub API.
 # Serialize per-branch so two rapid staging pushes' redeploys don't
 # overlap and cause confusing per-tenant SSM state. cancel-in-progress
 # is false because aborting a half-rolled-out fleet leaves tenants
 # stuck on whatever image they happened to be on when cancelled.
 concurrency:
  group: redeploy-tenants-on-staging
  cancel-in-progress: false
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  redeploy:
    # Skip the auto-trigger if publish-workspace-server-image didn't
    # actually succeed. workflow_run fires on any completion state; we
    # don't want to redeploy against a half-built image.
    # NOTE (Gitea port): workflow_dispatch trigger dropped; only the
    # workflow_run path remains.
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    timeout-minutes: 25
    steps:
      - name: Wait for GHCR tag propagation
        # GHCR's edge cache takes ~15-30s to consistently serve the new
        # :staging-latest manifest after the registry accepts the push.
        # Same rationale as redeploy-tenants-on-main.yml.
        run: sleep 30
      - name: Call staging-CP redeploy-fleet
        # CP_STAGING_ADMIN_API_TOKEN must be set as a repo/org secret
        # on molecule-ai/molecule-core, matching staging-CP's
        # CP_ADMIN_API_TOKEN env var (visible in Railway controlplane
        # / staging environment). Stored separately from the prod
        # CP_ADMIN_API_TOKEN so a leak of one doesn't auth the other.
        env:
          CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
          CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
          TARGET_TAG: ${{ inputs.target_tag || 'staging-latest' }}
          CANARY_SLUG: ${{ inputs.canary_slug || '' }}
          SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
          BATCH_SIZE: ${{ inputs.batch_size || '3' }}
          DRY_RUN: ${{ inputs.dry_run || false }}
        run: |
          set -euo pipefail
          # Schedule-vs-dispatch hardening (mirrors sweep-cf-orphans
          # and sweep-cf-tunnels): hard-fail on auto-trigger when the
          # secret is missing so a misconfigured-repo doesn't silently
          # serve stale staging tenants. Soft-skip on operator dispatch.
          if [ -z "${CP_STAGING_ADMIN_API_TOKEN:-}" ]; then
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
              echo "::warning::CP_STAGING_ADMIN_API_TOKEN secret not set — skipping redeploy"
              echo "::warning::Set CP_STAGING_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
              echo "::notice::Pull the value from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
              exit 0
            fi
            echo "::error::staging redeploy cannot run — CP_STAGING_ADMIN_API_TOKEN secret missing"
            echo "::error::set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
            exit 1
          fi
          BODY=$(jq -nc \
            --arg tag "$TARGET_TAG" \
            --arg canary "$CANARY_SLUG" \
            --argjson soak "$SOAK_SECONDS" \
            --argjson batch "$BATCH_SIZE" \
            --argjson dry "$DRY_RUN" \
            '{
              target_tag: $tag,
              canary_slug: $canary,
              soak_seconds: $soak,
              batch_size: $batch,
              dry_run: $dry
            }')
          echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
          echo "  body: $BODY"
          HTTP_RESPONSE=$(mktemp)
          HTTP_CODE_FILE=$(mktemp)
          # Route -w into its own tempfile so curl's exit code (e.g. 56
          # on connection-reset) can't pollute the captured stdout. The
          # previous inline-substitution shape produced "000000" on
          # connection reset — caught on main variant 2026-05-04
          # redeploying sha 2b862f6. Same fix shape as the synth-E2E
          # §9c gate (PR #2797). See lint-curl-status-capture.yml for
          # the CI gate that pins this fix shape.
          set +e
          curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
            -m 1200 \
            -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \
            -H "Content-Type: application/json" \
            -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
            -d "$BODY" >"$HTTP_CODE_FILE"
          set -e
          # Stderr from curl (-sS shows dial errors etc.) goes to the
          # runner log so operators can see WHY a connection failed.
          HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
          [ -z "$HTTP_CODE" ] && HTTP_CODE="000"
          echo "HTTP $HTTP_CODE"
          cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
          {
            echo "## Staging tenant redeploy fleet"
            echo ""
            echo "**Target tag:** \`$TARGET_TAG\`"
            echo "**Canary:** \`${CANARY_SLUG:-(none — staging is itself the canary)}\` (soak ${SOAK_SECONDS}s)"
            echo "**Batch size:** $BATCH_SIZE"
            echo "**Dry run:** $DRY_RUN"
            echo "**HTTP:** $HTTP_CODE"
            echo ""
            echo "### Per-tenant result"
            echo ""
            echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
            echo '|------|-------|------------|------|---------|-------|'
            jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
          } >> "$GITHUB_STEP_SUMMARY"
          # Distinguish "real fleet failure" from "E2E teardown race".
          #
          # CP returns HTTP 500 + ok=false whenever ANY tenant in the
          # fleet failed SSM or healthz. In practice the recurring source
          # of these is ephemeral test tenants being torn down by their
          # parent E2E run mid-redeploy: the EC2 dies → SSM exit=2 or
          # healthz timeout → CP marks the fleet failed → this workflow
          # goes red even though every operator-facing tenant rolled fine.
          #
          # Ephemeral slug prefixes (kept in sync with sweep-stale-e2e-orgs.yml
          # — see that file for the source-of-truth list and rationale):
          #   - e2e-*       — canvas/saas/ext E2E suites
          #   - rt-e2e-*    — runtime-test harness fixtures (RFC #2251)
          # Long-lived prefixes that are NOT ephemeral and MUST hard-fail:
          # demo-prep, dryrun-*, dryrun2-*, plus all human tenant slugs.
          #
          # Filter: if HTTP=500/ok=false AND every failed slug matches an
          # ephemeral prefix, treat as soft-warn and let the verify step
          # downstream handle unreachable-vs-stale (#2402). Any non-ephemeral
          # failure or a non-500 HTTP response remains a hard failure.
          OK=$(jq -r '.ok // "false"' "$HTTP_RESPONSE")
          FAILED_SLUGS=$(jq -r '
            .results[]?
            | select((.healthz_ok != true) or (.ssm_status != "Success"))
            | .slug' "$HTTP_RESPONSE" 2>/dev/null || true)
          EPHEMERAL_PREFIX_RE='^(e2e-|rt-e2e-)'
          NON_EPHEMERAL_FAILED=$(printf '%s\n' "$FAILED_SLUGS" | grep -v '^$' | grep -Ev "$EPHEMERAL_PREFIX_RE" || true)
          if [ "$HTTP_CODE" = "200" ] && [ "$OK" = "true" ]; then
            : # happy path — fall through to verification
          elif [ "$HTTP_CODE" = "500" ] && [ -z "$NON_EPHEMERAL_FAILED" ] && [ -n "$FAILED_SLUGS" ]; then
            COUNT=$(printf '%s\n' "$FAILED_SLUGS" | grep -Ec "$EPHEMERAL_PREFIX_RE" || true)
            echo "::warning::redeploy-fleet returned HTTP 500 but every failed tenant ($COUNT) is ephemeral (e2e-*/rt-e2e-*) — treating as teardown race, soft-warning."
            printf '%s\n' "$FAILED_SLUGS" | sed 's/^/::warning::  failed: /'
          elif [ "$HTTP_CODE" != "200" ]; then
            echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
            if [ -n "$NON_EPHEMERAL_FAILED" ]; then
              echo "::error::non-ephemeral tenant(s) failed:"
              printf '%s\n' "$NON_EPHEMERAL_FAILED" | sed 's/^/::error::  /'
            fi
            exit 1
          else
            # HTTP=200 but ok=false (shouldn't happen with current CP
            # but keep the gate for completeness).
            echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
            exit 1
          fi
          echo "::notice::Staging tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."
          cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"
      - name: Verify each staging tenant /buildinfo matches published SHA
        # Mirror of the verify step in redeploy-tenants-on-main.yml — see
        # there for the rationale (#2395 root fix). Staging has the same
        # ssm_status-success-but-stale-image hazard and benefits from the
        # same gate. Diff: TENANT_DOMAIN includes the `staging.` infix.
        env:
          EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
          TARGET_TAG: ${{ inputs.target_tag || 'staging-latest' }}
          TENANT_DOMAIN: 'staging.moleculesai.app'
        run: |
          set -euo pipefail
          # staging-latest is the staging-side moving tag; treat it the
          # same way main treats `latest`. Operator-pinned SHAs skip
          # verification (see main variant for why).
          if [ "$TARGET_TAG" != "staging-latest" ] && [ "$TARGET_TAG" != "latest" ] && [ "$TARGET_TAG" != "$EXPECTED_SHA" ]; then
            echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
            exit 0
          fi
          RESP="$RUNNER_TEMP/redeploy-response.json"
          if [ ! -s "$RESP" ]; then
            echo "::error::redeploy-response.json missing or empty"
            exit 1
          fi
          mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
          if [ ${#SLUGS[@]} -eq 0 ]; then
            echo "::warning::No staging tenants reported healthz_ok — nothing to verify"
            exit 0
          fi
          echo "Verifying ${#SLUGS[@]} staging tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."
          # Two distinct failure modes here:
          #   STALE_COUNT      — tenant returned a SHA that doesn't match. THIS is
          #                      the #2395 bug class: tenant up + serving old code.
          #                      Always hard-fail the workflow.
          #   UNREACHABLE_COUNT — tenant didn't respond. Almost always a benign
          #                      teardown race: redeploy-fleet snapshot says
          #                      healthz_ok=true, then the E2E suite tears the
          #                      ephemeral tenant down before this step runs (the
          #                      e2e-* fixtures churn 5-10/hour on staging). Soft-
          #                      warn so we don't block staging→main on cleanup.
          #                      Real "tenant up but unreachable" is caught by CP's
          #                      own healthz monitor + the post-redeploy alert; we
          #                      don't need to double-count it here.
          STALE_COUNT=0
          UNREACHABLE_COUNT=0
          STALE_LINES=()
          UNREACHABLE_LINES=()
          for slug in "${SLUGS[@]}"; do
            URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
            BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
            ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
            if [ -z "$ACTUAL_SHA" ]; then
              UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
              UNREACHABLE_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ⚠ unreachable (likely teardown race) |")
              continue
            fi
            if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
              echo "  $slug: ${ACTUAL_SHA:0:7} ✓"
            else
              STALE_COUNT=$((STALE_COUNT + 1))
              STALE_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
            fi
          done
          {
            echo ""
            echo "### Per-tenant /buildinfo verification (staging)"
            echo ""
            echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
            echo ""
            if [ $STALE_COUNT -gt 0 ]; then
              echo "**${STALE_COUNT} STALE tenant(s) — these did NOT pick up the new image despite ssm_status=Success:**"
              echo ""
              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
              echo "|------|----------------------|----------|--------|"
              for line in "${STALE_LINES[@]}"; do echo "$line"; done
              echo ""
            fi
            if [ $UNREACHABLE_COUNT -gt 0 ]; then
              echo "**${UNREACHABLE_COUNT} unreachable tenant(s) — likely E2E teardown race (soft-warn, not failing):**"
              echo ""
              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
              echo "|------|----------------------|----------|--------|"
              for line in "${UNREACHABLE_LINES[@]}"; do echo "$line"; done
              echo ""
            fi
            if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
              echo "All ${#SLUGS[@]} staging tenants returned matching SHA. ✓"
            fi
          } >> "$GITHUB_STEP_SUMMARY"
          if [ $UNREACHABLE_COUNT -gt 0 ]; then
            echo "::warning::$UNREACHABLE_COUNT staging tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
          fi
          # Belt-and-suspenders sanity floor: if MORE than half the fleet is
          # unreachable AND the fleet is large enough that "half down" is
          # statistically meaningful, this is a real outage (e.g. new image
          # crashes on startup), not a teardown race. Hard-fail.
          #
          # Floor only applies when TOTAL_VERIFIED >= 4 — below that, the
          # staging-verify step is the actual gate for "all tenants down"
          # detection (it runs against the canary first and aborts the
          # rollout if the canary fails to come up). Without the >=4 gate,
          # a 1-tenant fleet (e.g. a single ephemeral e2e-* tenant on a
          # quiet staging push) would re-flake on the exact teardown-race
          # condition #2402 fixed: 1 of 1 unreachable = 100% > 50% → fail.
          TOTAL_VERIFIED=${#SLUGS[@]}
          if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
            echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
            exit 1
          fi
          if [ $STALE_COUNT -gt 0 ]; then
            echo "::error::$STALE_COUNT staging tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
            exit 1
          fi
          echo "::notice::Staging tenant fleet redeploy complete — all reachable tenants on ${EXPECTED_SHA:0:7} (${UNREACHABLE_COUNT} unreachable, soft-warned)."
--- a/.gitea/workflows/runtime-pin-compat.yml
+++ b/.gitea/workflows/runtime-pin-compat.yml
@ -0,0 +1,100 @@
 name: Runtime Pin Compatibility
 # Ported from .github/workflows/runtime-pin-compat.yml on 2026-05-11 per
 # RFC internal#219 §1 sweep.
 #
 # Differences from the GitHub version:
 #   - Dropped `merge_group:` (no Gitea merge queue) and
 #     `workflow_dispatch:` (no inputs, but the trigger itself is
 #     parser-rejected when inputs are absent in some Gitea 1.22.x
 #     builds; safest to drop entirely — manual runs go via cron-trigger
 #     bump or push-with-paths-filter).
 #   - on.paths references .gitea/workflows/runtime-pin-compat.yml (this
 #     file) instead of the .github/ one.
 #   - Workflow-level env.GITHUB_SERVER_URL set.
 #   - `continue-on-error: true` on the job (RFC §1 contract).
 #
 # CI gate that prevents the 5-hour staging outage from 2026-04-24 from
 # recurring (controlplane#253). The original failure mode:
 #   1. molecule-ai-workspace-runtime 0.1.13 declared `a2a-sdk<1.0` in its
 #      requires_dist metadata (incorrect — it actually imports
 #      a2a.server.routes which only exists in a2a-sdk 1.0+)
 #   2. `pip install molecule-ai-workspace-runtime` resolved cleanly
 #   3. `from molecule_runtime.main import main_sync` raised ImportError
 #   4. Every tenant workspace crashed; the canary tenant caught it but
 #      only after 5 hours of degraded staging
 #
 # This workflow installs the CURRENTLY PUBLISHED runtime from PyPI on
 # top of `workspace/requirements.txt` and smoke-imports. Catches:
 #   - Upstream PyPI yanks
 #   - Bad re-releases of molecule-ai-workspace-runtime
 #   - Already-shipped wheels that stop importing because a transitive
 #     dep moved underneath
 on:
  push:
    branches: [main, staging]
    paths:
      # Narrow filter: pypi-latest is sensitive only to changes that
      # affect what we're INSTALLING (requirements.txt) or WHAT THE
      # CHECK ITSELF DOES (this workflow file). Edits to workspace/
      # source code don't change what's on PyPI right now, so they
      # don't change this gate's verdict.
      - 'workspace/requirements.txt'
      - '.gitea/workflows/runtime-pin-compat.yml'
  pull_request:
    branches: [main, staging]
    paths:
      - 'workspace/requirements.txt'
      - '.gitea/workflows/runtime-pin-compat.yml'
  # Daily catch for upstream PyPI publishes that break the pin combo
  # without any change in our repo (e.g. someone re-yanks an a2a-sdk
  # release or molecule-ai-workspace-runtime publishes a bad bump).
  schedule:
    - cron: '0 13 * * *'  # 06:00 PT
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  pypi-latest-install:
    name: PyPI-latest install + import smoke
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: '3.11'
          cache: pip
          cache-dependency-path: workspace/requirements.txt
      - name: Install runtime + workspace requirements
        # Install order is load-bearing: install the runtime FIRST so pip
        # honors whatever a2a-sdk constraint the runtime metadata declares
        # (this is the surface that broke in 2026-04-24 — runtime declared
        # `a2a-sdk<1.0` but actually needed >=1.0). The follow-up install
        # of workspace/requirements.txt then upgrades a2a-sdk to the
        # constraint our runtime image actually pins. The import smoke
        # below verifies the upgraded combination is consistent.
        run: |
          python -m venv /tmp/venv
          /tmp/venv/bin/pip install --upgrade pip
          /tmp/venv/bin/pip install molecule-ai-workspace-runtime
          /tmp/venv/bin/pip install -r workspace/requirements.txt
          /tmp/venv/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
            | grep -E '^(Name|Version):'
      - name: Smoke import — fail if metadata declares deps that don't satisfy real imports
        # WORKSPACE_ID is validated at import time by platform_auth.py — EC2
        # user-data sets it from the cloud-init template; set a placeholder
        # here so the import smoke doesn't trip on the env-var guard.
        env:
          WORKSPACE_ID: 00000000-0000-0000-0000-000000000001
        run: |
          /tmp/venv/bin/python -c "from molecule_runtime.main import main_sync; print('runtime imports OK')"
--- a/.gitea/workflows/runtime-prbuild-compat.yml
+++ b/.gitea/workflows/runtime-prbuild-compat.yml
@ -0,0 +1,139 @@
 name: Runtime PR-Built Compatibility
 # Ported from .github/workflows/runtime-prbuild-compat.yml on 2026-05-11
 # per RFC internal#219 §1 sweep.
 #
 # Differences from the GitHub version:
 #   - Dropped `merge_group:` (no Gitea merge queue) and `workflow_dispatch:`
 #     (Gitea 1.22.6 parser-rejects workflow_dispatch with inputs and is
 #     finicky without them).
 #   - `dorny/paths-filter@v4` replaced with inline `git diff` (per PR#372
 #     pattern for ci.yml port).
 #   - on.paths references .gitea/workflows/runtime-prbuild-compat.yml.
 #   - Workflow-level env.GITHUB_SERVER_URL set.
 #   - `continue-on-error: true` on every job (RFC §1 contract).
 #
 # Companion to `runtime-pin-compat.yml`. That workflow tests what's
 # CURRENTLY PUBLISHED on PyPI; this workflow tests what WOULD BE
 # PUBLISHED if THIS PR merges.
 #
 # Why two workflows: the chicken-and-egg #128 fix added a "PR-built
 # wheel" job to the original runtime-pin-compat.yml, but both jobs
 # shared a `paths:` filter that was the union of their needs
 # (`workspace/**`). That meant the PyPI-latest job ran on every doc
 # edit even though the upstream PyPI artifact can't change with our
 # workspace/ source. Splitting the two means each gets a narrow
 # `paths:` filter that matches the inputs it actually depends on.
 #
 # Catches the failure mode where a PR adds an import requiring a newer
 # SDK than `workspace/requirements.txt` pins:
 #   1. Pip resolves the existing PyPI wheel + the old SDK pin -> smoke
 #      passes (it imports the OLD main.py from the wheel, not the PR's
 #      new main.py).
 #   2. Merge -> publish-runtime.yml ships a wheel WITH the new import.
 #   3. Tenant images redeploy -> all crash on first boot with ImportError.
 on:
  push:
    branches: [main, staging]
  pull_request:
    branches: [main, staging]
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 concurrency:
  # event_name + sha keeps PR sync and the subsequent staging push on the
  # same SHA from cancelling each other (per feedback_concurrency_group_per_sha).
  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.pull_request.head.sha || github.sha }}
  cancel-in-progress: true
 jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    outputs:
      wheel: ${{ steps.decide.outputs.wheel }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
      - id: decide
        run: |
          # Inline replacement for dorny/paths-filter — same pattern
          # PR#372's ci.yml port used. Diffs against the PR base or the
          # previous push SHA, then matches against the wheel-relevant
          # path set.
          BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
          if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
            BASE="${{ github.event.pull_request.base.sha }}"
          fi
          if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
            # New branch or no previous SHA: treat as wheel-relevant.
            echo "wheel=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          if ! git cat-file -e "$BASE" 2>/dev/null; then
            git fetch --depth=1 origin "$BASE" 2>/dev/null || true
          fi
          if ! git cat-file -e "$BASE" 2>/dev/null; then
            echo "wheel=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          CHANGED=$(git diff --name-only "$BASE" HEAD)
          if echo "$CHANGED" | grep -qE '^(workspace/|scripts/build_runtime_package\.py$|scripts/wheel_smoke\.py$|\.gitea/workflows/runtime-prbuild-compat\.yml$)'; then
            echo "wheel=true" >> "$GITHUB_OUTPUT"
          else
            echo "wheel=false" >> "$GITHUB_OUTPUT"
          fi
  # ONE job (no job-level `if:`) that always runs and reports under the
  # required-check name `PR-built wheel + import smoke`. Real work is
  # gated per-step on `needs.detect-changes.outputs.wheel`.
  local-build-install:
    needs: detect-changes
    name: PR-built wheel + import smoke
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    steps:
      - name: No-op pass (paths filter excluded this commit)
        if: needs.detect-changes.outputs.wheel != 'true'
        run: |
          echo "No workspace/ / scripts/{build_runtime_package,wheel_smoke}.py / workflow changes — wheel gate satisfied without rebuilding."
          echo "::notice::PR-built wheel + import smoke no-op pass (paths filter excluded this commit)."
      - if: needs.detect-changes.outputs.wheel == 'true'
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - if: needs.detect-changes.outputs.wheel == 'true'
        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: '3.11'
          cache: pip
          cache-dependency-path: workspace/requirements.txt
      - name: Install build tooling
        if: needs.detect-changes.outputs.wheel == 'true'
        run: pip install build
      - name: Build wheel from PR source (mirrors publish-runtime.yml)
        if: needs.detect-changes.outputs.wheel == 'true'
        # Use a fixed test version so the wheel filename is predictable.
        # Doesn't reach PyPI — this build is local-only for the smoke.
        run: |
          python scripts/build_runtime_package.py \
            --version "0.0.0.dev0+pin-compat" \
            --out /tmp/runtime-build
          cd /tmp/runtime-build && python -m build
      - name: Install built wheel + workspace requirements
        if: needs.detect-changes.outputs.wheel == 'true'
        run: |
          python -m venv /tmp/venv-built
          /tmp/venv-built/bin/pip install --upgrade pip
          /tmp/venv-built/bin/pip install /tmp/runtime-build/dist/*.whl
          /tmp/venv-built/bin/pip install -r workspace/requirements.txt
          /tmp/venv-built/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
            | grep -E '^(Name|Version):'
      - name: Smoke import the PR-built wheel
        if: needs.detect-changes.outputs.wheel == 'true'
        # Same script publish-runtime.yml runs against the to-be-PyPI wheel.
        run: |
          /tmp/venv-built/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
--- a/.gitea/workflows/secret-pattern-drift.yml
+++ b/.gitea/workflows/secret-pattern-drift.yml
@ -0,0 +1,70 @@
 name: SECRET_PATTERNS drift lint
 # Ported from .github/workflows/secret-pattern-drift.yml on 2026-05-11
 # per RFC internal#219 §1 sweep.
 #
 # Differences from the GitHub version:
 #   - on.paths references the new canonical .gitea/workflows/secret-scan.yml
 #     (the .github/ copy is removed by Cat A of this sweep).
 #   - CANONICAL_FILE inside scripts/lint_secret_pattern_drift.py was
 #     updated in the same Cat C-1 PR to point at .gitea/workflows/secret-scan.yml.
 #   - Workflow-level env.GITHUB_SERVER_URL set.
 #   - `continue-on-error: true` on the job (RFC §1 contract).
 #
 # Detects when the canonical SECRET_PATTERNS array in
 # .gitea/workflows/secret-scan.yml diverges from known consumer
 # mirrors (workspace-runtime's bundled pre-commit hook today; more
 # can be added as the consumer set grows).
 #
 # Why this exists: every side that scans for credentials has its own
 # copy of the pattern list. They drift — most recently the runtime
 # hook lagged the canonical by one pattern (sk-cp- / MiniMax F1088),
 # so a developer's local pre-commit would let a sk-cp- token through
 # while the org-wide CI scan would refuse it. The cost of that drift
 # is dev confusion + delayed feedback; the fix is automated detection.
 #
 # Triggers:
 #   - schedule: daily 05:00 UTC. Catches drift introduced by edits
 #     to a consumer copy that didn't update canonical here.
 #   - push to main/staging where the canonical or this lint changed:
 #     catches the inverse — canonical updated but consumers not yet
 #     bumped. The lint will fail the push; that's intentional.
 on:
  schedule:
    # 05:00 UTC = 22:00 PT / 01:00 ET. Quiet hours so a failure
    # email lands when humans are starting their day, not
    # interrupting it.
    - cron: "0 5 * * *"
  push:
    branches: [main, staging]
    paths:
      - ".gitea/workflows/secret-scan.yml"
      - ".gitea/workflows/secret-pattern-drift.yml"
      - ".github/scripts/lint_secret_pattern_drift.py"
      - ".githooks/pre-commit"
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 # Auto-injected GITHUB_TOKEN scoped to read-only. The lint only does git
 # checkout + HTTPS GETs to public consumer files; no writes to anything.
 permissions:
  contents: read
 jobs:
  lint:
    name: Detect SECRET_PATTERNS drift
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: "3.11"
      - name: Run drift lint
        run: python3 .github/scripts/lint_secret_pattern_drift.py
--- a/.github/workflows/secret-scan.yml
+++ b/.github/workflows/secret-scan.yml
@ -7,33 +7,24 @@ name: Secret scan
 # slurping the URL from a token-embedded origin remote. We can't fix
 # upstream's clone hygiene, so we gate here.
 #
 # Also the canonical reusable workflow for the rest of the org. Other
 # Molecule-AI repos enroll with a single 3-line workflow:
 #
 #   jobs:
 #     secret-scan:
 #       uses: molecule-ai/molecule-core/.github/workflows/secret-scan.yml@staging
 #
 # Pin to @staging not @main — staging is the active default branch,
 # main lags via the staging-promotion workflow. Updates ride along
 # automatically on the next consumer workflow run.
 #
 # Same regex set as the runtime's bundled pre-commit hook
 # (molecule-ai-workspace-runtime: molecule_runtime/scripts/pre-commit-checks.sh).
 # Keep the two sides aligned when adding patterns.
 #
 # Ported from .github/workflows/secret-scan.yml so the gate actually
 # fires on Gitea Actions. Differences from the GitHub version:
 #   - drops `merge_group` event (Gitea has no merge queue)
 #   - drops `workflow_call` (no cross-repo reusable invocation on Gitea)
 #   - SELF path updated to .gitea/workflows/secret-scan.yml
 # The job name + step name are identical to the GitHub workflow so the
 # status-check context (`Secret scan / Scan diff for credential-shaped
 # strings (pull_request)`) matches branch protection on molecule-core/main.
 on:
  pull_request:
    types: [opened, synchronize, reopened]
  push:
    branches: [main, staging]
  # Required for GitHub merge queue: the queue's pre-merge CI run on
  # `gh-readonly-queue/...` refs needs this check to fire so the queue
  # gets a real result instead of stalling forever AWAITING_CHECKS.
  merge_group:
    types: [checks_requested]
  # Reusable workflow entry point for other Molecule-AI repos.
  workflow_call:
 jobs:
  scan:
@ -50,27 +41,14 @@ jobs:
        if: github.event_name == 'pull_request'
        run: git fetch --depth=1 origin ${{ github.event.pull_request.base.sha }}
      # For merge_group events the queue's pre-merge ref is a commit on
      # `gh-readonly-queue/...` whose parent is the queue's base_sha.
      # That parent isn't part of the queue branch's shallow clone, so
      # we fetch it explicitly. Without this the diff falls through to
      # "no BASE → scan entire tree" mode and false-positives on legit
      # test fixtures (e.g. canvas/src/lib/validation/__tests__/secret-formats.test.ts).
      - name: Fetch merge_group base SHA (merge_group events only)
        if: github.event_name == 'merge_group'
        run: git fetch --depth=1 origin ${{ github.event.merge_group.base_sha }}
      - name: Refuse if credential-shaped strings appear in diff additions
        env:
          # Plumb event-specific SHAs through env so the script doesn't
          # need conditional `${{ ... }}` interpolation per event type.
          # github.event.before/after only exist on push events;
-          # merge_group has its own base_sha/head_sha; pull_request has
+          # pull_request has pull_request.base.sha / pull_request.head.sha.
          # pull_request.base.sha / pull_request.head.sha.
          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
          PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
          MG_BASE_SHA: ${{ github.event.merge_group.base_sha }}
          MG_HEAD_SHA: ${{ github.event.merge_group.head_sha }}
          PUSH_BEFORE: ${{ github.event.before }}
          PUSH_AFTER: ${{ github.event.after }}
        run: |
@ -102,10 +80,6 @@ jobs:
              BASE="$PR_BASE_SHA"
              HEAD="$PR_HEAD_SHA"
              ;;
            merge_group)
              BASE="$MG_BASE_SHA"
              HEAD="$MG_HEAD_SHA"
              ;;
            *)
              BASE="$PUSH_BEFORE"
              HEAD="$PUSH_AFTER"
@ -144,8 +118,10 @@ jobs:
          # Self-exclude: this workflow file legitimately contains the
          # pattern strings as regex literals. Without an exclude it would
-          # block its own merge.
+          # block its own merge. Both the .github/ original and this
-          SELF=".github/workflows/secret-scan.yml"
+          # .gitea/ port are excluded so a sync between them stays clean.
          SELF_GITHUB=".github/workflows/secret-scan.yml"
          SELF_GITEA=".gitea/workflows/secret-scan.yml"
          OFFENDING=""
          # `while IFS= read -r` (not `for f in $CHANGED`) so filenames
@ -155,7 +131,8 @@ jobs:
          # self-exclude + diff lookup.
          while IFS= read -r f; do
            [ -z "$f" ] && continue
-            [ "$f" = "$SELF" ] && continue
+            [ "$f" = "$SELF_GITHUB" ] && continue
            [ "$f" = "$SELF_GITEA" ] && continue
            if [ -n "$DIFF_RANGE" ]; then
              ADDED=$(git diff --no-color --unified=0 "$BASE" "$HEAD" -- "$f" 2>/dev/null | grep -E '^\+[^+]' || true)
            else
--- a/.gitea/workflows/sop-tier-check.yml
+++ b/.gitea/workflows/sop-tier-check.yml
@ -0,0 +1,126 @@
 # sop-tier-check — canonical Gitea Actions workflow for §SOP-6 enforcement.
 #
 # Logic lives in `.gitea/scripts/sop-tier-check.sh` (extracted 2026-05-09
 # from the previous inline-bash version). The script is the single source
 # of truth; this workflow file just sets env + invokes it.
 #
 # Copy BOTH files (`.gitea/workflows/sop-tier-check.yml` +
 # `.gitea/scripts/sop-tier-check.sh`) into any repo that wants the
 # §SOP-6 PR gate enforced. Pair with branch protection on the protected
 # branch:
 #   required_status_checks:    ["sop-tier-check / tier-check (pull_request)"]
 #   required_approving_reviews: 1
 #   approving_review_teams:    ["ceo", "managers", "engineers"]
 #
 # Tier → required-team expression (internal#189 AND-composition):
 #   tier:low    → engineers,managers,ceo        (OR: any one suffices)
 #   tier:medium → managers AND engineers AND qa???,security???  (AND: all required)
 #   tier:high   → ceo                           (OR: single team, wired for AND)
 #
 # "???" = teams not yet created in Gitea. When qa + security teams are
 # added, update TIER_EXPR["tier:medium"] in the script to remove the
 # markers. PRs already in-flight when qa/security are created continue
 # to work because their authors explicitly requested those reviews.
 #
 # Force-merge: Owners-team override remains available out-of-band via
 # the Gitea merge API; force-merge writes `incident.force_merge` to
 # `structure_events` per §Persistent structured logging gate (Phase 3).
 #
 # Environment variables:
 #   SOP_DEBUG=1          — per-API-call diagnostic lines. Default: off.
 #   SOP_LEGACY_CHECK=1   — revert to OR-gate for this run. Grace window
 #                           for PRs in-flight when AND-composition deployed.
 #                           Burn-in: remove after 2026-05-17 (7-day window).
 #
 # BURN-IN NOTE (internal#189 Phase 1): continue-on-error: true is set on
 # the tier-check job below. This prevents AND-composition from blocking
 # PRs during the 7-day burn-in. After 2026-05-17:
 #   1. Remove `continue-on-error: true` from this job block.
 #   2. Update this BURN-IN NOTE comment to mark the window closed.
 name: sop-tier-check
 # SECURITY: triggers MUST use `pull_request_target`, not `pull_request`.
 # `pull_request_target` loads the workflow definition from the BASE
 # branch (i.e. `main`), not the PR's HEAD. With `pull_request`, anyone
 # with write access to a feature branch could rewrite this file in
 # their PR to dump SOP_TIER_CHECK_TOKEN (org-read scope) to logs and
 # exfiltrate it. Verified 2026-05-09 against Gitea 1.22.6 —
 # `pull_request_target` (added in Gitea 1.21 via go-gitea/gitea#25229)
 # is the documented mitigation.
 #
 # This workflow does NOT call `actions/checkout` of PR HEAD code, so no
 # untrusted code is ever executed in the runner — we only HTTP-call the
 # Gitea API. If a future change adds a checkout step, it MUST pin to
 # `${{ github.event.pull_request.base.sha }}` (NOT `head.sha`) to keep
 # the trust boundary.
 on:
  pull_request_target:
    types: [opened, edited, synchronize, reopened, labeled, unlabeled]
  pull_request_review:
    types: [submitted, dismissed, edited]
 jobs:
  tier-check:
    runs-on: ubuntu-latest
    # BURN-IN: continue-on-error prevents AND-composition from blocking
    # PRs during the 7-day window. Remove after 2026-05-17 (internal#189).
    continue-on-error: true
    permissions:
      contents: read
      pull-requests: read
    steps:
      - name: Check out base branch (for the script)
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          # Pin to base.sha — pull_request_target's protection only
          # works if we never check out PR HEAD. Same SHA the workflow
          # itself was loaded from.
          ref: ${{ github.event.pull_request.base.sha }}
      - name: Install jq
        # Gitea Actions runners (ubuntu-latest label) do not bundle jq.
        # The sop-tier-check script uses jq for all JSON API parsing.
        # Install jq before the script runs so sop-tier-check can pass.
        #
        # Method: apt-get first (reliable for Ubuntu runners with internet
        # access to package mirrors). Falls back to GitHub binary download.
        # GitHub releases may be unreachable from some runner networks
        # (infra#241 follow-up: GitHub timeout after 3s on 5.78.80.188
        # runners). The sop-tier-check script has its own fallback as a
        # third line of defense. continue-on-error: true ensures this step
        # failing does not block the job.
        continue-on-error: true
        run: |
          # apt-get is the primary method — Ubuntu package mirrors are reliably
          # reachable from runner containers. GitHub releases may be blocked
          # or slow on some networks (infra#241 follow-up).
          if apt-get update -qq && apt-get install -y -qq jq; then
            echo "::notice::jq installed via apt-get: $(jq --version)"
          elif timeout 120 curl -sSL \
            "https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-amd64" \
            -o /usr/local/bin/jq && chmod +x /usr/local/bin/jq; then
            echo "::notice::jq binary downloaded: $(/usr/local/bin/jq --version)"
          else
            echo "::warning::jq install failed — apt-get and GitHub download both failed."
          fi
          jq --version 2>/dev/null || echo "::notice::jq not yet available — script fallback will retry"
      - name: Verify tier label + reviewer team membership
        # continue-on-error: true at step level — job-level is ignored by Gitea
        # Actions (quirk #10, internal runbooks). Belt-and-suspenders with
        # SOP_FAIL_OPEN=1 + || true below.
        continue-on-error: true
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
          GITEA_HOST: git.moleculesai.app
          REPO: ${{ github.repository }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
          SOP_DEBUG: '0'
          SOP_LEGACY_CHECK: '0'
          # SOP_FAIL_OPEN=1 makes the script always exit 0. The UI enforces
          # the actual merge gate. Combined with continue-on-error: true
          # above, this step never fails the job regardless of script exit.
          SOP_FAIL_OPEN: '1'
        run: |
          bash .gitea/scripts/sop-tier-check.sh || true
--- a/.gitea/workflows/sop-tier-refire.yml
+++ b/.gitea/workflows/sop-tier-refire.yml
@ -0,0 +1,79 @@
 # sop-tier-refire — issue_comment-triggered refire of sop-tier-check.
 #
 # Closes internal#292. Gitea 1.22.6 doesn't refire workflows on the
 # `pull_request_review` event (go-gitea/gitea#33700); the `sop-tier-check`
 # workflow's review-event subscription is silently dead. The result:
 # PRs that get their approving review AFTER the tier-check ran on open/
 # synchronize keep their failing status check forever, and the only way
 # to merge is the admin force-merge path (audited via `audit-force-merge`
 # but the audit trail keeps growing; see `feedback_never_admin_merge_bypass`).
 #
 # Workaround pattern from `feedback_pull_request_review_no_refire`:
 # `issue_comment` events DO fire reliably on 1.22.6. When a repo
 # MEMBER/OWNER/COLLABORATOR comments `/refire-tier-check` on a PR, this
 # workflow re-runs the sop-tier-check logic and POSTs the resulting
 # status to the PR head SHA directly. No empty commit, no git history
 # bloat, no cascade re-fire of every other workflow on the PR.
 #
 # SECURITY MODEL:
 #
 # 1. `pull_request` exists on the issue (issue_comment fires on issues
 #    AND PRs; we only want PRs).
 # 2. `comment.author_association` must be MEMBER/OWNER/COLLABORATOR.
 #    Per the internal#292 core-security review (review#1066 ask): anyone
 #    can comment, but only repo collaborators+ can flip the status.
 #    Without this gate, a drive-by commenter on a public-issue-tracker
 #    surface could trigger a status flip.
 # 3. Comment body must contain `/refire-tier-check` — a slash-command-
 #    shaped trigger (not just any comment word). Prevents accidental
 #    triggering from prose like "we should refire tests" in a review.
 # 4. This workflow does NOT check out PR HEAD code. Like sop-tier-check,
 #    it only HTTP-calls the Gitea API. Trust boundary preserved.
 #
 # Note: `issue_comment` fires from the BASE branch's workflow file. There
 # is no `pull_request_target` equivalent to set; the trigger inherently
 # loads the workflow from the default branch.
 #
 # Rate-limit: a 1s pre-sleep + a "skip if status posted in last 30s"
 # guard prevents comment-spam from thrashing the status. See the script.
 name: sop-tier-check refire (issue_comment)
 on:
  issue_comment:
    types: [created]
 jobs:
  refire:
    # Three gates, all required:
    #   - comment is on a PR (not a plain issue)
    #   - commenter is MEMBER, OWNER, or COLLABORATOR
    #   - comment body contains the slash-command trigger
    if: |
      github.event.issue.pull_request != null &&
      contains(fromJson('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association) &&
      contains(github.event.comment.body, '/refire-tier-check')
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: read
      statuses: write
    steps:
      - name: Check out base branch (for the script)
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          # Load the script from the default branch (main), matching the
          # sop-tier-check.yml security model.
          ref: ${{ github.event.repository.default_branch }}
      - name: Re-evaluate sop-tier-check and POST status
        env:
          # Same org-level secret sop-tier-check.yml + audit-force-merge.yml use.
          # Fallback to GITHUB_TOKEN with a clear error if missing.
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
          GITEA_HOST: git.moleculesai.app
          REPO: ${{ github.repository }}
          PR_NUMBER: ${{ github.event.issue.number }}
          COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
          # Set to '1' for diagnostic per-API-call output. Off by default.
          SOP_DEBUG: '0'
        run: bash .gitea/scripts/sop-tier-refire.sh
--- a/.gitea/workflows/staging-smoke.yml
+++ b/.gitea/workflows/staging-smoke.yml
@ -0,0 +1,346 @@
 name: Staging SaaS smoke (every 30 min)
 # Renamed from canary-staging.yml on 2026-05-11 per Hongming directive
 # ("canary naming changed to staging for all"). Originally ported from
 # .github/workflows/canary-staging.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #
 # Minimum viable health check: provisions one Hermes workspace on a fresh
 # staging org, sends one A2A message, verifies PONG, tears down. ~8 min
 # wall clock. Pages on failure by opening a GitHub issue; auto-closes the
 # issue on the next green run.
 #
 # The full-SaaS workflow (e2e-staging-saas.yml) covers the broader surface
 # but runs only on provisioning-critical pushes + nightly — this one
 # catches drift in the 30-min window between those runs (AMI health, CF
 # cert rotation, WorkOS session stability, etc.).
 #
 # Lean mode: E2E_MODE=smoke skips the child workspace + HMA memory +
 # peers/activity checks. One parent workspace + one A2A turn is enough
 # to signal "SaaS stack end-to-end is alive."
 on:
  schedule:
    # Every 30 min. Cron on GitHub-hosted runners has a known drift of
    # a few minutes under load — that's fine for a smoke check.
    - cron: '*/30 * * * *'
 # Serialise with the full-SaaS workflow so they don't contend for the
 # same org-create quota on staging. Different group key from
 # e2e-staging-saas since we don't mind queueing smoke runs behind one
 # full run, but two smoke runs SHOULD queue against each other.
 concurrency:
  group: staging-smoke
  cancel-in-progress: false
 permissions:
  # Needed to open / close the alerting issue.
  issues: write
  contents: read
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  smoke:
    name: Staging SaaS smoke
    runs-on: ubuntu-latest
    # NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed
    # 2026-05-11. The "surface broken workflows without blocking"
    # rationale was correctly applied to advisory/lint workflows but
    # wrong for this smoke — it is the 30-min canary cadence for the
    # entire staging SaaS stack, and silent failure here masks the
    # exact regressions the smoke exists to surface (AMI rot, CF cert
    # drift, WorkOS session breakage, secret rotations). Same class of
    # failure as PR#461 (`sweep-stale-e2e-orgs`) where Phase-3 silent
    # failure leaked EC2. The four other `e2e-staging-*` workflows
    # KEEP `continue-on-error: true` per RFC #219 §1 — they are
    # advisory and matrix-style; this one is the canary. A follow-up
    # `notify-failure` step below also surfaces breakage to ops even
    # if branch-protection wiring is adjusted to keep this off the
    # required-checks list.
    # 25 min headroom over the 15-min TLS-readiness deadline in
    # tests/e2e/test_staging_full_saas.sh (#2107). Without the buffer
    # the job is killed at the wall-clock 15:00 mark BEFORE the bash
    # `fail` + diagnostic burst can fire, leaving every cancellation
    # silent. Sibling staging E2E jobs run at 20-45 min — keeping the
    # smoke tighter than them so a true wedge still surfaces here
    # first.
    timeout-minutes: 25
    env:
      MOLECULE_CP_URL: https://staging-api.moleculesai.app
      # 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
      # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
      # internal#322 — see this PR for the cross-workflow sweep.
      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      # MiniMax is the smoke's PRIMARY LLM auth path post-2026-05-04.
      # Switched from hermes+OpenAI after #2578 (the staging OpenAI key
      # account went over quota and stayed dead for 36+ hours, taking
      # the smoke red the entire time). claude-code template's
      # `minimax` provider routes ANTHROPIC_BASE_URL to
      # api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot —
      # ~5-10x cheaper per token than gpt-4.1-mini AND on a separate
      # billing account, so OpenAI quota collapse no longer wedges the
      # smoke. Mirrors the migration continuous-synth-e2e.yml made on
      # 2026-05-03 (#265) for the same reason. tests/e2e/test_staging_
      # full_saas.sh branches SECRETS_JSON on which key is present —
      # MiniMax wins when set.
      E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
      # Direct-Anthropic alternative for operators who don't want to
      # set up a MiniMax account (priority below MiniMax — first
      # non-empty wins in test_staging_full_saas.sh's secrets-injection
      # block). See #2578 PR comment for the rationale.
      E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
      # OpenAI fallback — kept wired so an operator-dispatched run with
      # E2E_RUNTIME=hermes overridden via workflow_dispatch can still
      # exercise the OpenAI path without re-editing the workflow.
      E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
      E2E_MODE: smoke
      E2E_RUNTIME: claude-code
      # Pin the smoke to a specific MiniMax model rather than relying
      # on the per-runtime default (which could resolve to "sonnet" →
      # direct Anthropic and defeat the cost saving). M2.7-highspeed
      # is "Token Plan only" but cheap-per-token and fast.
      E2E_MODEL_SLUG: MiniMax-M2.7-highspeed
      E2E_RUN_ID: "smoke-${{ github.run_id }}"
      # Debug-only: when an operator dispatches with keep_on_failure=true,
      # the smoke script's E2E_KEEP_ORG=1 path skips teardown so the
      # tenant org + EC2 stay alive for SSM-based log capture. Cron runs
      # never set this (the input only exists on workflow_dispatch) so
      # unattended cron always tears down. See molecule-core#129
      # failure mode #1 — capturing the actual exception requires
      # docker logs from the live container.
      E2E_KEEP_ORG: ${{ github.event.inputs.keep_on_failure == 'true' && '1' || '0' }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Verify admin token present
        run: |
          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
            echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
            exit 2
          fi
      - name: Verify LLM key present
        run: |
          # Per-runtime key check — claude-code uses MiniMax; hermes /
          # langgraph (operator-dispatched only) use OpenAI. Hard-fail
          # rather than soft-skip per the lesson from synth E2E #2578:
          # an empty key silently falls through to the wrong
          # SECRETS_JSON branch and the smoke fails 5 min later with
          # a confusing auth error instead of the clean "secret
          # missing" message at the top.
          case "${E2E_RUNTIME}" in
            claude-code)
              # Either MiniMax OR direct-Anthropic works — first
              # non-empty wins in the test script's secrets-injection
              # priority chain. Operators only need to set ONE of these
              # secrets; we don't force a choice between them.
              if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
                required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
                required_secret_value="${E2E_MINIMAX_API_KEY}"
              elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
                required_secret_name="MOLECULE_STAGING_ANTHROPIC_API_KEY"
                required_secret_value="${E2E_ANTHROPIC_API_KEY}"
              else
                required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY or MOLECULE_STAGING_ANTHROPIC_API_KEY"
                required_secret_value=""
              fi
              ;;
            langgraph|hermes)
              required_secret_name="MOLECULE_STAGING_OPENAI_API_KEY"
              required_secret_value="${E2E_OPENAI_API_KEY:-}"
              ;;
            *)
              echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
              required_secret_name=""
              required_secret_value="present"
              ;;
          esac
          if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
            echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — A2A will fail at request time with 'No LLM provider configured'"
            exit 2
          fi
          echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"
      - name: Smoke run
        id: smoke
        run: bash tests/e2e/test_staging_full_saas.sh
      # Alerting: open a sticky issue on the FIRST failure; comment on
      # subsequent failures; auto-close on next green. Comment-on-existing
      # de-duplicates so a single open issue accumulates the streak —
      # ops sees one issue with N comments rather than N issues.
      #
      # Why no consecutive-failures threshold (e.g., wait 3 runs before
      # filing): the prior threshold check used
      # `github.rest.actions.listWorkflowRuns()` which Gitea 1.22.6 does
      # not expose (returns 404). On Gitea Actions the threshold call
      # ALWAYS failed, breaking the entire alerting step and going days
      # silent on real regressions (38h+ chronic red on 2026-05-07/08
      # before this fix; tracked in molecule-core#129). Filing on first
      # failure is also better UX — we want to know about the first red,
      # not wait 90 min for it to "count." Real flakes get one issue +
      # a quick close-on-green; persistent reds accumulate comments.
      - name: Open issue on failure (Gitea API)
        if: failure()
        env:
          GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
          RUN_ID: ${{ github.run_id }}
        run: |
          set -euo pipefail
          API="${SERVER_URL%/}/api/v1"
          # Title kept stable across the canary-staging.yml → staging-smoke.yml
          # rename (2026-05-11) so any open alert issue from the old name
          # still title-matches and auto-closes on the next green run.
          TITLE="Canary failing: staging SaaS smoke"
          RUN_URL="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
          EXISTING=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
            "${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
            | jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number' | head -1)
          if [ -n "$EXISTING" ]; then
            curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
              "${API}/repos/${REPO}/issues/${EXISTING}/comments" \
              -d "$(jq -nc --arg run "$RUN_URL" '{body: ("Smoke still failing. " + $run)}')" >/dev/null
            echo "Commented on existing issue #${EXISTING}"
          else
            NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
            BODY=$(jq -nc --arg t "$TITLE" --arg now "$NOW" --arg run "$RUN_URL" \
              '{title: $t, body: ("Smoke run failed at " + $now + ".\n\nRun: " + $run + "\n\nThis issue auto-closes on the next green smoke run. Consecutive failures add a comment here rather than a new issue.")}')
            curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
              "${API}/repos/${REPO}/issues" -d "$BODY" >/dev/null
            echo "Opened smoke failure issue (first red)"
          fi
      - name: Auto-close smoke issue on success (Gitea API)
        if: success()
        env:
          GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
          RUN_ID: ${{ github.run_id }}
        run: |
          set -euo pipefail
          API="${SERVER_URL%/}/api/v1"
          # Title kept stable across the canary-staging.yml → staging-smoke.yml
          # rename so open alert issues from the old name still match.
          TITLE="Canary failing: staging SaaS smoke"
          NUMS=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
            "${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
            | jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number')
          NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
          for N in $NUMS; do
            curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
              "${API}/repos/${REPO}/issues/${N}/comments" \
              -d "$(jq -nc --arg now "$NOW" '{body: ("Smoke recovered at " + $now + ". Closing.")}')" >/dev/null
            curl -fsS -X PATCH -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
              "${API}/repos/${REPO}/issues/${N}" -d '{"state":"closed"}' >/dev/null
            echo "Closed recovered smoke issue #${N}"
          done
      - name: Teardown safety net
        if: always()
        env:
          ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
        run: |
          set +e
          # Slug prefix matches what test_staging_full_saas.sh emits
          # in smoke mode:
          #   SLUG="e2e-smoke-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
          # Earlier (pre-2026-05-11 canary→staging rename) the prefix was
          # `e2e-canary-`; both prefixes are matched here for one
          # release cycle so cleanup still catches any in-flight org
          # provisioned under the old prefix on an older runner that
          # hasn't picked up the renamed script. Remove the canary
          # fallback after one week of no-old-prefix observations.
          orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
            -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
            | python3 -c "
          import json, sys, os, datetime
          run_id = os.environ.get('GITHUB_RUN_ID', '')
          d = json.load(sys.stdin)
          # Scope to slugs from THIS smoke run when GITHUB_RUN_ID is
          # available; the smoke workflow sets E2E_RUN_ID='smoke-\${run_id}'
          # so the slug suffix is '-smoke-\${run_id}-...'. Mirrors the
          # full-mode safety net's per-run scoping (e2e-staging-saas.yml)
          # added after the 2026-04-21 cross-run cleanup incident.
          # Sweep both today AND yesterday's UTC dates so a run that
          # crosses midnight still cleans up its own slug — see the
          # 2026-04-26→27 canvas-safety-net incident.
          today = datetime.date.today()
          yesterday = today - datetime.timedelta(days=1)
          dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
          if run_id:
              prefixes = tuple(f'e2e-smoke-{d}-smoke-{run_id}' for d in dates) \
                       + tuple(f'e2e-canary-{d}-canary-{run_id}' for d in dates)
          else:
              prefixes = tuple(f'e2e-smoke-{d}-' for d in dates) \
                       + tuple(f'e2e-canary-{d}-' for d in dates)
          candidates = [o['slug'] for o in d.get('orgs', [])
                        if any(o.get('slug','').startswith(p) for p in prefixes)
                        and o.get('status') not in ('purged',)]
          print('\n'.join(candidates))
          " 2>/dev/null)
          # Per-slug DELETE with HTTP-code verification. The previous
          # `... >/dev/null || true` swallowed every failure, so a 5xx
          # or timeout from CP looked identical to "successfully cleaned
          # up" and the tenant kept eating ~2 vCPU until the hourly
          # stale sweep caught it (up to 2h later). Now we capture the
          # response code and surface non-2xx as a workflow warning, so
          # the run page shows which slug leaked. We still don't `exit 1`
          # on cleanup failure — a single-smoke cleanup miss shouldn't
          # fail-flag the smoke itself when the actual smoke check
          # passed. The sweep-stale-e2e-orgs cron (now every 15 min,
          # 30-min threshold) is the safety net for whatever slips past.
          # See molecule-controlplane#420.
          leaks=()
          for slug in $orgs; do
            # Tempfile-routed -w + set +e/-e prevents curl-exit-code
            # pollution of the captured status (lint-curl-status-capture.yml).
            set +e
            curl -sS -o /tmp/smoke-cleanup.out -w "%{http_code}" \
              -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
              -H "Authorization: Bearer $ADMIN_TOKEN" \
              -H "Content-Type: application/json" \
              -d "{\"confirm\":\"$slug\"}" >/tmp/smoke-cleanup.code
            set -e
            code=$(cat /tmp/smoke-cleanup.code 2>/dev/null || echo "000")
            if [ "$code" = "200" ] || [ "$code" = "204" ]; then
              echo "[teardown] deleted $slug (HTTP $code)"
            else
              echo "::warning::smoke teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/smoke-cleanup.out 2>/dev/null)"
              leaks+=("$slug")
            fi
          done
          if [ ${#leaks[@]} -gt 0 ]; then
            echo "::warning::smoke teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
          fi
          exit 0
      - name: Notify on smoke failure
        # Fail-loud companion to dropping `continue-on-error: true`.
        # The Open-issue-on-failure step above handles the human-facing
        # alert; this step emits a clearly-tagged ::error:: line that
        # log-tail consumers (Loki SOPRefireRule, orchestrator triage
        # loop) can grep on. Mirrors PR#461's sweep-stale-e2e-orgs
        # pattern. Runs AFTER the teardown safety net (which is
        # if: always()) so failures don't suppress cleanup.
        if: failure()
        run: |
          echo "::error::staging-smoke FAILED — staging SaaS canary is red. See prior step logs + the auto-filed alert issue. Common causes: (a) CP_STAGING_ADMIN_API_TOKEN secret missing/rotated, (b) staging-api.moleculesai.app 5xx, (c) MiniMax/Anthropic LLM key dead, (d) AMI/CF/WorkOS drift. The 30-min cron will retry, but a chronic red here indicates the staging SaaS stack is broken end-to-end."
          exit 1
--- a/.gitea/workflows/staging-verify.yml
+++ b/.gitea/workflows/staging-verify.yml
@ -0,0 +1,288 @@
 name: Staging verify
 # Renamed from canary-verify.yml on 2026-05-11 per Hongming directive
 # ("canary naming changed to staging for all"). Originally ported from
 # .github/workflows/canary-verify.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #   - **Gitea workflow_run trigger limitation**: Gitea 1.22.6's support
 #     for the `workflow_run` event is partial. If this never fires on a
 #     real publish-workspace-server-image completion, the follow-up
 #     triage PR should replace the trigger with a push-with-paths-filter
 #     on the same publish workflow's path (i.e. `.gitea/workflows/publish-workspace-server-image.yml`).
 #
 # Runs the canary smoke suite against the staging canary tenant fleet
 # after a new :staging-<sha> image lands in ECR. On green, calls the
 # CP redeploy-fleet endpoint to promote :staging-<sha> → :latest so
 # the prod tenant fleet's 5-minute auto-updater picks up the verified
 # digest. On red, :latest stays on the prior known-good digest and
 # prod is untouched.
 #
 # Terminology note (2026-05-11): The deployment STRATEGY here is still
 # called "canary release" (a small subset of tenants gets the new image
 # first, the rest follow on green). The "canary" word stays for the
 # pre-fan-out cohort concept (see docs/architecture/canary-release.md
 # and CANARY_SLUG in redeploy-tenants-on-*.yml). What changed is the
 # FILE NAME and the SECRETS feeding this workflow — both are renamed
 # to drop the redundant "canary-" prefix that conflated workflow
 # identity with deployment strategy.
 #
 # Registry note (2026-05-10): This workflow previously used GHCR
 # (ghcr.io/molecule-ai/platform-tenant) — that registry was retired
 # during the 2026-05-06 Gitea suspension migration when publish-
 # workspace-server-image.yml switched to the operator's ECR org
 # (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/
 # platform-tenant). The GHCR → ECR migration was never applied to
 # this file, so this workflow was silently smoke-testing the stale
 # GHCR image while the actual staging/prod tenants ran the ECR image.
 # Result: smoke tests could not catch a broken ECR build. Fix:
 #   - Wait step: reads SHA from running canary /health (tenant-
 #     agnostic, works regardless of registry).
 #   - Promote step: calls CP redeploy-fleet endpoint with target_tag=
 #     staging-<sha>, same mechanism as redeploy-tenants-on-main.yml.
 #     No longer attempts GHCR crane ops.
 #
 # Dependencies:
 #   - publish-workspace-server-image.yml publishes :staging-<sha>
 #     to ECR on staging and main merges.
 #   - Canary tenants are configured to pull :staging-<sha> from ECR
 #     (TENANT_IMAGE env set to the ECR :staging-<sha> tag).
 #   - Repo secrets MOLECULE_STAGING_TENANT_URLS /
 #     MOLECULE_STAGING_ADMIN_TOKENS / MOLECULE_STAGING_CP_SHARED_SECRET
 #     are populated.
 on:
  workflow_run:
    workflows: ["publish-workspace-server-image"]
    types: [completed]
 permissions:
  contents: read
  packages: write
  actions: read
 env:
  # ECR registry (post-2026-05-06 SSOT for tenant images).
  # publish-workspace-server-image.yml pushes here.
  IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
  TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
  # CP endpoint for redeploy-fleet (used in promote step below).
  CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  staging-smoke:
    # Skip when the upstream workflow failed — no image to test against.
    # workflow_dispatch trigger dropped in this Gitea port; only the
    # workflow_run path remains.
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    outputs:
      sha: ${{ steps.compute.outputs.sha }}
      smoke_ran: ${{ steps.smoke.outputs.ran }}
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Compute sha
        id: compute
        run: echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
      - name: Wait for canary tenants to pick up :staging-<sha>
        # Poll canary health endpoints every 30s for up to 7 min instead
        # of a fixed 6-min sleep. Exits as soon as ALL canaries report
        # the new SHA (~2-3 min typical vs 6 min fixed). Falls back to
        # proceeding after 7 min even if not all canaries responded —
        # the smoke suite will catch any that didn't update.
        #
        # NOTE: The SHA is read from the running tenant's /health response,
        # NOT from a registry lookup. This is registry-agnostic and works
        # regardless of whether the tenant pulls from ECR, GHCR, or any
        # other registry — the canary is telling us what it's actually
        # running, which is the ground truth for smoke testing.
        env:
          MOLECULE_STAGING_TENANT_URLS: ${{ secrets.MOLECULE_STAGING_TENANT_URLS }}
          EXPECTED_SHA: ${{ steps.compute.outputs.sha }}
        run: |
          if [ -z "$MOLECULE_STAGING_TENANT_URLS" ]; then
            echo "No canary URLs configured — falling back to 60s wait"
            sleep 60
            exit 0
          fi
          IFS=',' read -ra URLS <<< "$MOLECULE_STAGING_TENANT_URLS"
          MAX_WAIT=420  # 7 minutes
          INTERVAL=30
          ELAPSED=0
          while [ $ELAPSED -lt $MAX_WAIT ]; do
            ALL_READY=true
            for url in "${URLS[@]}"; do
              HEALTH=$(curl -s --max-time 5 "${url}/health" 2>/dev/null || echo "{}")
              SHA=$(echo "$HEALTH" | grep -o "\"sha\":\"[^\"]*\"" | head -1 | cut -d'"' -f4)
              if [ "$SHA" != "$EXPECTED_SHA" ]; then
                ALL_READY=false
                break
              fi
            done
            if $ALL_READY; then
              echo "All canaries running staging-${EXPECTED_SHA} after ${ELAPSED}s"
              exit 0
            fi
            echo "Waiting for canaries... (${ELAPSED}s / ${MAX_WAIT}s)"
            sleep $INTERVAL
            ELAPSED=$((ELAPSED + INTERVAL))
          done
          echo "Timeout after ${MAX_WAIT}s — proceeding anyway (smoke suite will validate)"
      - name: Run staging smoke suite
        id: smoke
        # Graceful-skip when no canary fleet is configured (Phase 2 not yet
        # stood up — see molecule-controlplane/docs/canary-tenants.md).
        # Sets `ran=false` on skip so promote-to-latest stays off (we don't
        # want every main merge auto-promoting without gating). Manual
        # promote-latest.yml is the release gate while canary is absent.
        # Once the fleet is real: delete the early-exit branch.
        env:
          MOLECULE_STAGING_TENANT_URLS: ${{ secrets.MOLECULE_STAGING_TENANT_URLS }}
          MOLECULE_STAGING_ADMIN_TOKENS: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKENS }}
          MOLECULE_STAGING_CP_BASE_URL: https://staging-api.moleculesai.app
          MOLECULE_STAGING_CP_SHARED_SECRET: ${{ secrets.MOLECULE_STAGING_CP_SHARED_SECRET }}
        run: |
          set -euo pipefail
          if [ -z "${MOLECULE_STAGING_TENANT_URLS:-}" ] \
            || [ -z "${MOLECULE_STAGING_ADMIN_TOKENS:-}" ] \
            || [ -z "${MOLECULE_STAGING_CP_SHARED_SECRET:-}" ]; then
            {
              echo "## ⚠️ staging-verify skipped"
              echo
              echo "One or more canary secrets are unset (\`MOLECULE_STAGING_TENANT_URLS\`, \`MOLECULE_STAGING_ADMIN_TOKENS\`, \`MOLECULE_STAGING_CP_SHARED_SECRET\`)."
              echo "Phase 2 canary fleet has not been stood up yet —"
              echo "see [canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)."
              echo
              echo "**Skipped — promote-to-latest will NOT auto-fire.** Dispatch \`promote-latest.yml\` manually when ready."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "ran=false" >> "$GITHUB_OUTPUT"
            echo "::notice::staging-verify: skipped — no canary fleet configured"
            exit 0
          fi
          bash scripts/staging-smoke.sh
          echo "ran=true" >> "$GITHUB_OUTPUT"
      - name: Summary on failure
        if: ${{ failure() }}
        run: |
          {
            echo "## Canary smoke FAILED"
            echo
            echo "Canary tenants rejected image \`staging-${{ steps.compute.outputs.sha }}\`."
            echo ":latest stays pinned to the prior good digest — prod is untouched."
            echo
            echo "Fix forward and merge again, or investigate the specific failed"
            echo "assertions in the staging-smoke step log above."
          } >> "$GITHUB_STEP_SUMMARY"
  promote-to-latest:
    # On green, calls the CP redeploy-fleet endpoint with target_tag=
    # staging-<sha> to promote the verified ECR image. This is the same
    # mechanism as redeploy-tenants-on-main.yml — no GHCR crane ops.
    #
    # Pre-fix history: the old GHCR promote step used `crane tag` against
    # ghcr.io/molecule-ai/platform-tenant, but publish-workspace-server-
    # image.yml had already migrated to ECR on 2026-05-07 (commit
    # 10e510f5). The GHCR tags were never updated, so this step was
    # silently promoting a stale GHCR image while actual prod tenants
    # pulled from ECR. Canary smoke tests were GHCR-targeted and could
    # not catch a broken ECR build.
    needs: staging-smoke
    if: ${{ needs.staging-smoke.result == 'success' && needs.staging-smoke.outputs.smoke_ran == 'true' }}
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    env:
      SHA: ${{ needs.staging-smoke.outputs.sha }}
      CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
      # CP_ADMIN_API_TOKEN gates write access to the redeploy endpoint.
      # Stored at the repo level so all workflows pick it up automatically.
      CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
      # canary_slug pin: deploy the verified :staging-<sha> to the canary
      # first (soak 120s), then fan out to the rest of the fleet.
      CANARY_SLUG: ${{ vars.CANARY_PROMOTE_SLUG || '' }}
      SOAK_SECONDS: ${{ vars.CANARY_PROMOTE_SOAK || '120' }}
      BATCH_SIZE: ${{ vars.CANARY_PROMOTE_BATCH || '3' }}
    steps:
      - name: Check CP credentials
        run: |
          if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
            echo "::error::CP_ADMIN_API_TOKEN secret is not set — promote step cannot call redeploy-fleet."
            echo "::error::Set it at: repo Settings → Actions → Variables and Secrets → New Secret."
            exit 1
          fi
      - name: Promote verified ECR image to :latest
        run: |
          set -euo pipefail
          TARGET_TAG="staging-${SHA}"
          BODY=$(jq -nc \
            --arg tag "$TARGET_TAG" \
            --argjson soak "${SOAK_SECONDS:-120}" \
            --argjson batch "${BATCH_SIZE:-3}" \
            --argjson dry false \
            '{
              target_tag: $tag,
              soak_seconds: $soak,
              batch_size: $batch,
              dry_run: $dry
            }')
          if [ -n "${CANARY_SLUG:-}" ]; then
            BODY=$(jq '. * {canary_slug: $slug}' --arg slug "$CANARY_SLUG" <<<"$BODY")
          fi
          echo "Calling: POST $CP_URL/cp/admin/tenants/redeploy-fleet"
          echo "  target_tag: $TARGET_TAG"
          echo "  body: $BODY"
          HTTP_RESPONSE=$(mktemp)
          HTTP_CODE_FILE=$(mktemp)
          set +e
          curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
            -m 1200 \
            -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
            -H "Content-Type: application/json" \
            -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
            -d "$BODY" >"$HTTP_CODE_FILE"
          CURL_EXIT=$?
          set -e
          HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
          [ -z "$HTTP_CODE" ] && HTTP_CODE="000"
          echo "HTTP $HTTP_CODE (curl exit $CURL_EXIT)"
          cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
          if [ "$HTTP_CODE" -ge 400 ]; then
            echo "::error::CP redeploy-fleet returned HTTP $HTTP_CODE — refusing to proceed."
            exit 1
          fi
      - name: Summary
        run: |
          {
            echo "## Staging verified — :latest promoted via CP redeploy-fleet"
            echo ""
            echo "- **Target tag:** \`staging-${{ needs.staging-smoke.outputs.sha }}\`"
            echo "- **Registry:** ECR (\`${TENANT_IMAGE_NAME}\`)"
            echo "- **Canary slug:** \`${CANARY_SLUG:-<none>}\` (soak ${SOAK_SECONDS}s)"
            echo "- **Batch size:** ${BATCH_SIZE:-3}"
            echo ""
            echo "CP redeploy-fleet is rolling out the verified image across the prod fleet."
            echo "The fleet's 5-minute health-check loop will pick up the update automatically."
          } >> "$GITHUB_STEP_SUMMARY"
--- a/.gitea/workflows/sweep-aws-secrets.yml
+++ b/.gitea/workflows/sweep-aws-secrets.yml
@ -0,0 +1,129 @@
 name: Sweep stale AWS Secrets Manager secrets
 # Ported from .github/workflows/sweep-aws-secrets.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #
 # Janitor for per-tenant AWS Secrets Manager secrets
 # (`molecule/tenant/<org_id>/bootstrap`) whose backing tenant no
 # longer exists. Parallel-shape to sweep-cf-tunnels.yml and
 # sweep-cf-orphans.yml — different cloud, same justification.
 #
 # Why this exists separately from a long-term reconciler integration:
 #   - molecule-controlplane's tenant_resources audit table (mig 024)
 #     currently tracks four resource kinds: CloudflareTunnel,
 #     CloudflareDNS, EC2Instance, SecurityGroup. SecretsManager is
 #     not in the list, so the existing reconciler doesn't catch
 #     orphan secrets.
 #   - At ~$0.40/secret/month the cost grew to ~$19/month before this
 #     sweeper was written, indicating ~45+ orphan secrets from
 #     crashed provisions and incomplete deprovision flows.
 #   - The proper fix (KindSecretsManagerSecret + recorder hook +
 #     reconciler enumerator) is filed as a separate controlplane
 #     issue. This sweeper is the immediate cost-relief stopgap.
 #
 # IAM principal: AWS_JANITOR_ACCESS_KEY_ID / AWS_JANITOR_SECRET_ACCESS_KEY.
 # This is a DEDICATED principal — the production `molecule-cp` IAM
 # user lacks `secretsmanager:ListSecrets` (it only has
 # Get/Create/Update/Delete on specific resources, scoped to its
 # operational needs). The janitor needs ListSecrets across the
 # `molecule/tenant/*` prefix, which warrants a separate principal so
 # we don't broaden the prod-CP policy.
 #
 # Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring
 # sweep-cf-orphans.yml — tenant secrets are durable by design, unlike
 # the mostly-orphan tunnels) refuses to nuke past the threshold.
 on:
  schedule:
    # Hourly at :30 — offsets from sweep-cf-orphans (:15) and
    # sweep-cf-tunnels (:45) so the three janitors don't burst the
    # CP admin endpoints at the same minute.
    - cron: '30 * * * *'
 # Don't let two sweeps race the same AWS account.
 concurrency:
  group: sweep-aws-secrets
  cancel-in-progress: false
 permissions:
  contents: read
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  sweep:
    name: Sweep AWS Secrets Manager
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    # 30 min cap, mirroring the other janitors. AWS DeleteSecret is
    # fast (~0.3s/call) so even a 100+ backlog drains in seconds
    # under the 8-way xargs parallelism, but the cap is set generously
    # to leave headroom for any actual API hang.
    timeout-minutes: 30
    env:
      AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_JANITOR_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_JANITOR_SECRET_ACCESS_KEY }}
      CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
      CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
      GRACE_HOURS: ${{ github.event.inputs.grace_hours || '24' }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Verify required secrets present
        id: verify
        # Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
        # and sweep-cf-tunnels (hardened 2026-04-28). Same principle:
        #   - schedule → exit 1 on missing secrets (red CI surfaces it)
        #   - workflow_dispatch → exit 0 with warning (operator-driven,
        #     they already accepted the repo state)
        run: |
          missing=()
          for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY CP_ADMIN_API_TOKEN CP_STAGING_ADMIN_API_TOKEN; do
            if [ -z "${!var:-}" ]; then
              missing+=("$var")
            fi
          done
          if [ ${#missing[@]} -gt 0 ]; then
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
              echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
              echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
              echo "::warning::AWS_JANITOR_* must belong to a principal with secretsmanager:ListSecrets and secretsmanager:DeleteSecret on molecule/tenant/* (the prod molecule-cp principal lacks ListSecrets)."
              echo "skip=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi
            echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
            echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
            echo "::error::AWS_JANITOR_* must belong to a principal with secretsmanager:ListSecrets and secretsmanager:DeleteSecret on molecule/tenant/*."
            exit 1
          fi
          echo "All required secrets present ✓"
          echo "skip=false" >> "$GITHUB_OUTPUT"
      - name: Run sweep
        if: steps.verify.outputs.skip != 'true'
        # Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-tunnels:
        #   - Scheduled: input empty → "false" → --execute (the whole
        #     point of an hourly janitor).
        #   - Manual workflow_dispatch: input default true → dry-run;
        #     operator must flip it to actually delete.
        run: |
          set -euo pipefail
          if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
            echo "Running in dry-run mode — no deletions"
            bash scripts/ops/sweep-aws-secrets.sh
          else
            echo "Running with --execute — will delete identified orphans"
            bash scripts/ops/sweep-aws-secrets.sh --execute
          fi
--- a/.gitea/workflows/sweep-cf-orphans.yml
+++ b/.gitea/workflows/sweep-cf-orphans.yml
@ -0,0 +1,151 @@
 name: Sweep stale Cloudflare DNS records
 # Ported from .github/workflows/sweep-cf-orphans.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #
 # Janitor for Cloudflare DNS records whose backing tenant/workspace no
 # longer exists. Without this loop, every short-lived E2E or canary
 # leaves a CF record on the moleculesai.app zone — the zone has a
 # 200-record quota (controlplane#239 hit it 2026-04-23+) and provisions
 # start failing with code 81045 once exhausted.
 #
 # Why a separate workflow vs sweep-stale-e2e-orgs.yml:
 #   - That workflow operates at the CP layer (DELETE /cp/admin/tenants/:slug
 #     drives the cascade). It assumes CP has the org row to drive the
 #     deprovision from. It doesn't catch records left behind when CP
 #     itself never knew about the tenant (canary scratch, manual ops
 #     experiments) or when the cascade's CF-delete branch failed.
 #   - sweep-cf-orphans.sh enumerates the CF zone directly and matches
 #     each record against live CP slugs + AWS EC2 names. It catches
 #     leaks the CP-driven sweep can't.
 #
 # Safety: the script's own MAX_DELETE_PCT gate refuses to nuke more
 # than 50% of records in a single run. If something has gone weird
 # (CP admin endpoint returns no orgs → every tenant looks orphan) the
 # gate halts before damage. Decision-function unit tests in
 # scripts/ops/test_sweep_cf_decide.py (#2027) cover the rule
 # classifier.
 on:
  schedule:
    # Hourly. Mirrors sweep-stale-e2e-orgs cadence so the two janitors
    # converge on the same tick. CF API rate budget is generous (1200
    # req/5min); a single sweep makes ~1 list + N deletes (N<=quota/2).
    - cron: '15 * * * *'  # offset from sweep-stale-e2e-orgs (top of hour)
  # No `merge_group:` trigger on purpose. This is a janitor — it doesn't
  # need to gate merges, and including it as written before #2088 fired
  # the full sweep job (or its secret-check) on every PR going through
  # the merge queue, generating one red CI run per merge-queue eval. If
  # this workflow is ever wired up as a required check, re-add
  #   merge_group: { types: [checks_requested] }
  # AND gate the sweep step with `if: github.event_name != 'merge_group'`
  # so merge-queue evals report success without actually running.
 # Don't let two sweeps race the same zone. workflow_dispatch during a
 # scheduled run would otherwise issue duplicate DELETE calls.
 concurrency:
  group: sweep-cf-orphans
  cancel-in-progress: false
 permissions:
  contents: read
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  sweep:
    name: Sweep CF orphans
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    # 3 min surfaces hangs (CF API stall, AWS describe-instances stuck)
    # within one cron interval instead of burning a full tick. Realistic
    # worst case is ~2 min: 4 sequential curls + 1 aws + N×CF-DELETE
    # each individually capped at 10s by the script's curl -m flag.
    timeout-minutes: 3
    env:
      CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
      CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }}
      CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
      CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AWS_DEFAULT_REGION: us-east-2
      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Verify required secrets present
        id: verify
        # Schedule-vs-dispatch behaviour split (hardened 2026-04-28
        # after the silent-no-op incident below):
        #
        # The earlier soft-skip-on-schedule policy hid a real leak. All
        # six secrets were unset on this repo for an unknown duration;
        # every hourly run printed a yellow ::warning:: and exited 0,
        # so the workflow registered as "passing" while doing nothing.
        # CF orphans accumulated to 152/200 (~76% of the zone quota
        # gone) before a manual `dig`-driven audit caught it. Anything
        # that runs as a janitor and reports green while idle is
        # indistinguishable from "the janitor is healthy" — so we now
        # treat schedule (and any future workflow_run/push triggers)
        # as a hard-fail when secrets are missing.
        #
        #   - schedule / workflow_run / push → exit 1 (red CI run
        #     surfaces the misconfiguration the next tick)
        #   - workflow_dispatch              → exit 0 with a warning
        #     (an operator ran this ad-hoc; they already accepted the
        #     state of the repo and want the workflow to short-circuit
        #     so they can rerun after fixing the secret)
        run: |
          missing=()
          for var in CF_API_TOKEN CF_ZONE_ID CP_ADMIN_API_TOKEN CP_STAGING_ADMIN_API_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
            if [ -z "${!var:-}" ]; then
              missing+=("$var")
            fi
          done
          if [ ${#missing[@]} -gt 0 ]; then
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
              echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
              echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
              echo "skip=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi
            echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
            echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
            echo "::error::a silent skip masked an active CF DNS leak (152/200 zone records) caught only by a manual audit on 2026-04-28; this gate exists to make the gap visible."
            exit 1
          fi
          echo "All required secrets present ✓"
          echo "skip=false" >> "$GITHUB_OUTPUT"
      - name: Run sweep
        if: steps.verify.outputs.skip != 'true'
        # Schedule-vs-dispatch dry-run asymmetry (intentional):
        #   - Scheduled runs: github.event.inputs.dry_run is empty →
        #     defaults to "false" below → script runs with --execute
        #     (the whole point of an hourly janitor).
        #   - Manual workflow_dispatch: input default is true (line 38)
        #     so an ad-hoc operator-triggered run is dry-run by default;
        #     they have to flip the toggle to actually delete.
        # The script's MAX_DELETE_PCT gate (default 50%) is the second
        # line of defense regardless of mode.
        run: |
          set -euo pipefail
          if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
            echo "Running in dry-run mode — no deletions"
            bash scripts/ops/sweep-cf-orphans.sh
          else
            echo "Running with --execute — will delete identified orphans"
            bash scripts/ops/sweep-cf-orphans.sh --execute
          fi
--- a/.gitea/workflows/sweep-cf-tunnels.yml
+++ b/.gitea/workflows/sweep-cf-tunnels.yml
@ -0,0 +1,128 @@
 name: Sweep stale Cloudflare Tunnels
 # Ported from .github/workflows/sweep-cf-tunnels.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #
 # Janitor for Cloudflare Tunnels whose backing tenant no longer
 # exists. Parallel-shape to sweep-cf-orphans.yml (which sweeps DNS
 # records); same justification, different CF resource.
 #
 # Why this exists separately from sweep-cf-orphans:
 #   - DNS records live on the zone (`/zones/<id>/dns_records`).
 #   - Tunnels live on the account (`/accounts/<id>/cfd_tunnel`).
 #   - Different CF API surface, different scopes; the existing CF
 #     token might not have `account:cloudflare_tunnel:edit`. Splitting
 #     the workflows keeps each one's secret-presence gate independent
 #     so neither silent-skips when the other's secret is missing.
 #   - Cleaner blast radius — operators can disable one without the
 #     other if a regression surfaces.
 #
 # Safety: the script's MAX_DELETE_PCT gate (default 90% — higher than
 # the DNS sweep's 50% because tenant-shaped tunnels are mostly
 # orphans by design) refuses to nuke past the threshold.
 on:
  schedule:
    # Hourly at :45 — offset from sweep-cf-orphans (:15) so the two
    # janitors don't issue parallel CF API bursts at the same minute.
    - cron: '45 * * * *'
 # Don't let two sweeps race the same account.
 concurrency:
  group: sweep-cf-tunnels
  cancel-in-progress: false
 permissions:
  contents: read
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  sweep:
    name: Sweep CF tunnels
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    # 30 min cap. Was 5 min on the theory that the only thing that
    # could take >5min is a CF-API hang — but on 2026-05-02 a backlog
    # of 672 stale tunnels accumulated (large staging E2E run + delayed
    # sweep) and the serial `curl -X DELETE` loop (~0.7s/tunnel) needed
    # ~7-8min to drain. The 5-min cap killed the run mid-sweep
    # (cancelled at 424/672, see run 25248788312); a manual rerun
    # finished the remainder fine.
    #
    # The fix is two-part: parallelize the delete loop (8-way xargs in
    # the script — see scripts/ops/sweep-cf-tunnels.sh), AND raise the
    # cap so a one-off backlog doesn't trip a hangs-detector that
    # turned out to be a real-job-too-slow detector. With 8-way
    # parallelism, 600+ tunnels drains in ~60s; 30 min is generous
    # headroom for actual hangs to still surface (and is in line with
    # the sweep-cf-orphans companion job).
    timeout-minutes: 30
    env:
      CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
      CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
      CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
      CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '90' }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Verify required secrets present
        id: verify
        # Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
        # (hardened 2026-04-28 after the silent-no-op incident: the
        # janitor reported green while doing nothing because secrets
        # were unset, masking a 152/200 zone-record leak). Same
        # principle applies here:
        #   - schedule → exit 1 on missing secrets (red CI surfaces it)
        #   - workflow_dispatch → exit 0 with warning (operator-driven,
        #     they already accepted the repo state)
        run: |
          missing=()
          for var in CF_API_TOKEN CF_ACCOUNT_ID CP_ADMIN_API_TOKEN CP_STAGING_ADMIN_API_TOKEN; do
            if [ -z "${!var:-}" ]; then
              missing+=("$var")
            fi
          done
          if [ ${#missing[@]} -gt 0 ]; then
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
              echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
              echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
              echo "::warning::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope (separate from the zone:dns:edit scope used by sweep-cf-orphans)."
              echo "skip=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi
            echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
            echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
            echo "::error::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope."
            exit 1
          fi
          echo "All required secrets present ✓"
          echo "skip=false" >> "$GITHUB_OUTPUT"
      - name: Run sweep
        if: steps.verify.outputs.skip != 'true'
        # Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-orphans:
        #   - Scheduled: input empty → "false" → --execute (the whole
        #     point of an hourly janitor).
        #   - Manual workflow_dispatch: input default true → dry-run;
        #     operator must flip it to actually delete.
        run: |
          set -euo pipefail
          if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
            echo "Running in dry-run mode — no deletions"
            bash scripts/ops/sweep-cf-tunnels.sh
          else
            echo "Running with --execute — will delete identified orphans"
            bash scripts/ops/sweep-cf-tunnels.sh --execute
          fi
--- a/.gitea/workflows/sweep-stale-e2e-orgs.yml
+++ b/.gitea/workflows/sweep-stale-e2e-orgs.yml
@ -0,0 +1,267 @@
 name: Sweep stale e2e-* orgs (staging)
 # Ported from .github/workflows/sweep-stale-e2e-orgs.yml on 2026-05-11 per RFC
 # internal#219 §1 sweep. Differences from the GitHub version:
 #   - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
 #     per feedback_gitea_workflow_dispatch_inputs_unsupported).
 #   - Dropped `merge_group:` (no Gitea merge queue).
 #   - Dropped `environment:` blocks (Gitea has no environments).
 #   - Workflow-level env.GITHUB_SERVER_URL pinned per
 #     feedback_act_runner_github_server_url.
 #   - `continue-on-error: true` on each job (RFC §1 contract).
 #
 # Janitor for staging tenants left behind when E2E cleanup didn't run:
 # CI cancellations, runner crashes, transient AWS errors mid-cascade,
 # bash trap missed (signal 9), etc. Without this loop, every failed
 # teardown leaks an EC2 + DNS + DB row until manual ops cleanup —
 # 2026-04-23 staging hit the 64 vCPU AWS quota from ~27 such orphans.
 #
 # Why not rely on per-test-run teardown:
 #   - Per-run teardown is best-effort by definition. Any process death
 #     after the test starts but before the trap fires leaves debris.
 #   - GH Actions cancellation kills the runner without grace period.
 #     The workflow's `if: always()` step usually catches this, but it
 #     too can fail (CP transient 5xx, runner network issue at the
 #     wrong moment).
 #   - Even when teardown runs, the CP cascade is best-effort in places
 #     (cascadeTerminateWorkspaces logs+continues; DNS deletion same).
 #   - This sweep is the catch-all that converges staging back to clean
 #     regardless of which specific path leaked.
 #
 # The PROPER fix is making CP cleanup transactional + verify-after-
 # terminate (filed separately as cleanup-correctness work). This
 # workflow is the safety net that catches everything else AND any
 # future leak source we haven't yet identified.
 on:
  schedule:
    # Every 15 min. E2E orgs are short-lived (~8-25 min wall clock from
    # create to teardown — canary is ~8 min, full SaaS ~25 min). The
    # previous hourly + 120-min stale threshold meant a leaked tenant
    # could keep an EC2 alive for up to 2 hours, eating ~2 vCPU per
    # leak. Tightening the cadence + threshold reduces the worst-case
    # leak window from 120 min to ~45 min (15-min sweep cadence + 30-min
    # threshold) without risk of catching in-progress runs (the longest
    # e2e run is the 25-min canary, well under the 30-min threshold).
    # See molecule-controlplane#420 for the leak-class accounting that
    # motivated this tightening.
    - cron: '*/15 * * * *'
 # Don't let two sweeps fight. Cron + workflow_dispatch could overlap
 # on a manual trigger; queue rather than parallel-delete.
 concurrency:
  group: sweep-stale-e2e-orgs
  cancel-in-progress: false
 permissions:
  contents: read
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 jobs:
  sweep:
    name: Sweep e2e orgs
    runs-on: ubuntu-latest
    # NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed
    # 2026-05-11. The "surface broken workflows without blocking"
    # rationale was correctly applied to advisory/lint workflows but
    # wrong for this janitor — silent failure here masks real-money
    # tenant leaks. Hongming observed 15 leaked EC2 in molecule-canary
    # (004947743811) us-east-2 at 11:05Z 2026-05-11 because the sweep
    # had been exiting 2 every tick and the failure was swallowed.
    # See `feedback_strict_root_only_after_class_a` — critical janitors
    # must fail loud. A follow-up `notify-failure` step below also
    # surfaces breakage to ops even if branch-protection wiring is
    # adjusted to keep this off the required-checks list.
    timeout-minutes: 15
    env:
      MOLECULE_CP_URL: https://staging-api.moleculesai.app
      ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '30' }}
      DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
      # Refuse to delete more than this many orgs in one tick. If the
      # CP DB is briefly empty (or the admin endpoint goes weird and
      # returns no created_at), every e2e- org would look stale.
      # Bailing protects against runaway nukes.
      SAFETY_CAP: 50
    steps:
      - name: Verify admin token present
        run: |
          if [ -z "$ADMIN_TOKEN" ]; then
            echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
            exit 2
          fi
          echo "Admin token present ✓"
      - name: Identify stale e2e orgs
        id: identify
        run: |
          set -euo pipefail
          # Fetch into a file so the python step reads it via stdin —
          # cleaner than embedding $(curl ...) into a heredoc.
          curl -sS --fail-with-body --max-time 30 \
            "$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \
            -H "Authorization: Bearer $ADMIN_TOKEN" \
            > orgs.json
          # Filter:
          #   1. slug starts with one of the ephemeral test prefixes:
          #        - 'e2e-'    — covers e2e-smoke- (formerly e2e-canary-),
          #                      e2e-canvas-*, etc.
          #        - 'rt-e2e-' — runtime-test harness fixtures (RFC #2251);
          #                      missing this prefix left two such tenants
          #                      orphaned 8h on staging (2026-05-03), then
          #                      hard-failed redeploy-tenants-on-staging
          #                      and broke the staging→main auto-promote
          #                      chain. Kept in sync with the EPHEMERAL_PREFIX_RE
          #                      regex in redeploy-tenants-on-staging.yml.
          #   2. created_at is older than MAX_AGE_MINUTES ago
          # Output one slug per line to a file the next step reads.
          python3 > stale_slugs.txt <<'PY'
          import json, os
          from datetime import datetime, timezone, timedelta
          # SSOT for this list lives in the controlplane Go code:
          # molecule-controlplane/internal/slugs/ephemeral.go
          # (var EphemeralPrefixes). The redeploy-fleet auto-rollout
          # also reads from there to SKIP these slugs — without that
          # filter, fleet redeploy SSM-failed in-flight E2E tenants
          # whose containers were still booting, breaking the test
          # that just spun them up (molecule-controlplane#493).
          # Update both files together.
          EPHEMERAL_PREFIXES = ("e2e-", "rt-e2e-")
          with open("orgs.json") as f:
              data = json.load(f)
          max_age = int(os.environ["MAX_AGE_MINUTES"])
          cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age)
          for o in data.get("orgs", []):
              slug = o.get("slug", "")
              if not slug.startswith(EPHEMERAL_PREFIXES):
                  continue
              created = o.get("created_at")
              if not created:
                  # Defensively skip rows without created_at — better
                  # to leave one orphan than nuke a brand-new row
                  # whose timestamp didn't render.
                  continue
              # Python 3.11+ handles RFC3339 with Z directly via
              # fromisoformat; older runners need the trailing Z swap.
              created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
              if created_dt < cutoff:
                  print(slug)
          PY
          count=$(wc -l < stale_slugs.txt | tr -d ' ')
          echo "Found $count stale e2e org(s) older than ${MAX_AGE_MINUTES}m"
          if [ "$count" -gt 0 ]; then
            echo "First 20:"
            head -20 stale_slugs.txt | sed 's/^/  /'
          fi
          echo "count=$count" >> "$GITHUB_OUTPUT"
      - name: Safety gate
        if: steps.identify.outputs.count != '0'
        run: |
          count="${{ steps.identify.outputs.count }}"
          if [ "$count" -gt "$SAFETY_CAP" ]; then
            echo "::error::Refusing to delete $count orgs in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means the CP admin API returned no created_at or returned a degraded result. Re-run with workflow_dispatch + max_age_minutes if intentional."
            exit 1
          fi
          echo "Within safety cap ($count ≤ $SAFETY_CAP) ✓"
      - name: Delete stale orgs
        if: steps.identify.outputs.count != '0' && env.DRY_RUN != 'true'
        run: |
          set -uo pipefail
          deleted=0
          failed=0
          while IFS= read -r slug; do
            [ -z "$slug" ] && continue
            # The DELETE handler requires {"confirm": "<slug>"} matching
            # the URL slug — fat-finger guard. Idempotent: re-issuing
            # picks up via org_purges.last_step.
            # Tempfile-routed -w + set +e/-e prevents curl-exit-code
            # pollution of the captured status (lint-curl-status-capture.yml).
            set +e
            curl -sS -o /tmp/del_resp -w "%{http_code}" \
              --max-time 60 \
              -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
              -H "Authorization: Bearer $ADMIN_TOKEN" \
              -H "Content-Type: application/json" \
              -d "{\"confirm\":\"$slug\"}" >/tmp/del_code
            set -e
            # Stderr from curl (-sS shows dial errors etc.) goes to runner log.
            http_code=$(cat /tmp/del_code 2>/dev/null || echo "000")
            if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
              deleted=$((deleted+1))
              echo "  deleted: $slug"
            else
              failed=$((failed+1))
              echo "  FAILED ($http_code): $slug — $(cat /tmp/del_resp 2>/dev/null | head -c 200)"
            fi
          done < stale_slugs.txt
          echo ""
          echo "Sweep summary: deleted=$deleted failed=$failed"
          # Don't fail the workflow on per-org delete errors — the
          # sweeper is best-effort. Next hourly tick re-attempts. We
          # only fail loud at the safety-cap gate above.
      - name: Sweep orphan tunnels
        # Stale-org cleanup deletes the org (which cascades to tunnel
        # delete inside the CP). But when that cascade fails partway —
        # CP transient 5xx after the org row is deleted but before the
        # CF tunnel delete completes — the tunnel persists with no
        # matching org row. The reconciler in internal/sweep flags this
        # as `cf_tunnel kind=orphan`, but nothing automatically reaps it.
        #
        # `/cp/admin/orphan-tunnels/cleanup` is the operator-triggered
        # reaper. Calling it here at the end of every sweep tick
        # converges the staging CF account to clean even when CP
        # cascades half-fail.
        #
        # PR #492 made the underlying DeleteTunnel actually check
        # status — pre-fix it silent-succeeded on CF code 1022
        # ("active connections"), so this step would have been a no-op
        # against stuck connectors. Post-fix the cleanup invokes
        # CleanupTunnelConnections + retry, which actually clears the
        # 1022 case. (#2987)
        #
        # Best-effort. Failure here doesn't fail the workflow — next
        # tick re-attempts. Errors flow to step output for ops review.
        if: env.DRY_RUN != 'true'
        run: |
          set +e
          curl -sS -o /tmp/cleanup_resp -w "%{http_code}" \
            --max-time 60 \
            -X POST "$MOLECULE_CP_URL/cp/admin/orphan-tunnels/cleanup" \
            -H "Authorization: Bearer $ADMIN_TOKEN" >/tmp/cleanup_code
          set -e
          http_code=$(cat /tmp/cleanup_code 2>/dev/null || echo "000")
          body=$(cat /tmp/cleanup_resp 2>/dev/null | head -c 500)
          if [ "$http_code" = "200" ]; then
            count=$(echo "$body" | python3 -c "import sys,json; d=json.loads(sys.stdin.read() or '{}'); print(d.get('deleted_count', 0))" 2>/dev/null || echo "0")
            failed_n=$(echo "$body" | python3 -c "import sys,json; d=json.loads(sys.stdin.read() or '{}'); print(len(d.get('failed') or {}))" 2>/dev/null || echo "0")
            echo "Orphan-tunnel sweep: deleted=$count failed=$failed_n"
          else
            echo "::warning::orphan-tunnels cleanup returned HTTP $http_code — body: $body"
          fi
      - name: Dry-run summary
        if: env.DRY_RUN == 'true'
        run: |
          echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s) AND triggered orphan-tunnels cleanup. Re-run with dry_run=false to actually delete."
      - name: Notify on sweep failure
        # Fail-loud companion to dropping `continue-on-error: true`.
        # If any prior step failed (missing token, CP 5xx, safety-cap
        # tripped, etc.) emit a clearly-tagged ::error:: line so the
        # Gitea runs UI + any log-tail consumer (Loki SOPRefireRule)
        # flags this. Without this step, an early `exit 2` shows as a
        # red run but the message can scroll past in busy log windows;
        # the explicit tag here is greppable from the orchestrator
        # triage loop.
        if: failure()
        run: |
          echo "::error::sweep-stale-e2e-orgs FAILED — staging tenants are LEAKING. See prior step logs. Common causes: (a) CP_STAGING_ADMIN_API_TOKEN secret missing/rotated, (b) staging-api.moleculesai.app 5xx, (c) safety-cap tripped (CP admin API returning malformed orgs). Manual cleanup of leaked EC2 + DNS may be required while this is broken."
          exit 1
--- a/.gitea/workflows/test-ops-scripts.yml
+++ b/.gitea/workflows/test-ops-scripts.yml
@ -0,0 +1,65 @@
 name: Ops Scripts Tests
 # Ported from .github/workflows/test-ops-scripts.yml on 2026-05-11 per
 # RFC internal#219 §1 sweep.
 #
 # Differences from the GitHub version:
 #   - Dropped `merge_group:` trigger (no Gitea merge queue).
 #   - on.paths references .gitea/workflows/test-ops-scripts.yml (this
 #     file) instead of the .github/ one.
 #   - Workflow-level env.GITHUB_SERVER_URL set.
 #   - `continue-on-error: true` on the job (RFC §1 contract).
 #
 # Runs the unittest suite for scripts/ on every PR + push that touches
 # anything under scripts/. Kept separate from the main CI so a script-only
 # change doesn't trigger the heavier Go/Canvas/Python pipelines.
 #
 # Discovery layout: tests sit alongside the code they test (see
 # scripts/ops/test_sweep_cf_decide.py for the pattern; scripts/
 # test_build_runtime_package.py for the rewriter coverage). The job
 # below runs `unittest discover` TWICE — once from `scripts/`, once
 # from `scripts/ops/` — because neither dir has an `__init__.py`, so
 # a single discover from `scripts/` doesn't recurse into the ops
 # subdir. Two passes is simpler than retrofitting namespace packages.
 on:
  push:
    branches: [main, staging]
    paths:
      - 'scripts/**'
      - '.gitea/workflows/test-ops-scripts.yml'
  pull_request:
    branches: [main, staging]
    paths:
      - 'scripts/**'
      - '.gitea/workflows/test-ops-scripts.yml'
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  test:
    name: Ops scripts (unittest)
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: '3.11'
      - name: Run scripts/ unittests (build_runtime_package, ...)
        # Top-level scripts/ tests live alongside their target file
        # (e.g. scripts/test_build_runtime_package.py exercises
        # scripts/build_runtime_package.py). discover from scripts/
        # picks up only top-level test_*.py because scripts/ops/ has
        # no __init__.py — that's intentional, so we run two passes.
        working-directory: scripts
        run: python -m unittest discover -t . -p 'test_*.py' -v
      - name: Run scripts/ops/ unittests (sweep_cf_decide, ...)
        working-directory: scripts/ops
        run: python -m unittest discover -p 'test_*.py' -v
--- a/.github/scripts/lint_secret_pattern_drift.py
+++ b/.github/scripts/lint_secret_pattern_drift.py
@ -28,7 +28,7 @@ import sys
 import urllib.request
 from pathlib import Path
-CANONICAL_FILE = Path(".github/workflows/secret-scan.yml")
+CANONICAL_FILE = Path(".gitea/workflows/secret-scan.yml")
 # Public consumer mirrors. Each entry is (label, raw_url) — raw_url
 # points at the file's RAW content on the consumer's default branch
--- a/.github/workflows/auto-promote-on-e2e.yml
+++ b/.github/workflows/auto-promote-on-e2e.yml
@ -1,467 +0,0 @@
 name: Auto-promote :latest after main image build
 # Retags `ghcr.io/molecule-ai/{platform,platform-tenant}:staging-<sha>`
 # → `:latest` after either the image build or E2E completes on a `main`
 # push, gated on E2E Staging SaaS not being red for that SHA.
 #
 # Why two triggers:
 #
 #   `publish-workspace-server-image` and `e2e-staging-saas` are both
 #   paths-filtered, but with DIFFERENT path sets:
 #
 #     publish-workspace-server-image:
 #       workspace-server/**, canvas/**, manifest.json
 #
 #     e2e-staging-saas (full lifecycle):
 #       workspace-server/internal/handlers/{registry,workspace_provision,
 #       a2a_proxy}.go, workspace-server/internal/middleware/**,
 #       workspace-server/internal/provisioner/**, tests/e2e/test_staging_full_saas.sh
 #
 #   The E2E set is a strict SUBSET of the publish set. So:
 #     - canvas/** changes → publish fires, E2E does not
 #     - workspace-server/cmd/** changes → publish fires, E2E does not
 #     - workspace-server/internal/sweep/** → publish fires, E2E does not
 #
 #   The previous version triggered ONLY on E2E completion, which meant
 #   non-E2E-path changes (canvas, cmd, sweep, etc.) rebuilt the image
 #   but never advanced `:latest`. Result: as of 2026-04-28 this workflow
 #   had run zero times since merge despite eight main pushes — `:latest`
 #   was ~7 hours / 9 PRs behind main with no human realising. See
 #   `molecule-core` Slack discussion 2026-04-28.
 #
 #   Adding `publish-workspace-server-image` as a second trigger closes
 #   the gap: any image rebuild on main eligibly advances `:latest`.
 #
 # Why E2E remains a kill-switch (not the trigger):
 #
 #   When E2E DID run for this SHA and ended red, we abort — `:latest`
 #   stays on the prior known-good digest. When E2E didn't run (paths
 #   filtered out), we proceed: pre-merge gates already validated this
 #   SHA on staging via auto-promote-staging requiring CI + E2E Canvas +
 #   E2E API + CodeQL all green. Image content for non-E2E-paths
 #   (canvas, cmd, sweep) is exercised by those staging gates.
 #
 # Why `main` only:
 #
 #   `:latest` is what prod tenants pull. We only want SHAs that have
 #   reached main (via auto-promote-staging) to advance `:latest`.
 #   Triggering on staging would let a staging-only revert advance
 #   `:latest` to a SHA that never reaches main, breaking the "production
 #   runs what's on main" invariant.
 #
 # Idempotency:
 #
 #   When a SHA touches paths that match BOTH publish and E2E, both
 #   workflows fire and complete. Both trigger this workflow on
 #   completion → two runs race. Both retag `:staging-<sha>` →
 #   `:latest`. crane tag is idempotent (re-tagging the same digest is a
 #   no-op), so the second run is harmless. concurrency group serializes
 #   them anyway.
 on:
  workflow_run:
    workflows:
      - 'E2E Staging SaaS (full lifecycle)'
      - 'publish-workspace-server-image'
    types: [completed]
    branches: [main]
  workflow_dispatch:
    inputs:
      sha:
        description: 'Short sha to promote (override; defaults to upstream workflow_run head_sha)'
        required: false
        type: string
 permissions:
  contents: read
  packages: write
 concurrency:
  # Serialize promotes per-SHA so the publish+E2E both-fired race lands
  # cleanly. Different SHAs can promote in parallel.
  group: auto-promote-latest-${{ github.event.workflow_run.head_sha || github.event.inputs.sha || github.sha }}
  cancel-in-progress: false
 env:
  IMAGE_NAME: ghcr.io/molecule-ai/platform
  TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant
 jobs:
  promote:
    # Proceed if upstream succeeded OR manual dispatch. Upstream-failure
    # paths are filtered here; the E2E-was-red kill-switch lives in the
    # gate-check step below (covers the case where upstream is publish
    # success but E2E for the same SHA failed).
    if: |
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
    runs-on: ubuntu-latest
    steps:
      - name: Compute short sha
        id: sha
        run: |
          set -euo pipefail
          if [ -n "${{ github.event.inputs.sha }}" ]; then
            FULL="${{ github.event.inputs.sha }}"
          else
            FULL="${{ github.event.workflow_run.head_sha }}"
          fi
          echo "short=${FULL:0:7}" >> "$GITHUB_OUTPUT"
          echo "full=${FULL}" >> "$GITHUB_OUTPUT"
      - name: Gate — E2E Staging SaaS state for this SHA
        # When upstream IS E2E success, we know it's green (filtered by
        # the job-level `if` already). When upstream is publish, look up
        # E2E state for the same SHA. Four buckets:
        #
        #   - completed/success: E2E confirmed safe → proceed
        #   - completed/failure|cancelled|timed_out: E2E found a
        #     regression → ABORT (exit 1), `:latest` stays put
        #   - in_progress|queued|requested: E2E is RACING with publish
        #     for a runtime-touching SHA. publish typically completes
        #     ~5-10min before E2E (~10-15min). If we promote on the
        #     publish signal here, a later E2E failure can't roll back
        #     `:latest` — it'd already be wrongly advanced. So we DEFER:
        #     skip subsequent steps (proceed=false) and let E2E's own
        #     completion event re-fire this workflow, which then takes
        #     the upstream-is-E2E path. exit 0 so the run shows as
        #     success rather than a noisy fake-failure.
        #   - none/none: E2E was paths-filtered out for this SHA (the
        #     change touched canvas/cmd/sweep/etc. — paths covered by
        #     publish but not by E2E). pre-merge gates on staging
        #     already validated this SHA → proceed.
        #
        # Manual dispatch skips this check — operator override.
        id: gate
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          SHA: ${{ steps.sha.outputs.full }}
          UPSTREAM_NAME: ${{ github.event.workflow_run.name }}
          EVENT_NAME: ${{ github.event_name }}
        run: |
          set -euo pipefail
          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
            echo "proceed=true" >> "$GITHUB_OUTPUT"
            echo "::notice::Manual dispatch — skipping E2E gate (operator override)"
            exit 0
          fi
          if [ "$UPSTREAM_NAME" = "E2E Staging SaaS (full lifecycle)" ]; then
            echo "proceed=true" >> "$GITHUB_OUTPUT"
            echo "::notice::Upstream is E2E itself (success per job-level if) — gate trivially satisfied"
            exit 0
          fi
          # Upstream is publish-workspace-server-image. Check E2E state
          # for the same SHA via Gitea's commit-status API.
          #
          # GitHub-era this was `gh run list --workflow=X --commit=SHA
          # --json status,conclusion` returning either `[]` (no run on
          # this SHA) or `[{status, conclusion}]` (the run's state).
          # Gitea has NO workflow-runs API at all — `/api/v1/repos/.../
          # actions/runs` returns 404 (verified 2026-05-07, issue #75).
          # However Gitea Actions DOES emit a commit status per workflow
          # job, with `context = "<Workflow Name> / <Job Name> (<event>)"`,
          # which is exactly what we need: each E2E run leg becomes one
          # status row on the SHA, and the aggregate state encodes the
          # run's outcome.
          #
          # Mapping:
          #   0 matched contexts          → "none/none"      (E2E paths-
          #                                                    filtered
          #                                                    out — same
          #                                                    semantic
          #                                                    as before)
          #   any context = pending       → "in_progress/none" (defer)
          #   any context = error|failure → "completed/failure" (abort)
          #   all contexts = success      → "completed/success" (proceed)
          #
          # The "completed/cancelled" and "completed/timed_out" buckets
          # don't have direct Gitea analogs (Gitea statuses are
          # success / failure / error / pending / warning). Per-SHA
          # concurrency cancellation surfaces as `error` on Gitea, which
          # we map to "completed/failure" rather than "completed/cancelled"
          # — losing the soft-defer semantic of the cancelled bucket on
          # this fleet. Tradeoff: the staleness alarm (auto-promote-stale-
          # alarm.yml) still catches a stuck :latest within 4h, and a
          # legitimate cancel is rare enough that aborting + manual
          # re-dispatch is acceptable. If we measure cancel frequency
          # > 1/week, revisit by reading the run-step-summary text via
          # a follow-up script.
          #
          # Network or auth blips collapse to "none/none" via the curl
          # `|| true` fallback, matching the pre-Gitea behaviour where
          # an empty list also degenerated to none/none.
          GITEA_API_URL="${GITHUB_SERVER_URL:-https://git.moleculesai.app}/api/v1"
          STATUSES_JSON=$(curl --fail-with-body -sS \
            -H "Authorization: token ${GH_TOKEN}" \
            -H "Accept: application/json" \
            "${GITEA_API_URL}/repos/${REPO}/commits/${SHA}/statuses?limit=100" \
            2>/dev/null || echo "[]")
          RESULT=$(printf '%s' "$STATUSES_JSON" | jq -r '
            # Filter to E2E Staging SaaS (full lifecycle) statuses.
            # Match by leading workflow-name prefix so the "<job>
            # (<event>)" tail is irrelevant. Gitea emits the workflow
            # name verbatim from the YAML `name:` field.
            [.[] | select(.context | startswith("E2E Staging SaaS (full lifecycle) /"))] as $rows
            | if ($rows | length) == 0 then
                "none/none"
              elif any($rows[]; .status == "pending") then
                "in_progress/none"
              elif any($rows[]; .status == "failure" or .status == "error") then
                "completed/failure"
              elif all($rows[]; .status == "success") then
                "completed/success"
              else
                # Mixed / unknown — fall through to *) bucket below.
                "completed/" + ($rows[0].status // "unknown")
              end
          ' 2>/dev/null || echo "none/none")
          echo "E2E Staging SaaS for ${SHA:0:7}: $RESULT"
          case "$RESULT" in
            completed/success)
              echo "proceed=true" >> "$GITHUB_OUTPUT"
              echo "::notice::E2E green for this SHA — proceeding with promote"
              ;;
            completed/failure|completed/timed_out)
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ❌ Auto-promote aborted — E2E Staging SaaS failed"
                echo
                echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`"
                echo "\`:latest\` stays on the prior known-good digest."
                echo
                echo "If the failure was a flake, manually dispatch this workflow with the same sha to override."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
            completed/cancelled)
              # GitHub-era only: cancelled ≠ failure. Gitea statuses
              # don't expose a "cancelled" state — a per-SHA concurrency
              # cancellation surfaces as `failure` or `error` on Gitea
              # and is now handled by the failure branch above. This
              # arm is kept for backwards compatibility / dual-host
              # operation (if we ever add a non-Gitea fallback) but
              # under the post-#75 flow it's unreachable.
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ⏭ Auto-promote deferred — E2E Staging SaaS was cancelled"
                echo
                echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`"
                echo "Likely per-SHA concurrency (newer push superseded this E2E run)."
                echo "The newer SHA's E2E will fire its own promote when it lands."
                echo "If you need this specific SHA promoted, manually dispatch."
              } >> "$GITHUB_STEP_SUMMARY"
              ;;
            in_progress/*|queued/*|requested/*|waiting/*|pending/*)
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ⏳ Auto-promote deferred — E2E Staging SaaS still running"
                echo
                echo "Publish completed before E2E for \`${SHA:0:7}\` (state: \`$RESULT\`)."
                echo "Skipping retag here — E2E's own completion event will re-fire this workflow."
                echo "If E2E ends green, that run promotes \`:latest\`. If red, it aborts."
              } >> "$GITHUB_STEP_SUMMARY"
              ;;
            none/none)
              echo "proceed=true" >> "$GITHUB_OUTPUT"
              echo "::notice::E2E paths-filtered out for this SHA — pre-merge staging gates carry"
              ;;
            *)
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ❓ Auto-promote aborted — unexpected E2E state"
                echo
                echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\` (unhandled)"
                echo "Manual investigation needed; re-dispatch with the same sha once resolved."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
          esac
      - if: steps.gate.outputs.proceed == 'true'
        uses: imjasonh/setup-crane@6da1ae018866400525525ce74ff892880c099987 # v0.5
      - name: GHCR login
        if: steps.gate.outputs.proceed == 'true'
        run: |
          echo "${{ secrets.GITHUB_TOKEN }}" | \
            crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin
      - name: Verify :staging-<sha> exists for both images
        # Better to fail fast with a clear message than to half-tag
        # (platform retagged but platform-tenant missing → tenants pull
        # a stale image).
        if: steps.gate.outputs.proceed == 'true'
        run: |
          set -euo pipefail
          for img in "${IMAGE_NAME}" "${TENANT_IMAGE_NAME}"; do
            tag="${img}:staging-${{ steps.sha.outputs.short }}"
            if ! crane manifest "$tag" >/dev/null 2>&1; then
              echo "::error::Missing tag: $tag"
              echo "::error::publish-workspace-server-image must complete on this SHA before auto-promote can retag :latest."
              exit 1
            fi
            echo "  ok: $tag exists"
          done
      - name: Ancestry check — refuse to promote :latest backwards
        # #2244: workflow_run completions arrive in arbitrary order. If
        # SHA-A and SHA-B both reach main within ~10 min and SHA-B's E2E
        # completes before SHA-A's, this workflow can fire for SHA-A
        # AFTER it already promoted SHA-B → :latest goes backwards. The
        # orphan-reconciler "next run corrects it" doesn't apply: there's
        # no auto-corrective re-promote, :latest stays wrong until the
        # next main push lands.
        #
        # Detection: read current :latest's `org.opencontainers.image.revision`
        # label (set by publish-workspace-server-image.yml at build time)
        # and ask the GitHub compare API whether the candidate SHA is
        # ahead-of / identical-to / behind / diverged-from current.
        # Hard-fail on `behind` and `diverged` per the approved design —
        # silent-bypass is the class we're moving away from. Workflow
        # goes red, oncall sees it, operator decides how to recover
        # (manual dispatch with the right SHA, force-promote, etc.).
        #
        # Manual dispatch skips this check — operator override semantics
        # match the gate-check step above.
        #
        # Backward-compat: when current :latest carries no revision
        # label (legacy image pre-publish-with-label), skip-with-warning.
        # All :latest images on main are post-label as of 2026-04-29, so
        # this branch will be dead within 90 days; remove then.
        if: steps.gate.outputs.proceed == 'true' && github.event_name != 'workflow_dispatch'
        id: ancestry
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          TARGET_SHA: ${{ steps.sha.outputs.full }}
        run: |
          set -euo pipefail
          # Read the current :latest config and pull the revision label.
          # `crane config` returns the OCI image config blob (not the manifest);
          # labels live under `.config.Labels`. `// empty` makes jq return ""
          # rather than the literal "null" so the test below works.
          CURRENT_REVISION=$(crane config "${IMAGE_NAME}:latest" 2>/dev/null \
            | jq -r '.config.Labels["org.opencontainers.image.revision"] // empty' \
            || true)
          if [ -z "$CURRENT_REVISION" ]; then
            echo "decision=skip-no-label" >> "$GITHUB_OUTPUT"
            {
              echo "## ⚠ Ancestry check skipped — current :latest has no revision label"
              echo
              echo "Likely a legacy image built before \`org.opencontainers.image.revision\` was set."
              echo "Falling through to retag. After all \`:latest\` images are post-label (TODO 90 days), this branch is dead and should be removed."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "::warning::Current :latest carries no revision label — skipping ancestry check (legacy image)"
            exit 0
          fi
          if [ "$CURRENT_REVISION" = "$TARGET_SHA" ]; then
            echo "decision=identical" >> "$GITHUB_OUTPUT"
            echo "::notice:::latest already at ${TARGET_SHA:0:7} — retag will be a no-op"
            exit 0
          fi
          # Ask GitHub which side of the merge graph TARGET_SHA sits on
          # relative to CURRENT_REVISION. Returns one of: ahead | identical
          # | behind | diverged. Network or auth errors collapse to "error"
          # via the explicit fallback so the case below always matches.
          STATUS=$(gh api \
            "repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}" \
            --jq '.status' 2>/dev/null || echo "error")
          echo "ancestry compare ${CURRENT_REVISION:0:7} → ${TARGET_SHA:0:7}: $STATUS"
          case "$STATUS" in
            ahead)
              echo "decision=ahead" >> "$GITHUB_OUTPUT"
              echo "::notice::Target ${TARGET_SHA:0:7} is ahead of current :latest (${CURRENT_REVISION:0:7}) — proceeding with retag"
              ;;
            identical)
              echo "decision=identical" >> "$GITHUB_OUTPUT"
              echo "::notice::Target identical to :latest — retag will be a no-op"
              ;;
            behind)
              echo "decision=behind" >> "$GITHUB_OUTPUT"
              {
                echo "## ❌ Auto-promote refused — target is BEHIND current :latest"
                echo
                echo "| Field | Value |"
                echo "|---|---|"
                echo "| Target SHA | \`$TARGET_SHA\` |"
                echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
                echo "| GitHub compare status | \`behind\` |"
                echo
                echo "This guard catches the workflow_run-completion-order race (#2244):"
                echo "two rapid main pushes whose E2Es complete out-of-order can otherwise"
                echo "promote \`:latest\` backwards. \`:latest\` stays on \`${CURRENT_REVISION:0:7}\`."
                echo
                echo "**Recovery:** if this is a legitimate revert that should land on \`:latest\`,"
                echo "manually dispatch this workflow with the target sha as input — the manual-dispatch"
                echo "path skips the ancestry check (operator override)."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
            diverged)
              echo "decision=diverged" >> "$GITHUB_OUTPUT"
              {
                echo "## ❓ Auto-promote refused — history diverged"
                echo
                echo "| Field | Value |"
                echo "|---|---|"
                echo "| Target SHA | \`$TARGET_SHA\` |"
                echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
                echo "| GitHub compare status | \`diverged\` |"
                echo
                echo "Likely cause: force-push rewrote main's history, leaving the previous"
                echo "\`:latest\` revision orphaned. Needs human review before \`:latest\` advances."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
            error|*)
              echo "decision=error" >> "$GITHUB_OUTPUT"
              {
                echo "## ❌ Auto-promote aborted — ancestry-check API error"
                echo
                echo "\`gh api repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}\` returned unexpected status: \`$STATUS\`"
                echo
                echo "Manual dispatch with the target sha bypasses this check."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
          esac
      - name: Retag platform :staging-<sha> → :latest
        if: steps.gate.outputs.proceed == 'true'
        run: |
          crane tag "${IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
      - name: Retag tenant :staging-<sha> → :latest
        if: steps.gate.outputs.proceed == 'true'
        run: |
          crane tag "${TENANT_IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
      - name: Summary
        if: steps.gate.outputs.proceed == 'true'
        run: |
          {
            echo "## :latest promoted to ${{ steps.sha.outputs.short }}"
            echo
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
              echo "- Trigger: manual dispatch"
            else
              echo "- Upstream: \`${{ github.event.workflow_run.name }}\` ([run](${{ github.event.workflow_run.html_url }}))"
            fi
            echo "- platform:staging-${{ steps.sha.outputs.short }} → :latest"
            echo "- platform-tenant:staging-${{ steps.sha.outputs.short }} → :latest"
            echo
            echo "Tenant fleet auto-pulls within 5 min via IMAGE_AUTO_REFRESH=true."
            echo "Force immediate fanout: dispatch redeploy-tenants-on-main.yml."
          } >> "$GITHUB_STEP_SUMMARY"
--- a/.github/workflows/auto-promote-staging.yml
+++ b/.github/workflows/auto-promote-staging.yml
@ -1,492 +0,0 @@
 name: Auto-promote staging → main
 # Fires after any of the staging-branch quality gates complete. When ALL
 # required gates are green on the same staging SHA, opens (or re-uses)
 # a PR `staging → main` and schedules Gitea auto-merge so the PR lands
 # automatically once approval + status checks are satisfied.
 #
 # ============================================================
 # What this workflow does
 # ============================================================
 #
 # 1. On a workflow_run completion event for one of the staging gate
 #    workflows (CI, E2E Staging Canvas, E2E API Smoke, CodeQL),
 #    checks if the combined status on the staging head SHA is green.
 # 2. If green, opens (or re-uses) a PR `head: staging → base: main`
 #    via Gitea REST `POST /api/v1/repos/.../pulls`.
 # 3. Schedules auto-merge via `POST /api/v1/repos/.../pulls/{index}/merge`
 #    with `merge_when_checks_succeed: true`. Gitea waits for the
 #    approval requirement on `main` (`required_approvals: 1`) and
 #    the status-check gates, then merges.
 # 4. The merge commit lands on `main` and fires
 #    `publish-workspace-server-image.yml` naturally via its
 #    `on: push: branches: [main]` trigger — no explicit dispatch
 #    needed (see "Why no workflow_dispatch tail" below).
 #
 # `auto-sync-main-to-staging.yml` is the reverse-direction
 # counterpart (main → staging, fast-forward push). Together they
 # keep the staging-superset-of-main invariant tight.
 #
 # ============================================================
 # Why Gitea REST (and not `gh pr create`)
 # ============================================================
 #
 # Pre-2026-05-06 this workflow used `gh pr create`, `gh pr merge --auto`,
 # `gh run list`, and `gh workflow run` against GitHub. After the
 # GitHub→Gitea cutover those calls fail because:
 #
 #   - `gh pr create / merge / view / list` route to GitHub GraphQL
 #     (`/api/graphql`). Gitea does not expose a GraphQL endpoint;
 #     every call returns `HTTP 405 Method Not Allowed` — same root
 #     cause as #65 (auto-sync) which PR #66 fixed by dropping `gh`
 #     entirely.
 #   - `gh run list --workflow=...` GitHub-shape; Gitea has the
 #     simpler `GET /repos/.../commits/{ref}/status` combined-status
 #     endpoint instead.
 #   - `gh workflow run X.yml` calls `POST /repos/.../actions/workflows/{id}/dispatches`,
 #     which does NOT exist on Gitea 1.22.6 (verified via swagger.v1.json).
 #
 # So this workflow uses direct `curl` calls to Gitea REST. No `gh`
 # CLI dependency, no GraphQL, no missing-endpoint footgun.
 #
 # ============================================================
 # Why no workflow_dispatch tail (was load-bearing on GitHub, dead on Gitea)
 # ============================================================
 #
 # The GitHub-era version had a 60-line polling step that waited for
 # the promote PR to merge, then explicitly dispatched
 # `publish-workspace-server-image.yml` on `--ref main`. That step
 # existed because GitHub's GITHUB_TOKEN-initiated merges suppress
 # downstream `on: push` workflows (the documented "no recursion" rule
 # — https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow).
 # The explicit dispatch was the workaround.
 #
 # Gitea Actions does NOT have this no-recursion rule. PR #66's auto-
 # sync merge to main fired `auto-promote-staging` on the next push
 # trigger naturally. So the cascade fires on the natural push event;
 # the explicit dispatch is dead code. (And even if we wanted to
 # preserve it, Gitea has no `workflow_dispatch` REST endpoint.)
 #
 # Removed in this rewrite. If we ever observe the cascade misfire,
 # operator can push an empty commit to `main` to wake it.
 #
 # ============================================================
 # Why open a PR (and not direct push)
 # ============================================================
 #
 # `main` branch protection has `enable_push: false` with NO
 # `push_whitelist_usernames`. Direct push is impossible for any
 # persona, including admins. PR-mediated merge is the only path,
 # which is intentional: prod state mutations (and staging→main IS a
 # prod mutation, since the next deploy fans out to tenants) require
 # Hongming's approval per `feedback_prod_apply_needs_hongming_chat_go`.
 #
 # The auto-merge schedule preserves this gate: `merge_when_checks_succeed`
 # does NOT bypass `required_approvals: 1`. Gitea waits for BOTH
 # approval AND green checks before merging. Hongming reviews via the
 # canvas/chat-handle of the PR notification, approves, and Gitea
 # auto-merges within seconds.
 #
 # ============================================================
 # Identity + token (anti-bot-ring per saved-memory
 # `feedback_per_agent_gitea_identity_default`)
 # ============================================================
 #
 # This workflow uses `secrets.AUTO_SYNC_TOKEN` — a personal access
 # token issued to the `devops-engineer` Gitea persona. NOT the
 # founder PAT. The bot-ring fingerprint that triggered the GitHub
 # org suspension on 2026-05-06 was characterised by founder PAT
 # acting as CI at machine speed.
 #
 # Token scope: `push: true` (read+write) on this repo. The persona
 # can: open PRs, comment on PRs, schedule auto-merge. The persona
 # CANNOT bypass main's branch protection (`required_approvals: 1`
 # still applies — only Hongming's review unblocks merge).
 #
 # Authorship: the PR is opened by `devops-engineer`; the merge
 # commit credits Hongming-as-approver and `devops-engineer` as
 # the merger.
 #
 # ============================================================
 # Failure modes & operational notes
 # ============================================================
 #
 # A — staging gates not all green at trigger time:
 #     - The combined-status check returns `state: pending|failure`.
 #       Workflow exits 0 with a step-summary "not all green; staying
 #       on current main". Re-fires on the next gate completion.
 #
 # B — Gitea PR-create returns non-201 (e.g. 422 already-exists):
 #     - Idempotent: the workflow first GETs the existing open
 #       staging→main PR. If found, reuse it; if not, POST a new one.
 #       422 should never surface; if it does (race), step summary
 #       captures the body and the next workflow_run picks up.
 #
 # C — `merge_when_checks_succeed` schedule fails:
 #     - 422 with "Pull request is not mergeable" if there are
 #       conflicts or stale base. Step summary surfaces it; operator
 #       (or `auto-sync-main-to-staging`) needs to bring staging up
 #       to date with main first. Workflow exits 1 to surface red.
 #
 # D — `AUTO_SYNC_TOKEN` rotated / wrong scope:
 #     - 401/403 on first REST call. Step summary surfaces it.
 #       Re-issue the token from `~/.molecule-ai/personas/` on the
 #       operator host and update the repo Actions secret.
 #
 # ============================================================
 # Loop safety
 # ============================================================
 #
 # When the promote PR merges to main, `auto-sync-main-to-staging.yml`
 # fires (on:push:main) and pushes the merge commit back to staging.
 # That push to staging is by `devops-engineer`, NOT this workflow's
 # token, and triggers the staging gate workflows. When they all
 # complete, we end up back here — but the tree-diff guard catches
 # it: staging tree == main tree (the merge commit changes nothing),
 # so we skip and the cycle terminates.
 on:
  workflow_run:
    workflows:
      - CI
      - E2E Staging Canvas (Playwright)
      - E2E API Smoke Test
      - CodeQL
    types: [completed]
  workflow_dispatch:
    inputs:
      force:
        description: "Force promote even when AUTO_PROMOTE_ENABLED is unset (manual override)"
        required: false
        default: "false"
 permissions:
  contents: read
  pull-requests: write
 # Serialize auto-promote runs. Multiple staging gate completions can land
 # in quick succession (CI + E2E + CodeQL all finish within seconds of
 # each other on a green PR) — without this, two parallel runs both:
 #   1. Would race the GET-or-POST PR step.
 #   2. Would both call merge-schedule (idempotent — fine on Gitea).
 # cancel-in-progress: false because the second run on a fresh staging
 # tip should NOT kill the first which has already opened the PR.
 concurrency:
  group: auto-promote-staging
  cancel-in-progress: false
 jobs:
  check-all-gates-green:
    # Only consider staging pushes. PRs into staging don't promote.
    if: >
      (github.event_name == 'workflow_run' &&
       github.event.workflow_run.head_branch == 'staging' &&
       github.event.workflow_run.event == 'push')
      || github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
      all_green: ${{ steps.gates.outputs.all_green }}
      head_sha: ${{ steps.gates.outputs.head_sha }}
    steps:
      # Skip empty-tree promotes (the perpetual auto-promote↔auto-sync
      # cycle observed pre-cutover on GitHub). On Gitea the cycle shape
      # is different (auto-sync uses fast-forward, no merge commit),
      # but the tree-diff guard is cheap insurance and protects against
      # any future merge-style regression.
      - name: Checkout for tree-diff check
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          ref: staging
      - name: Skip if staging tree == main tree (cycle-break safety)
        id: tree-diff
        env:
          HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
        run: |
          set -eu
          git fetch origin main --depth=50 || { echo "::warning::git fetch main failed — proceeding (fail-open)"; exit 0; }
          if git diff --quiet origin/main "$HEAD_SHA" -- 2>/dev/null; then
            {
              echo "## Skipped — no code to promote"
              echo
              echo "staging tip (\`${HEAD_SHA:0:8}\`) and \`main\` have identical trees."
              echo "Skipping to avoid opening an empty promote PR."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "::notice::auto-promote: staging tree == main tree — no code to promote, skipping"
            echo "skip=true" >> "$GITHUB_OUTPUT"
          else
            echo "skip=false" >> "$GITHUB_OUTPUT"
          fi
      - name: Check combined status on staging head
        if: steps.tree-diff.outputs.skip != 'true'
        id: gates
        env:
          GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
          HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
          REPO: ${{ github.repository }}
          GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
        run: |
          set -euo pipefail
          # Gitea-native combined-status endpoint aggregates every
          # check context attached to a SHA. This is structurally
          # cleaner than the GitHub-era per-workflow `gh run list`
          # loop because:
          #
          #   1. There's no risk of "workflow name collision" (the
          #      GitHub-era code had to switch from `--workflow=NAME`
          #      to `--workflow=FILE.YML` to disambiguate "CodeQL"
          #      between the explicit workflow and GitHub's UI-
          #      configured default setup; Gitea has no such
          #      duplicate-name surface).
          #   2. Gitea's combined state already encodes the AND
          #      across all contexts: success only if EVERY context
          #      is success. Pending or failure on any context
          #      produces non-success state.
          #
          # See https://docs.gitea.com/api/1.22 for the schema —
          # `state` is one of: success, pending, failure, error.
          echo "head_sha=${HEAD_SHA}" >> "$GITHUB_OUTPUT"
          echo "Checking combined status on SHA ${HEAD_SHA}"
          # `set +o pipefail` for the http-code capture pattern; restore
          # immediately. Pattern hardened per `feedback_curl_status_capture_pollution`.
          BODY_FILE=$(mktemp)
          set +e
          STATUS=$(curl -sS \
            -H "Authorization: token ${GITEA_TOKEN}" \
            -H "Accept: application/json" \
            -o "${BODY_FILE}" \
            -w "%{http_code}" \
            "${GITEA_HOST}/api/v1/repos/${REPO}/commits/${HEAD_SHA}/status")
          CURL_RC=$?
          set -e
          if [ "${CURL_RC}" -ne 0 ] || [ "${STATUS}" != "200" ]; then
            echo "::error::combined-status fetch failed: curl=${CURL_RC} http=${STATUS}"
            cat "${BODY_FILE}" | head -c 500 || true
            rm -f "${BODY_FILE}"
            echo "all_green=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          STATE=$(jq -r '.state // "missing"' < "${BODY_FILE}")
          TOTAL=$(jq -r '.total_count // 0' < "${BODY_FILE}")
          rm -f "${BODY_FILE}"
          echo "Combined status: state=${STATE} total_count=${TOTAL}"
          if [ "${STATE}" = "success" ] && [ "${TOTAL}" -gt 0 ]; then
            echo "all_green=true" >> "$GITHUB_OUTPUT"
            echo "::notice::All gates green on ${HEAD_SHA} (${TOTAL} contexts)"
          else
            echo "all_green=false" >> "$GITHUB_OUTPUT"
            {
              echo "## Not promoting — combined status not green"
              echo
              echo "- SHA: \`${HEAD_SHA:0:8}\`"
              echo "- Combined state: \`${STATE}\`"
              echo "- Context count: ${TOTAL}"
              echo
              echo "Will re-fire on the next gate completion. Investigate any red gate via the Actions UI."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "::notice::auto-promote: combined status is ${STATE} on ${HEAD_SHA} — staying on current main"
          fi
  promote:
    needs: check-all-gates-green
    if: needs.check-all-gates-green.outputs.all_green == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Check rollout gate
        env:
          AUTO_PROMOTE_ENABLED: ${{ vars.AUTO_PROMOTE_ENABLED }}
          FORCE_INPUT: ${{ github.event.inputs.force }}
        run: |
          set -eu
          # Repo variable AUTO_PROMOTE_ENABLED=true flips this on. While
          # it's unset, the workflow dry-runs (logs what it would have
          # done) but doesn't open the promote PR. Set the variable in
          # Settings → Actions → Variables.
          if [ "${AUTO_PROMOTE_ENABLED:-}" != "true" ] && [ "${FORCE_INPUT:-false}" != "true" ]; then
            {
              echo "## Auto-promote disabled"
              echo
              echo "Repo variable \`AUTO_PROMOTE_ENABLED\` is not set to \`true\`."
              echo "All gates are green on staging; would have opened a promote PR to \`main\`."
              echo
              echo "To enable: Settings → Actions → Variables → \`AUTO_PROMOTE_ENABLED=true\`."
              echo "To test once manually: workflow_dispatch with \`force=true\`."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "::notice::auto-promote disabled — dry run only"
            exit 0
          fi
      - name: Open or reuse promote PR + schedule auto-merge
        if: ${{ vars.AUTO_PROMOTE_ENABLED == 'true' || github.event.inputs.force == 'true' }}
        env:
          GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
          REPO: ${{ github.repository }}
          TARGET_SHA: ${{ needs.check-all-gates-green.outputs.head_sha }}
          GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
        run: |
          set -euo pipefail
          API="${GITEA_HOST}/api/v1/repos/${REPO}"
          AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json")
          # http_status_get RESULT_VAR URL
          # Sets RESULT_VAR to "<http_code>:<body_file>". Curl status
          # capture pattern per `feedback_curl_status_capture_pollution`:
          # http_code goes to its own tempfile-equivalent (-w), body to
          # another tempfile, set +e/-e bracket protects pipeline state.
          http_get() {
            local body_file="$1"; shift
            local url="$1"; shift
            set +e
            local code
            code=$(curl -sS "${AUTH[@]}" -o "${body_file}" -w "%{http_code}" "${url}")
            local rc=$?
            set -e
            if [ "${rc}" -ne 0 ]; then
              echo "::error::curl GET failed (rc=${rc}) on ${url}"
              return 99
            fi
            echo "${code}"
          }
          http_post_json() {
            local body_file="$1"; shift
            local data="$1"; shift
            local url="$1"; shift
            set +e
            local code
            code=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
              -X POST -d "${data}" -o "${body_file}" -w "%{http_code}" "${url}")
            local rc=$?
            set -e
            if [ "${rc}" -ne 0 ]; then
              echo "::error::curl POST failed (rc=${rc}) on ${url}"
              return 99
            fi
            echo "${code}"
          }
          # Step 1: look for an existing open staging→main promote PR
          # (idempotent on workflow re-run). Gitea doesn't have a
          # head/base filter on the list endpoint that's as ergonomic
          # as gh's, but the dedicated `/pulls/{base}/{head}` lookup
          # works.
          BODY=$(mktemp)
          STATUS=$(http_get "${BODY}" "${API}/pulls/main/staging") || true
          PR_NUM=""
          if [ "${STATUS}" = "200" ]; then
            STATE=$(jq -r '.state // "missing"' < "${BODY}")
            if [ "${STATE}" = "open" ]; then
              PR_NUM=$(jq -r '.number // ""' < "${BODY}")
              echo "::notice::Re-using existing open promote PR #${PR_NUM}"
            fi
          fi
          rm -f "${BODY}"
          # Step 2: if no open PR, create one.
          if [ -z "${PR_NUM}" ]; then
            TITLE="staging → main: auto-promote ${TARGET_SHA:0:7}"
            BODY_TEXT=$(cat <<EOFBODY
          Automated promotion of \`staging\` (\`${TARGET_SHA:0:8}\`) to \`main\`. All required staging gates are green at this SHA (combined status reported success).
          This PR is auto-generated by \`.github/workflows/auto-promote-staging.yml\` whenever every required gate completes green on the same staging SHA.
          **Approval gate:** \`main\` branch protection requires 1 approval before this can land. Once approved, Gitea will auto-merge (the workflow scheduled \`merge_when_checks_succeed: true\` immediately after open).
          The reverse-direction sync (the merge commit on \`main\` → \`staging\`) is handled automatically by \`auto-sync-main-to-staging.yml\` after this PR lands.
          ---
          - Source: staging at \`${TARGET_SHA}\`
          - Opened by: \`devops-engineer\` persona (anti-bot-ring; never founder PAT)
          - Refs: #65, #73, #195
          EOFBODY
          )
            REQ=$(jq -n \
              --arg title "${TITLE}" \
              --arg body "${BODY_TEXT}" \
              --arg base "main" \
              --arg head "staging" \
              '{title:$title, body:$body, base:$base, head:$head}')
            BODY=$(mktemp)
            STATUS=$(http_post_json "${BODY}" "${REQ}" "${API}/pulls")
            if [ "${STATUS}" = "201" ]; then
              PR_NUM=$(jq -r '.number // ""' < "${BODY}")
              echo "::notice::Opened promote PR #${PR_NUM}"
            else
              echo "::error::Failed to create promote PR: HTTP ${STATUS}"
              jq -r '.message // .' < "${BODY}" | head -c 500
              rm -f "${BODY}"
              exit 1
            fi
            rm -f "${BODY}"
          fi
          # Step 3: schedule auto-merge. merge_when_checks_succeed
          # tells Gitea to wait for both:
          #   - all required status checks to pass
          #   - the required-approvals gate (1 approval on main)
          # before merging. On approval+green, Gitea merges within
          # seconds. On any check failing or approval being denied,
          # the schedule stays armed but doesn't fire.
          #
          # Idempotent: re-arming on an already-armed PR is a no-op.
          REQ=$(jq -n '{Do:"merge", merge_when_checks_succeed:true}')
          BODY=$(mktemp)
          STATUS=$(http_post_json "${BODY}" "${REQ}" "${API}/pulls/${PR_NUM}/merge")
          # Gitea returns:
          #   - 200/204 on successful immediate merge (gates already green AND approved)
          #   - 405 "Please try again later" when scheduled successfully but waiting
          #   - 422 on "Pull request is not mergeable" (conflict, stale base, etc.)
          #
          # 405 here is benign — Gitea's way of saying "scheduled, not merging now".
          # We treat 200/204/405 as success, anything else as failure.
          case "${STATUS}" in
            200|204)
              MERGE_OUTCOME="merged-immediately"
              echo "::notice::Promote PR #${PR_NUM} merged immediately (gates+approval already green)"
              ;;
            405)
              MERGE_OUTCOME="auto-merge-scheduled"
              echo "::notice::Promote PR #${PR_NUM}: auto-merge scheduled (Gitea will land on approval+green)"
              ;;
            422)
              MERGE_OUTCOME="not-mergeable"
              echo "::warning::Promote PR #${PR_NUM}: not mergeable (conflict, stale base, or already merging)."
              jq -r '.message // .' < "${BODY}" | head -c 500
              ;;
            *)
              echo "::error::Unexpected status ${STATUS} on merge schedule"
              jq -r '.message // .' < "${BODY}" | head -c 500
              rm -f "${BODY}"
              exit 1
              ;;
          esac
          rm -f "${BODY}"
          {
            echo "## Auto-promote PR opened"
            echo
            echo "- Source: staging at \`${TARGET_SHA:0:8}\`"
            echo "- PR: #${PR_NUM}"
            echo "- Outcome: \`${MERGE_OUTCOME}\`"
            echo
            if [ "${MERGE_OUTCOME}" = "auto-merge-scheduled" ]; then
              echo "Gitea will auto-merge once Hongming approves and all checks are green. No human action needed beyond approval."
            elif [ "${MERGE_OUTCOME}" = "merged-immediately" ]; then
              echo "Merged immediately. \`publish-workspace-server-image.yml\` will fire naturally on the resulting \`main\` push."
            else
              echo "PR is not auto-merging. Operator may need to bring staging up to date with main, then re-trigger this workflow via workflow_dispatch."
            fi
          } >> "$GITHUB_STEP_SUMMARY"
--- a/.github/workflows/auto-promote-stale-alarm.yml
+++ b/.github/workflows/auto-promote-stale-alarm.yml
@ -1,83 +0,0 @@
 name: auto-promote-stale-alarm
 # Hourly cron + on-demand alarm for the silent-block failure mode that
 # motivated issue #2975:
 #   - The auto-promote-staging.yml workflow opened a PR + armed
 #     auto-merge, but main's branch protection requires a human review
 #     (reviewDecision=REVIEW_REQUIRED). The PR sat BLOCKED with no
 #     surface-up-the-stack for 12+ hours, holding 25 commits hostage
 #     including the Memory v2 redesign and a reno-stars data-loss fix.
 #
 # This workflow runs `scripts/check-stale-promote-pr.sh` against the
 # repo's open auto-promote PRs (base=main head=staging). When a PR has
 # been BLOCKED on REVIEW_REQUIRED for >4h, it:
 #   1. Emits a workflow-level warning (visible in run summary + the
 #      Actions UI feed).
 #   2. Posts a comment on the PR (idempotent — one alarm per PR).
 #
 # The detection logic lives in scripts/check-stale-promote-pr.sh so
 # it's unit-testable with stubbed `gh` (see test-check-stale-promote-pr.sh).
 # This file is the schedule + invocation surface only — SSOT for the
 # detector itself.
 on:
  schedule:
    # Hourly. Cheap (one `gh pr list` + jq), and 1h granularity is
    # plenty for a 4h staleness threshold — operators see the alarm
    # within at most 1h of crossing the threshold.
    - cron: "27 * * * *"  # at :27 to dodge the cron herd at :00
  workflow_dispatch:
    inputs:
      stale_hours:
        description: "Hours after which a BLOCKED+REVIEW_REQUIRED PR is stale (default 4)"
        required: false
        default: "4"
      post_comment:
        description: "Post a comment on stale PRs (default true)"
        required: false
        default: "true"
 permissions:
  contents: read
  pull-requests: write  # post comments on stale PRs
 # Serialize so the on-demand and scheduled runs don't double-comment
 # the same PR. cancel-in-progress=false because the script is idempotent
 # (existing comment marker prevents dupes), but a scheduled run firing
 # while a manual one runs would just re-list the same PR set.
 concurrency:
  group: auto-promote-stale-alarm
  cancel-in-progress: false
 jobs:
  scan:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout (need scripts/ only)
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          sparse-checkout: |
            scripts/check-stale-promote-pr.sh
          sparse-checkout-cone-mode: false
      - name: Run stale-PR detector
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          STALE_HOURS: ${{ inputs.stale_hours || '4' }}
          POST_COMMENT: ${{ inputs.post_comment || 'true' }}
        run: |
          # The script's exit code reflects the count of stale PRs.
          # We don't want a stale finding to fail the workflow run —
          # the warning + comment are the signal, the green/red is
          # noise. So convert any non-zero exit to a workflow notice
          # and exit 0.
          set +e
          bash scripts/check-stale-promote-pr.sh
          rc=$?
          set -e
          if [ "$rc" -ne 0 ]; then
            echo "::notice::Stale PR detector found $rc PR(s) needing attention. See warnings above + comments on the PRs."
          fi
          # Always succeed — operator-facing surface is the warning,
          # not the workflow status.
          exit 0
--- a/.github/workflows/auto-sync-canary.yml
+++ b/.github/workflows/auto-sync-canary.yml
@ -1,404 +0,0 @@
 name: Auto-sync canary — AUTO_SYNC_TOKEN rotation drift
 # Synthetic health check for the AUTO_SYNC_TOKEN secret consumed by
 # auto-sync-main-to-staging.yml (PR #66) and publish-workspace-server-image.yml.
 #
 # ============================================================
 # Why this workflow exists
 # ============================================================
 #
 # PR #66 fixed auto-sync (replaced GitHub-era `gh pr create` — which
 # 405s on Gitea's GraphQL endpoint — with a direct git push from the
 # `devops-engineer` persona's `AUTO_SYNC_TOKEN`). Hostile self-review
 # weakest spot #3 of that PR:
 #
 #   "Token rotation silently breaks auto-sync. If AUTO_SYNC_TOKEN is
 #    rotated without updating the repo secret, every push to main
 #    fails red on the auto-sync push step. The workflow surfaces the
 #    failure mode in the step summary (failure mode B in the header),
 #    but there's no proactive monitoring."
 #
 # Detection latency under the status quo: rotation is only caught on
 # the next push to `main`. During quiet periods (no main push for
 # many hours) the staging-superset-of-main invariant silently breaks.
 #
 # This workflow closes the gap: every 6 hours, it fires the auth
 # surface that auto-sync depends on and emits a red workflow status
 # if AUTO_SYNC_TOKEN has drifted out of validity.
 #
 # ============================================================
 # What this checks (Option B — read-only verify)
 # ============================================================
 #
 # 1. `GET /api/v1/user` against Gitea with the token → validates the
 #    token authenticates AND resolves to `devops-engineer` (catches
 #    the case where the token was regenerated under a different
 #    persona by mistake).
 # 2. `GET /api/v1/repos/molecule-ai/molecule-core` with the token →
 #    validates the token has `read:repository` scope on this repo
 #    (the v2 scope contract — see saved memory
 #    `reference_persona_token_v2_scope`).
 # 3. `git push --dry-run` of the current staging SHA back to
 #    `refs/heads/staging` via `https://oauth2:<token>@<gitea>/...`
 #    → validates the EXACT HTTPS basic-auth path that
 #    `actions/checkout` + `git push origin staging` use inside
 #    auto-sync-main-to-staging.yml. NOP by construction (push the
 #    current tip to itself = "Everything up-to-date"); auth is
 #    checked at the smart-protocol handshake BEFORE the empty-diff
 #    computation, so bad token → exit 128 with "Authentication
 #    failed". `git ls-remote` is NOT used here because Gitea
 #    falls back to anonymous read on public repos and would
 #    silently green-light a rotated token.
 #
 # Each step exits non-zero with an actionable error message if it
 # fails. The workflow status itself is the operator-facing surface.
 #
 # ============================================================
 # What this does NOT check (intentional)
 # ============================================================
 #
 # - **Branch-protection authz** (failure mode C in auto-sync header):
 #   would require an actual write to staging. Already monitored by
 #   `branch-protection-drift.yml` daily. Don't duplicate.
 # - **Conflict resolution** (failure mode A): a real conflict is data-
 #   driven, not auth-driven; can't synthesise it without polluting
 #   staging. Already surfaces immediately on the next main push.
 # - **Concurrency** (failure mode D): handled by workflow concurrency
 #   group on auto-sync, not a credential issue.
 #
 # ============================================================
 # Why Option B (read-only) and not the alternatives
 # ============================================================
 #
 # Considered + rejected (see issue #72 for full write-up):
 #
 # - **Option A — full auto-sync on schedule**: every run creates a
 #   no-op merge commit on staging when main hasn't advanced. 4 noise
 #   commits/day. And races the real `push:` trigger when main has
 #   advanced. Rejected.
 #
 # - **Option C — push to dedicated `auto-sync-canary` branch**: would
 #   exercise authz too, but adds branch noise on Gitea AND requires
 #   maintaining a second branch protection (or expanding staging's
 #   whitelist to a junk branch). Authz already covered by
 #   `branch-protection-drift.yml`. Rejected.
 #
 # Prior art for the chosen Option B shape:
 #   - Cloudflare's `/user/tokens/verify` endpoint (read-only auth
 #     probe explicitly designed for credential canaries).
 #   - AWS Secrets Manager rotation Lambda's `testSecret` step (auth
 #     probe before promoting AWSPENDING → AWSCURRENT).
 #   - HashiCorp Vault's `vault token lookup` for renewal canaries.
 #
 # ============================================================
 # Operator runbook — what to do when this workflow goes RED
 # ============================================================
 #
 # 1. **Identify which step failed**:
 #    - Step "Verify token authenticates as devops-engineer" red →
 #      token is invalid OR resolves to wrong persona.
 #    - Step "Verify token has repo read scope" red → token valid but
 #      stripped of `read:repository` scope (or repo perms changed).
 #    - Step "Verify git HTTPS auth path via no-op dry-run push to
 #      staging" red → token rotated/revoked OR Gitea git-HTTPS
 #      surface is broken (rare). Auth check happens on the
 #      smart-protocol handshake, separate from the API path.
 #
 # 2. **Re-issue the token** on the operator host:
 #    ```
 #    ssh root@5.78.80.188 'docker exec --user git molecule-gitea-1 \
 #      gitea admin user generate-access-token \
 #      --username devops-engineer \
 #      --token-name persona-devops-engineer-vN \
 #      --scopes "read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc"'
 #    ```
 #    Update `/etc/molecule-bootstrap/agent-secrets.env` in place
 #    (per `feedback_unified_credentials_file`). The previous token
 #    file lands at `.bak.<date>`.
 #
 # 3. **Update the repo Actions secret** at:
 #    Settings → Secrets and variables → Actions → AUTO_SYNC_TOKEN
 #    Paste the new token. (Don't echo it in chat — but per
 #    `feedback_passwords_in_chat_are_burned`, a paste in a 1:1
 #    Claude session is within trust boundary.)
 #
 # 4. **Re-run this canary** via workflow_dispatch. Confirm GREEN.
 #
 # 5. **Backfill any missed main → staging syncs** by re-running
 #    `auto-sync-main-to-staging.yml` from its workflow_dispatch
 #    surface, OR by pushing an empty commit to main (if you'd
 #    rather force a real trigger).
 #
 # ============================================================
 # Security notes
 # ============================================================
 #
 # - Token usage: read-only (`GET /api/v1/user`, `GET /api/v1/repos/...`,
 #   `git ls-remote`). No write paths. Same blast-radius profile as
 #   `actions/checkout` on a public repo.
 # - The token NEVER appears in logs: every `curl` uses a header
 #   variable, never inline; the `git ls-remote` URL builds the
 #   `oauth2:$TOKEN@host` form into a single env var that's not
 #   echoed. GitHub Actions secret-masking covers anything that does
 #   slip through.
 # - No new token introduced — same `AUTO_SYNC_TOKEN` the workflow
 #   under monitor uses. Per least-privilege we deliberately do NOT
 #   broaden scope for the canary.
 on:
  schedule:
    # Every 6 hours at :17 (offsets the cron herd at :00). Justification
    # from issue #72: cheap to run (~5s wall-clock, no quota), 3h average
    # detection latency, 6h max. 1h would be 24× the runs for marginal
    # benefit; daily would be 6× longer latency and worse than status
    # quo on a quiet-main day.
    - cron: '17 */6 * * *'
  workflow_dispatch:
 # No concurrency group needed — the canary is read-only and idempotent.
 # Two parallel runs (e.g. operator dispatch during a scheduled tick) are
 # harmless: same result, doubled HTTPS calls, no shared state.
 permissions:
  contents: read
 jobs:
  verify-token:
    name: Verify AUTO_SYNC_TOKEN validity
    runs-on: ubuntu-latest
    # 2 min surfaces hangs (Gitea API stall, DNS issue) within one
    # cron interval. Realistic worst case is ~10s: 2 curls + 1 git
    # ls-remote, each capped by the explicit timeouts below.
    timeout-minutes: 2
    env:
      # Pinned in env so individual steps can read it without
      # repeating the secret reference. GitHub masks the value in
      # logs automatically.
      AUTO_SYNC_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
      # MUST stay in sync with auto-sync-main-to-staging.yml's
      # `git config user.name "devops-engineer"` line. Renaming the
      # devops-engineer persona requires updating both files (and
      # the staging branch protection's `push_whitelist_usernames`).
      EXPECTED_PERSONA: devops-engineer
      GITEA_HOST: git.moleculesai.app
      REPO_PATH: molecule-ai/molecule-core
    steps:
      - name: Verify AUTO_SYNC_TOKEN secret is configured
        # Schedule-vs-dispatch behaviour split, per
        # `feedback_schedule_vs_dispatch_secrets_hardening`:
        #
        #   - schedule: hard-fail when the secret is missing. The
        #     whole point of the canary is to surface drift; soft-
        #     skipping on missing-secret would make the canary
        #     itself drift-invisible (sweep-cf-orphans #2088 lesson).
        #   - workflow_dispatch: hard-fail too — there's no scenario
        #     where an operator wants this canary to silently no-op.
        #     The workflow has no other ad-hoc utility; if you ran
        #     it, you wanted the answer.
        run: |
          if [ -z "${AUTO_SYNC_TOKEN}" ]; then
            echo "::error::AUTO_SYNC_TOKEN secret is not set on this repo." >&2
            echo "::error::Set it at Settings → Secrets and variables → Actions." >&2
            echo "::error::Without it, auto-sync-main-to-staging.yml will fail every push to main." >&2
            exit 1
          fi
          echo "AUTO_SYNC_TOKEN is configured (value masked)."
      - name: Verify token authenticates as ${{ env.EXPECTED_PERSONA }}
        # Calls Gitea's `/api/v1/user` — the canonical
        # auth-probe-with-no-side-effects endpoint (mirrors
        # Cloudflare's /user/tokens/verify).
        #
        # Failure surfaces:
        #   - HTTP 401: token invalid (rotated, revoked, or never
        #     correctly registered).
        #   - HTTP 200 but username != devops-engineer: token was
        #     regenerated under the wrong persona — this would let
        #     auth pass but commit attribution would be wrong, and
        #     branch-protection authz would fail because only
        #     `devops-engineer` is whitelisted.
        run: |
          set -euo pipefail
          response_file="$(mktemp)"
          code_file="$(mktemp)"
          # `--max-time 30`: full call ceiling. `--connect-timeout 10`:
          # DNS + TCP. `-w "%{http_code}"` routed to a tempfile so curl's
          # exit code can't pollute the captured status — see
          # feedback_curl_status_capture_pollution + the
          # `lint-curl-status-capture.yml` gate that rejects the unsafe
          # `$(curl ... || echo "000")` shape.
          set +e
          curl -sS -o "$response_file" \
            --max-time 30 --connect-timeout 10 \
            -w "%{http_code}" \
            -H "Authorization: token ${AUTO_SYNC_TOKEN}" \
            -H "Accept: application/json" \
            "https://${GITEA_HOST}/api/v1/user" >"$code_file" 2>/dev/null
          set -e
          status=$(cat "$code_file" 2>/dev/null || true)
          [ -z "$status" ] && status="000"
          if [ "$status" != "200" ]; then
            echo "::error::Token rotation suspected: GET /api/v1/user returned HTTP $status (expected 200)." >&2
            echo "::error::Likely cause: AUTO_SYNC_TOKEN has been rotated/revoked on Gitea but the repo Actions secret was not updated." >&2
            echo "::error::Runbook: see header comment of this workflow file." >&2
            # Print response body but redact anything that looks like a token.
            sed -E 's/[A-Fa-f0-9]{32,}/<redacted>/g' "$response_file" >&2 || true
            exit 1
          fi
          username=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('login',''))" "$response_file")
          if [ "$username" != "${EXPECTED_PERSONA}" ]; then
            echo "::error::Token resolves to user '$username', expected '${EXPECTED_PERSONA}'." >&2
            echo "::error::AUTO_SYNC_TOKEN must be the devops-engineer persona PAT (not founder PAT, not another persona)." >&2
            echo "::error::Auto-sync push will fail because only 'devops-engineer' is whitelisted on staging branch protection." >&2
            exit 1
          fi
          echo "Token authenticates as: $username ✓"
      - name: Verify token has repo read scope
        # `GET /api/v1/repos/<owner>/<repo>` requires `read:repository`
        # on the persona's v2 scope contract. If the scope was
        # narrowed/dropped on rotation we catch it here, before the
        # next main push reveals it via a checkout failure.
        run: |
          set -euo pipefail
          response_file="$(mktemp)"
          code_file="$(mktemp)"
          # See first probe step for the rationale on the tempfile-routed
          # `-w "%{http_code}"` pattern — the unsafe `|| echo "000"` shape
          # is rejected by lint-curl-status-capture.yml.
          set +e
          curl -sS -o "$response_file" \
            --max-time 30 --connect-timeout 10 \
            -w "%{http_code}" \
            -H "Authorization: token ${AUTO_SYNC_TOKEN}" \
            -H "Accept: application/json" \
            "https://${GITEA_HOST}/api/v1/repos/${REPO_PATH}" >"$code_file" 2>/dev/null
          set -e
          status=$(cat "$code_file" 2>/dev/null || true)
          [ -z "$status" ] && status="000"
          if [ "$status" != "200" ]; then
            echo "::error::Token lacks read:repository scope on ${REPO_PATH}: HTTP $status." >&2
            echo "::error::Auto-sync's actions/checkout step will fail with this token." >&2
            echo "::error::Re-issue with v2 scope contract: read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc" >&2
            sed -E 's/[A-Fa-f0-9]{32,}/<redacted>/g' "$response_file" >&2 || true
            exit 1
          fi
          echo "Token has read:repository on ${REPO_PATH} ✓"
      - name: Verify git HTTPS auth path via no-op dry-run push to staging
        # Final probe: exercise the EXACT auth path that
        # `actions/checkout` + `git push origin staging` use in
        # auto-sync-main-to-staging.yml. Gitea's API and git-HTTPS
        # surfaces share the token-lookup code path internally but
        # the wire-level error shapes differ — historically (#173)
        # the API path was healthy while git-HTTPS rejected, so
        # checking only the API would have given false-green.
        #
        # IMPORTANT: `git ls-remote` on a public repo (which
        # molecule-core is) succeeds even with a junk token because
        # Gitea falls back to anonymous-read. `ls-remote` therefore
        # CANNOT validate auth on this surface. We use
        # `git push --dry-run` instead — push is auth-gated even on
        # public repos.
        #
        # NOP shape: read the current staging SHA via authenticated
        # ls-remote (the SHA itself is public; auth is incidental
        # here, used only to colocate the discovery in one step), then
        # `git push --dry-run <SHA>:refs/heads/staging`. Pushing the
        # current tip back to itself is "Everything up-to-date" with
        # exit 0 when auth succeeds. With a bad token Gitea returns
        # HTTP 401 in the smart-protocol handshake and git exits 128
        # with "Authentication failed".
        #
        # The dry-run never reaches Gitea's pre-receive hook (which
        # is where branch-protection authz runs), so this probe does
        # not validate failure mode C. That's intentional —
        # branch-protection-drift.yml owns authz monitoring; this
        # canary owns auth.
        env:
          # Don't hang waiting for password prompt if auth fails on a
          # terminal-attached run. (In Actions there's no terminal,
          # but the env-var hardens against an interactive runner
          # config.)
          GIT_TERMINAL_PROMPT: "0"
        run: |
          set -euo pipefail
          # Token is in $AUTO_SYNC_TOKEN (job-level env). Compose the
          # URL as a local var that's never echoed.
          url="https://oauth2:${AUTO_SYNC_TOKEN}@${GITEA_HOST}/${REPO_PATH}"
          # Step a: read current staging SHA. ~1KB; auth-gated only
          # on private repos but always works on public — used here
          # only to discover the SHA, not to validate auth.
          staging_ref=$(timeout 30s git ls-remote --refs "$url" refs/heads/staging 2>&1) || {
            redacted=$(echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g")
            echo "::error::ls-remote against staging failed (network/DNS issue):" >&2
            echo "$redacted" >&2
            exit 1
          }
          if ! echo "$staging_ref" | grep -qE '^[0-9a-f]{40}[[:space:]]+refs/heads/staging$'; then
            echo "::error::ls-remote returned unexpected shape:" >&2
            echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g" >&2
            exit 1
          fi
          staging_sha=$(echo "$staging_ref" | awk '{print $1}')
          # Step b: spin up an ephemeral local repo. `git push` always
          # requires a local repo even when pushing a remote SHA that
          # isn't in the local object DB (the protocol negotiates and
          # discovers we don't need to send any objects). We don't use
          # `actions/checkout` for this — it would clone the whole
          # repo (~hundreds of MB) for what's essentially `git init`.
          tmp_repo="$(mktemp -d)"
          trap 'rm -rf "$tmp_repo"' EXIT
          git -C "$tmp_repo" init -q
          # Author config required for any git operation; values are
          # arbitrary because nothing gets committed here.
          git -C "$tmp_repo" config user.email canary@auto-sync.local
          git -C "$tmp_repo" config user.name auto-sync-canary
          # Step c: dry-run push the current staging SHA back to
          # staging. NOP by construction — the remote tip equals the
          # SHA we're pushing, so "Everything up-to-date" is the
          # success path.
          #
          # Authentication is checked at the smart-protocol handshake,
          # BEFORE the dry-run can compute an empty diff. Bad token
          # → "Authentication failed", exit 128. Good token → exit 0.
          set +e
          push_out=$(timeout 30s git -C "$tmp_repo" push --dry-run "$url" "${staging_sha}:refs/heads/staging" 2>&1)
          push_rc=$?
          set -e
          if [ "$push_rc" -ne 0 ]; then
            redacted=$(echo "$push_out" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g")
            echo "::error::Token rotation suspected: git push --dry-run against staging failed via the AUTO_SYNC_TOKEN HTTPS auth path (exit $push_rc)." >&2
            echo "::error::This is the EXACT auth path that actions/checkout + git push use in auto-sync-main-to-staging.yml." >&2
            echo "::error::Likely cause: AUTO_SYNC_TOKEN was rotated/revoked on Gitea but the repo Actions secret was not updated. Runbook: see header." >&2
            echo "$redacted" >&2
            exit 1
          fi
          echo "git HTTPS auth path: NOP push --dry-run to staging → ${staging_sha:0:8} ✓"
      - name: Summarise canary result
        # Everything passed — surface a green summary. (Failures
        # already wrote ::error:: lines and exited above; if we got
        # here, all three probes passed.)
        run: |
          {
            echo "## Auto-sync canary: GREEN"
            echo ""
            echo "AUTO_SYNC_TOKEN is healthy:"
            echo "- Authenticates as \`${EXPECTED_PERSONA}\` ✓"
            echo "- Has \`read:repository\` scope on \`${REPO_PATH}\` ✓"
            echo "- Git HTTPS auth path: no-op dry-run push to \`refs/heads/staging\` succeeds ✓"
            echo ""
            echo "Auto-sync main → staging will succeed on the next push to main."
            echo "If this canary ever goes RED, see the runbook in this workflow's header."
          } >> "$GITHUB_STEP_SUMMARY"
--- a/.github/workflows/auto-sync-main-to-staging.yml
+++ b/.github/workflows/auto-sync-main-to-staging.yml
@ -1,255 +0,0 @@
 name: Auto-sync main → staging
 # Reflects every push to `main` back onto `staging` so the
 # staging-as-superset-of-main invariant holds.
 #
 # ============================================================
 # What this workflow does
 # ============================================================
 #
 # On every push to `main`:
 #   1. Checks if staging already contains main → no-op.
 #   2. Fetches both branches, merges main into staging in the
 #      runner workspace (fast-forward if possible, else
 #      `--no-ff` merge commit).
 #   3. Pushes staging directly to origin via the
 #      `devops-engineer` persona's `AUTO_SYNC_TOKEN`.
 #
 # Authoritative path: a single `git push origin staging` from
 # inside this workflow is the SSOT for advancing staging after
 # a main push. No PR, no merge queue, no human approval —
 # staging is mechanically maintained as a superset of main.
 #
 # `auto-promote-staging.yml` is the reverse-direction
 # counterpart (staging → main, gated on green CI). Together
 # they keep the staging-superset-of-main invariant tight.
 #
 # ============================================================
 # Why direct push (and not "open a PR")
 # ============================================================
 #
 # Pre-2026-05-06 the canonical SCM was GitHub.com, where:
 #   - The `staging` branch had a `merge_queue` ruleset that
 #     blocked ALL direct pushes (no bypass even for org
 #     admins or the GitHub Actions integration).
 #   - Therefore this workflow opened a PR via `gh pr create`
 #     and let auto-merge land it through the queue.
 #
 # Post-2026-05-06 the canonical SCM is Gitea
 # (`git.moleculesai.app/molecule-ai/molecule-core`). Gitea:
 #   - Has no `merge_queue` concept.
 #   - Allows direct push to protected branches via per-user
 #     `push_whitelist_usernames` on the branch protection.
 #   - Does not expose a GraphQL endpoint, so `gh pr create`
 #     returns `HTTP 405 Method Not Allowed
 #     (https://git.moleculesai.app/api/graphql)` — the
 #     pre-suspension architecture cannot work on Gitea.
 #
 # The molecule-ai/molecule-core staging branch protection
 # (verified via `GET /api/v1/repos/.../branch_protections`)
 # whitelists `devops-engineer` for direct push. So the
 # correct Gitea-shape architecture is: authenticate as
 # `devops-engineer`, merge locally, push staging directly.
 #
 # This is structurally simpler than the GitHub-era PR dance
 # and removes the dependence on `gh` CLI / GraphQL entirely.
 #
 # ============================================================
 # Identity + token (anti-bot-ring per saved-memory
 # `feedback_per_agent_gitea_identity_default`)
 # ============================================================
 #
 # This workflow uses `secrets.AUTO_SYNC_TOKEN`, which is a
 # personal access token issued to the `devops-engineer`
 # persona on Gitea — NOT the founder PAT. The bot-ring
 # fingerprint that triggered the GitHub org suspension on
 # 2026-05-06 was characterised by founder PAT acting as CI
 # at machine speed; per-persona identities split the
 # attribution honestly.
 #
 # Token scope on Gitea: repo write. Push target restricted
 # to `staging` (this workflow is the only writer; main is
 # untouched). Compromise blast radius: bounded to staging
 # branch + this repo's read surface.
 #
 # Commits are authored by the persona email
 # `devops-engineer@agents.moleculesai.app` so commit history
 # reflects which automation produced the merge.
 #
 # ============================================================
 # Failure modes & operational notes
 # ============================================================
 #
 # A — staging has commits main doesn't, and the merge
 #     conflicts:
 #     - The `--no-ff` merge step exits non-zero. Workflow
 #       fails red. Operator (devops-engineer or human)
 #       resolves manually:
 #         git fetch origin
 #         git checkout staging
 #         git merge --no-ff origin/main
 #         # resolve conflicts
 #         git push origin staging
 #     - Step summary surfaces the conflict so the failed run
 #       is self-explanatory.
 #
 # B — `AUTO_SYNC_TOKEN` rotated / wrong scope:
 #     - `git push` step exits non-zero with `HTTP 401` /
 #       `403`. Step summary surfaces the failed push.
 #     - Re-issue the token from `~/.molecule-ai/personas/`
 #       on the operator host and update the repo Actions
 #       secret. Re-run the workflow.
 #
 # C — staging branch protection no longer whitelists
 #     `devops-engineer`:
 #     - `git push` exits non-zero with a Gitea protected-
 #       branch rejection. Step summary surfaces it.
 #     - Re-add `devops-engineer` to
 #       `push_whitelist_usernames` on the staging
 #       protection (Settings → Branches → staging).
 #
 # D — concurrent push to main while a sync is in flight:
 #     - The `concurrency` group below serialises runs.
 #       The second waits for the first; if main advances
 #       again while we're syncing, the second run picks
 #       up the new tip on its own fetch.
 #
 # ============================================================
 # Loop safety
 # ============================================================
 #
 # The push to staging from this workflow does NOT itself
 # fire a `push: branches: [main]` event (different branch),
 # so there's no risk of self-recursion. `auto-promote-staging.yml`
 # fires on `workflow_run` of CI etc. — it sees the new
 # staging tip on its next gate-completion event, NOT on this
 # push directly. No loop.
 on:
  push:
    branches: [main]
  # workflow_dispatch lets operators manually backfill a
  # missed sync (e.g. if AUTO_SYNC_TOKEN was rotated and a
  # main push slipped through while the secret was stale).
  workflow_dispatch:
 permissions:
  contents: write
 concurrency:
  group: auto-sync-main-to-staging
  cancel-in-progress: false
 jobs:
  sync-staging:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout staging (with devops-engineer push token)
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          ref: staging
          # AUTO_SYNC_TOKEN authenticates as the
          # `devops-engineer` Gitea persona — the only
          # identity whitelisted for direct push to
          # staging. See header comment for context.
          token: ${{ secrets.AUTO_SYNC_TOKEN }}
      - name: Configure git author
        run: |
          # Per-persona identity, NOT founder PAT.
          # `feedback_per_agent_gitea_identity_default`.
          git config user.name "devops-engineer"
          git config user.email "devops-engineer@agents.moleculesai.app"
      - name: Check if staging already contains main
        id: check
        run: |
          set -euo pipefail
          git fetch origin main
          if git merge-base --is-ancestor origin/main HEAD; then
            echo "needs_sync=false" >> "$GITHUB_OUTPUT"
            {
              echo "## No-op"
              echo
              echo "staging already contains \`origin/main\` ($(git rev-parse --short=8 origin/main))."
            } >> "$GITHUB_STEP_SUMMARY"
          else
            echo "needs_sync=true" >> "$GITHUB_OUTPUT"
            MAIN_SHORT=$(git rev-parse --short=8 origin/main)
            echo "main_short=${MAIN_SHORT}" >> "$GITHUB_OUTPUT"
            echo "::notice::staging is missing main's tip (${MAIN_SHORT}) — merging in-runner and pushing"
          fi
      - name: Merge main into staging (in-runner)
        if: steps.check.outputs.needs_sync == 'true'
        id: merge
        run: |
          set -euo pipefail
          # Already on staging from checkout. Try fast-forward
          # first (cleanest history); fall back to merge commit
          # if staging has commits main doesn't.
          if git merge --ff-only origin/main; then
            echo "did_ff=true" >> "$GITHUB_OUTPUT"
            echo "::notice::Fast-forwarded staging to origin/main"
          else
            echo "did_ff=false" >> "$GITHUB_OUTPUT"
            if ! git merge --no-ff origin/main \
                -m "chore: sync main → staging (auto, ${{ steps.check.outputs.main_short }})"; then
              # Hygiene: leave the work tree clean before failing.
              git merge --abort || true
              {
                echo "## Conflict"
                echo
                echo "Auto-merge \`main → staging\` failed with conflicts."
                echo "A human (or devops-engineer persona) needs to resolve manually:"
                echo
                echo '```'
                echo "git fetch origin"
                echo "git checkout staging"
                echo "git merge --no-ff origin/main"
                echo "# resolve conflicts"
                echo "git push origin staging"
                echo '```'
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
            fi
          fi
      - name: Push staging to origin
        if: steps.check.outputs.needs_sync == 'true'
        run: |
          set -euo pipefail
          # Direct push to staging. devops-engineer persona is
          # whitelisted for direct push on the staging branch
          # protection (Settings → Branches → staging).
          #
          # No --force / --force-with-lease: a fast-forward or
          # legitimate merge commit on top of current staging
          # is the only thing we'd ever push. If origin/staging
          # advanced under us (concurrent merge), the push
          # legitimately rejects and the next run picks up the
          # new state.
          if ! git push origin staging; then
            {
              echo "## Push rejected"
              echo
              echo "Direct push to \`staging\` failed. Likely causes:"
              echo "- \`AUTO_SYNC_TOKEN\` rotated / wrong scope (HTTP 401/403)"
              echo "- \`devops-engineer\` no longer in"
              echo "  \`push_whitelist_usernames\` on the staging"
              echo "  branch protection (HTTP 422)"
              echo "- staging advanced concurrently — re-running this"
              echo "  workflow on the new main tip will pick it up"
            } >> "$GITHUB_STEP_SUMMARY"
            exit 1
          fi
          {
            echo "## Auto-sync succeeded"
            echo
            echo "- staging advanced to: \`$(git rev-parse --short=8 HEAD)\`"
            echo "- main tip: \`${{ steps.check.outputs.main_short }}\`"
            echo "- Strategy: $([ "${{ steps.merge.outputs.did_ff }}" = "true" ] && echo "fast-forward" || echo "merge commit")"
            echo "- Pushed by: \`devops-engineer\` (per-agent persona, anti-bot-ring)"
          } >> "$GITHUB_STEP_SUMMARY"
--- a/.github/workflows/auto-tag-runtime.yml
+++ b/.github/workflows/auto-tag-runtime.yml
@ -1,138 +0,0 @@
 name: auto-tag-runtime
 # Auto-tag runtime releases on every merge to main that touches workspace/.
 # This is the entry point of the runtime CD chain:
 #
 #   merge PR → auto-tag-runtime (this) → publish-runtime → cascade → template
 #   image rebuilds → repull on hosts.
 #
 # Default bump is patch. Override via PR label `release:minor` or
 # `release:major` BEFORE merging — the label is read off the merged PR
 # associated with the push commit.
 #
 # Skips when:
 #   - The push isn't to main (other branches don't auto-release).
 #   - The merge commit message contains `[skip-release]` (escape hatch
 #     for cleanup PRs that touch workspace/ but shouldn't ship).
 on:
  push:
    branches: [main]
    paths:
      - "workspace/**"
      - "scripts/build_runtime_package.py"
      - ".github/workflows/auto-tag-runtime.yml"
      - ".github/workflows/publish-runtime.yml"
 permissions:
  contents: write    # to push the new tag
  pull-requests: read # to read labels off the merged PR
 concurrency:
  # Serialize tag bumps so two near-simultaneous merges can't both think
  # they're 0.1.6 and race to push the same tag.
  group: auto-tag-runtime
  cancel-in-progress: false
 jobs:
  tag:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0    # need full tag history for `git describe` / sort
      - name: Skip when commit asks
        id: skip
        run: |
          MSG=$(git log -1 --format=%B "${{ github.sha }}")
          if echo "$MSG" | grep -qiE '\[skip-release\]|\[no-release\]'; then
            echo "skip=true" >> "$GITHUB_OUTPUT"
            echo "Commit message contains [skip-release] — no tag will be created."
          else
            echo "skip=false" >> "$GITHUB_OUTPUT"
          fi
      - name: Determine bump kind from PR label
        id: bump
        if: steps.skip.outputs.skip != 'true'
        env:
          # Gitea-shape token (act_runner forwards GITHUB_TOKEN as a
          # short-lived per-run secret with read access to this repo).
          # We hit `/api/v1/repos/.../pulls?state=closed` directly
          # because `gh pr list` calls Gitea's GraphQL endpoint, which
          # returns HTTP 405 (issue #75 / post-#66 sweep).
          GITEA_TOKEN: ${{ github.token }}
          REPO: ${{ github.repository }}
          GITEA_API_URL: ${{ github.server_url }}/api/v1
          PUSH_SHA: ${{ github.sha }}
        run: |
          # Find the merged PR whose merge_commit_sha matches this push.
          # Gitea's `/repos/{owner}/{repo}/pulls?state=closed` returns
          # PRs sorted newest-first; we paginate up to 50 and jq-filter
          # on `merge_commit_sha == PUSH_SHA`. Bounded — auto-tag fires
          # per push to main, so the matching PR is always among the
          # most recent closures. 50 is comfortably more than the
          # ~10-20 staging→main promotes that close in any reasonable
          # window.
          set -euo pipefail
          PRS_JSON=$(curl --fail-with-body -sS \
            -H "Authorization: token ${GITEA_TOKEN}" \
            -H "Accept: application/json" \
            "${GITEA_API_URL}/repos/${REPO}/pulls?state=closed&sort=newest&limit=50" \
            2>/dev/null || echo "[]")
          PR=$(printf '%s' "$PRS_JSON" \
            | jq -c --arg sha "$PUSH_SHA" \
                '[.[] | select(.merged_at != null and .merge_commit_sha == $sha)] | .[0] // empty')
          if [ -z "$PR" ] || [ "$PR" = "null" ]; then
            echo "No merged PR found for ${PUSH_SHA} — defaulting to patch bump."
            echo "kind=patch" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          # Gitea returns labels under `.labels[].name`, same shape as
          # GitHub's REST. The previous `gh pr list --json number,labels`
          # output was identical; jq filter unchanged.
          LABELS=$(printf '%s' "$PR" | jq -r '.labels[]?.name // empty')
          if echo "$LABELS" | grep -qx 'release:major'; then
            echo "kind=major" >> "$GITHUB_OUTPUT"
          elif echo "$LABELS" | grep -qx 'release:minor'; then
            echo "kind=minor" >> "$GITHUB_OUTPUT"
          else
            echo "kind=patch" >> "$GITHUB_OUTPUT"
          fi
      - name: Compute next version from latest runtime-v* tag
        id: version
        if: steps.skip.outputs.skip != 'true'
        run: |
          # Find the highest runtime-vX.Y.Z tag. `sort -V` handles semver
          # ordering; `grep` filters to the right tag prefix.
          LATEST=$(git tag --list 'runtime-v*' | sort -V | tail -1)
          if [ -z "$LATEST" ]; then
            # No prior tag — start the runtime line at 0.1.0.
            CURRENT="0.0.0"
          else
            CURRENT="${LATEST#runtime-v}"
          fi
          MAJOR=$(echo "$CURRENT" | cut -d. -f1)
          MINOR=$(echo "$CURRENT" | cut -d. -f2)
          PATCH=$(echo "$CURRENT" | cut -d. -f3)
          case "${{ steps.bump.outputs.kind }}" in
            major) MAJOR=$((MAJOR+1)); MINOR=0; PATCH=0;;
            minor) MINOR=$((MINOR+1)); PATCH=0;;
            patch) PATCH=$((PATCH+1));;
          esac
          NEW="$MAJOR.$MINOR.$PATCH"
          echo "current=$CURRENT" >> "$GITHUB_OUTPUT"
          echo "new=$NEW" >> "$GITHUB_OUTPUT"
          echo "Bumping runtime $CURRENT → $NEW (${{ steps.bump.outputs.kind }})"
      - name: Push new tag
        if: steps.skip.outputs.skip != 'true'
        run: |
          NEW_TAG="runtime-v${{ steps.version.outputs.new }}"
          git config user.name "github-actions[bot]"
          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
          git tag -a "$NEW_TAG" -m "runtime $NEW_TAG (auto-bump from ${{ steps.bump.outputs.kind }})"
          git push origin "$NEW_TAG"
          echo "Pushed $NEW_TAG — publish-runtime workflow will fire on the tag."
--- a/.github/workflows/branch-protection-drift.yml
+++ b/.github/workflows/branch-protection-drift.yml
@ -1,111 +0,0 @@
 name: branch-protection drift check
 # Catches out-of-band edits to branch protection (UI clicks, manual gh
 # api PATCH from a one-off ops session) by comparing live state against
 # tools/branch-protection/apply.sh's desired state every day. Fails the
 # workflow when they drift; the failure is the signal.
 #
 # When it fails: re-run apply.sh to put the live state back to the
 # script's intent, OR update apply.sh to encode the new intent and
 # commit. Either way the script is the source of truth.
 on:
  schedule:
    # 14:00 UTC daily. Off-hours for most teams; gives a fresh signal
    # at the start of every working day.
    - cron: '0 14 * * *'
  workflow_dispatch:
  pull_request:
    branches: [staging, main]
    paths:
      - 'tools/branch-protection/**'
      - '.github/workflows/**'
      - '.github/workflows/branch-protection-drift.yml'
 permissions:
  contents: read
 jobs:
  drift:
    name: Branch protection drift
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      # Token strategy by trigger:
      #
      # - schedule (daily canary): hard-fail when the admin token is
      #   missing. This is the *only* trigger where silent soft-skip is
      #   dangerous — a missing secret on the cron run means the drift
      #   gate has effectively disappeared with no human in the loop to
      #   notice. Per feedback_schedule_vs_dispatch_secrets_hardening.md
      #   the rule is "schedule/automated triggers must hard-fail".
      #
      # - pull_request (touching tools/branch-protection/**): soft-skip
      #   with a prominent warning. A PR cannot retroactively drift the
      #   live state — drift happens *between* PRs (UI clicks, manual
      #   gh api PATCH) and is the schedule's job to catch. The PR-time
      #   gate would only catch typos in apply.sh, which the apply.sh
      #   *_payload unit tests catch better. A human is reviewing the
      #   PR and will see the warning in the workflow log.
      #
      # - workflow_dispatch (operator one-off): soft-skip with warning,
      #   so an operator can run a diagnostic without configuring the
      #   secret first.
      - name: Verify admin token present (hard-fail on schedule only)
        env:
          GH_TOKEN_FOR_ADMIN_API: ${{ secrets.GH_TOKEN_FOR_ADMIN_API }}
        run: |
          if [[ -n "$GH_TOKEN_FOR_ADMIN_API" ]]; then
            echo "GH_TOKEN_FOR_ADMIN_API present — drift_check will run with admin scope."
            exit 0
          fi
          if [[ "${{ github.event_name }}" == "schedule" ]]; then
            echo "::error::GH_TOKEN_FOR_ADMIN_API secret missing on the daily canary." >&2
            echo "" >&2
            echo "The schedule run is the SoT for branch-protection drift detection." >&2
            echo "Without admin scope it silently passes, hiding any out-of-band edits." >&2
            echo "Set GH_TOKEN_FOR_ADMIN_API at Settings → Secrets and variables → Actions." >&2
            exit 1
          fi
          echo "::warning::GH_TOKEN_FOR_ADMIN_API secret missing — drift_check will be SKIPPED."
          echo "::warning::PR drift checks need repo-admin scope to read /branches/:b/protection."
          echo "::warning::This is non-fatal: the daily schedule run is the canonical drift gate."
          echo "SKIP_DRIFT_CHECK=1" >> "$GITHUB_ENV"
      - name: Run drift check
        if: env.SKIP_DRIFT_CHECK != '1'
        env:
          # Repo-admin scope, needed for /branches/:b/protection.
          GH_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADMIN_API }}
        run: bash tools/branch-protection/drift_check.sh
      # Self-test the parity script before running it on the real
      # workflows — pins the script's classification logic against
      # synthetic safe/unsafe/missing/unsafe-mix/matrix fixtures so a
      # regression in the script can't false-pass on the production
      # workflow audit. Cheap (~0.5s); always runs.
      - name: Self-test check-name parity script
        run: bash tools/branch-protection/test_check_name_parity.sh
      # Check-name parity gate (#144 / saved memory
      # feedback_branch_protection_check_name_parity).
      #
      # drift_check.sh asserts the live branch protection matches what
      # apply.sh would set; check_name_parity.sh closes the orthogonal
      # gap: it asserts every required check name in apply.sh maps to a
      # workflow job whose "always emits this status" shape is intact.
      #
      # The two checks fail in different scenarios:
      #
      #   - drift_check fails → live state was rewritten out-of-band
      #     (UI click, manual PATCH).
      #   - check_name_parity fails → an apply.sh required name has no
      #     emitter, OR the emitting workflow has a top-level paths:
      #     filter without per-step if-gates (the silent-block shape).
      #
      # Cheap (~1s); runs without the admin token because it only reads
      # apply.sh + .github/workflows/ from the checkout.
      - name: Run check-name parity gate
        run: bash tools/branch-protection/check_name_parity.sh
--- a/.github/workflows/canary-staging.yml
+++ b/.github/workflows/canary-staging.yml
@ -20,6 +20,19 @@ on:
    # a few minutes under load — that's fine for a canary.
    - cron: '*/30 * * * *'
  workflow_dispatch:
    inputs:
      keep_on_failure:
        description: >-
          Skip teardown when the canary fails (debugging only). The
          tenant org + EC2 + CF tunnel + DNS stay alive so an operator
          can SSM into the workspace EC2 and capture docker logs of the
          failing claude-code container. REMEMBER to manually delete
          via DELETE /cp/admin/tenants/<slug> when done so the org
          doesn't accumulate cost. Only honored on workflow_dispatch;
          cron runs always tear down (we don't want unattended cron
          to leak resources).
        type: boolean
        default: false
 # Serialise with the full-SaaS workflow so they don't contend for the
 # same org-create quota on staging. Different group key from
@ -80,6 +93,14 @@ jobs:
      # is "Token Plan only" but cheap-per-token and fast.
      E2E_MODEL_SLUG: MiniMax-M2.7-highspeed
      E2E_RUN_ID: "canary-${{ github.run_id }}"
      # Debug-only: when an operator dispatches with keep_on_failure=true,
      # the canary script's E2E_KEEP_ORG=1 path skips teardown so the
      # tenant org + EC2 stay alive for SSM-based log capture. Cron runs
      # never set this (the input only exists on workflow_dispatch) so
      # unattended cron always tears down. See molecule-core#129
      # failure mode #1 — capturing the actual exception requires
      # docker logs from the live container.
      E2E_KEEP_ORG: ${{ github.event.inputs.keep_on_failure == 'true' && '1' || '0' }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@ -137,27 +158,28 @@ jobs:
        id: canary
        run: bash tests/e2e/test_staging_full_saas.sh
-      # Alerting: open an issue only after THREE consecutive failures so
+      # Alerting: open a sticky issue on the FIRST failure; comment on
-      # transient flakes (Cloudflare DNS hiccup, AWS API blip) don't spam
+      # subsequent failures; auto-close on next green. Comment-on-existing
-      # the issue list. If an issue is already open, we still comment on
+      # de-duplicates so a single open issue accumulates the streak —
-      # every failure so ops sees the streak. Auto-close on next green.
+      # ops sees one issue with N comments rather than N issues.
      #
-      # Threshold rationale: canary fires every 30 min, so 3 failures =
+      # Why no consecutive-failures threshold (e.g., wait 3 runs before
-      # ~90 min of consecutive red — well past any single-run flake but
+      # filing): the prior threshold check used
-      # still tight enough that a real outage gets surfaced before the
+      # `github.rest.actions.listWorkflowRuns()` which Gitea 1.22.6 does
-      # next deploy window.
+      # not expose (returns 404). On Gitea Actions the threshold call
      # ALWAYS failed, breaking the entire alerting step and going days
      # silent on real regressions (38h+ chronic red on 2026-05-07/08
      # before this fix; tracked in molecule-core#129). Filing on first
      # failure is also better UX — we want to know about the first red,
      # not wait 90 min for it to "count." Real flakes get one issue +
      # a quick close-on-green; persistent reds accumulate comments.
      - name: Open issue on failure
        if: failure()
        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
        env:
          # Inject the workflow path explicitly — context.workflow is
          # the *name*, not the file path the actions API needs.
          WORKFLOW_PATH: '.github/workflows/canary-staging.yml'
          CONSECUTIVE_THRESHOLD: '3'
        with:
          script: |
            const title = '🔴 Canary failing: staging SaaS smoke';
-            const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
+            const runURL = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
            // Find an existing open canary issue (stable title match).
            // If one exists, this isn't a "first failure" — comment and exit.
@ -177,32 +199,12 @@ jobs:
              return;
            }
-            // No open issue yet — check the last N-1 runs' conclusions.
+            // No open issue yet — file one on this first failure. The
-            // We open the issue only if the last (THRESHOLD-1) runs ALSO
+            // comment-on-existing branch above means subsequent failures
-            // failed (so this is the 3rd consecutive red).
+            // accumulate as comments on this same issue, so we don't
-            const threshold = parseInt(process.env.CONSECUTIVE_THRESHOLD, 10);
+            // spam new issues per run.
            const { data: runs } = await github.rest.actions.listWorkflowRuns({
              owner: context.repo.owner, repo: context.repo.repo,
              workflow_id: process.env.WORKFLOW_PATH,
              status: 'completed',
              per_page: threshold,
              // Skip the current in-progress run; it isn't 'completed' yet.
            });
            // listWorkflowRuns returns recent first. We need (threshold-1)
            // prior failures (current run is the threshold-th).
            const priorFailures = (runs.workflow_runs || [])
              .slice(0, threshold - 1)
              .filter(r => r.id !== context.runId)
              .filter(r => r.conclusion === 'failure')
              .length;
            if (priorFailures < threshold - 1) {
              core.info(`Below threshold: ${priorFailures + 1}/${threshold} consecutive failures — not filing yet`);
              return;
            }
            const body =
-              `Canary run failed at ${new Date().toISOString()}, ` +
+              `Canary run failed at ${new Date().toISOString()}.\n\n` +
              `${threshold} consecutive runs red.\n\n` +
              `Run: ${runURL}\n\n` +
              `This issue auto-closes on the next green canary run. ` +
              `Consecutive failures add a comment here rather than a new issue.`;
@ -211,7 +213,7 @@ jobs:
              title, body,
              labels: ['canary-staging', 'bug'],
            });
-            core.info(`Opened canary failure issue (${threshold} consecutive reds)`);
+            core.info('Opened canary failure issue (first red)');
      - name: Auto-close canary issue on success
        if: success()
--- a/.github/workflows/canary-verify.yml
+++ b/.github/workflows/canary-verify.yml
@ -1,19 +1,34 @@
 name: canary-verify
 # Runs the canary smoke suite against the staging canary tenant fleet
-# after a new :staging-<sha> image lands in GHCR. On green, promotes
+# after a new :staging-<sha> image lands in ECR. On green, calls the
-# :staging-<sha> → :latest so the prod tenant fleet's 5-minute
+# CP redeploy-fleet endpoint to promote :staging-<sha> → :latest so
-# auto-updater picks up the verified digest. On red, :latest stays
+# the prod tenant fleet's 5-minute auto-updater picks up the verified
-# on the prior known-good digest and prod is untouched.
+# digest. On red, :latest stays on the prior known-good digest and
 # prod is untouched.
 #
 # Registry note (2026-05-10): This workflow previously used GHCR
 # (ghcr.io/molecule-ai/platform-tenant) — that registry was retired
 # during the 2026-05-06 Gitea suspension migration when publish-
 # workspace-server-image.yml switched to the operator's ECR org
 # (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/
 # platform-tenant). The GHCR → ECR migration was never applied to
 # this file, so canary-verify was silently smoke-testing the stale
 # GHCR image while the actual staging/prod tenants ran the ECR image.
 # Result: smoke tests could not catch a broken ECR build. Fix:
 #   - Wait step: reads SHA from running canary /health (tenant-
 #     agnostic, works regardless of registry).
 #   - Promote step: calls CP redeploy-fleet endpoint with target_tag=
 #     staging-<sha>, same mechanism as redeploy-tenants-on-main.yml.
 #     No longer attempts GHCR crane ops.
 #
 # Dependencies:
 #   - publish-workspace-server-image.yml publishes :staging-<sha>
-#     (NOT :latest) on main merge
+#     to ECR on staging and main merges.
-#   - canary tenants are configured to pull :staging-<sha> as their
+#   - Canary tenants are configured to pull :staging-<sha> from ECR
-#     tenant image (set TENANT_IMAGE=ghcr.io/…:staging-<sha> on the
+#     (TENANT_IMAGE env set to the ECR :staging-<sha> tag).
 #     canary provisioner code path OR rotate via an admin endpoint)
 #   - Repo secrets CANARY_TENANT_URLS / CANARY_ADMIN_TOKENS /
-#     CANARY_CP_SHARED_SECRET are populated
+#     CANARY_CP_SHARED_SECRET are populated.
 on:
  workflow_run:
@ -27,8 +42,12 @@ permissions:
  actions: read
 env:
-  IMAGE_NAME: ghcr.io/molecule-ai/platform
+  # ECR registry (post-2026-05-06 SSOT for tenant images).
-  TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant
+  # publish-workspace-server-image.yml pushes here.
  IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
  TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
  # CP endpoint for redeploy-fleet (used in promote step below).
  CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
 jobs:
  canary-smoke:
@ -52,6 +71,12 @@ jobs:
        # the new SHA (~2-3 min typical vs 6 min fixed). Falls back to
        # proceeding after 7 min even if not all canaries responded —
        # the smoke suite will catch any that didn't update.
        #
        # NOTE: The SHA is read from the running tenant's /health response,
        # NOT from a registry lookup. This is registry-agnostic and works
        # regardless of whether the tenant pulls from ECR, GHCR, or any
        # other registry — the canary is telling us what it's actually
        # running, which is the ground truth for smoke testing.
        env:
          CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
          EXPECTED_SHA: ${{ steps.compute.outputs.sha }}
@ -133,42 +158,98 @@ jobs:
          } >> "$GITHUB_STEP_SUMMARY"
  promote-to-latest:
-    # On green, retag :staging-<sha> → :latest for BOTH images.
+    # On green, calls the CP redeploy-fleet endpoint with target_tag=
-    # crane is a lightweight registry client (no Docker daemon needed on
+    # staging-<sha> to promote the verified ECR image. This is the same
-    # the runner) that can retag remotely with a single API call each.
+    # mechanism as redeploy-tenants-on-main.yml — no GHCR crane ops.
-    # Gated on smoke_ran=true — without a real canary fleet the smoke
+    #
-    # step no-ops with success, and we don't want that to silently
+    # Pre-fix history: the old GHCR promote step used `crane tag` against
-    # auto-promote every main merge.
+    # ghcr.io/molecule-ai/platform-tenant, but publish-workspace-server-
    # image.yml had already migrated to ECR on 2026-05-07 (commit
    # 10e510f5). The GHCR tags were never updated, so this step was
    # silently promoting a stale GHCR image while actual prod tenants
    # pulled from ECR. Canary smoke tests were GHCR-targeted and could
    # not catch a broken ECR build.
    needs: canary-smoke
    if: ${{ needs.canary-smoke.result == 'success' && needs.canary-smoke.outputs.smoke_ran == 'true' }}
    runs-on: ubuntu-latest
    env:
      SHA: ${{ needs.canary-smoke.outputs.sha }}
      CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
      # CP_ADMIN_API_TOKEN gates write access to the redeploy endpoint.
      # Stored at the repo level so all workflows pick it up automatically.
      CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
      # canary_slug pin: deploy the verified :staging-<sha> to the canary
      # first (soak 120s), then fan out to the rest of the fleet.
      CANARY_SLUG: ${{ vars.CANARY_PROMOTE_SLUG || '' }}
      SOAK_SECONDS: ${{ vars.CANARY_PROMOTE_SOAK || '120' }}
      BATCH_SIZE: ${{ vars.CANARY_PROMOTE_BATCH || '3' }}
    steps:
-      - uses: imjasonh/setup-crane@6da1ae018866400525525ce74ff892880c099987 # v0.5
+      - name: Check CP credentials
      - name: GHCR login
        run: |
-          echo "${{ secrets.GITHUB_TOKEN }}" | \
+          if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
-            crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin
+            echo "::error::CP_ADMIN_API_TOKEN secret is not set — promote step cannot call redeploy-fleet."
            echo "::error::Set it at: repo Settings → Actions → Variables and Secrets → New Secret."
            exit 1
          fi
-      - name: Retag platform :staging-<sha> → :latest
+      - name: Promote verified ECR image to :latest
        run: |
-          crane tag \
+          set -euo pipefail
            "${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \
            latest
-      - name: Retag tenant :staging-<sha> → :latest
+          TARGET_TAG="staging-${SHA}"
-        run: |
+          BODY=$(jq -nc \
-          crane tag \
+            --arg tag "$TARGET_TAG" \
-            "${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \
+            --argjson soak "${SOAK_SECONDS:-120}" \
-            latest
+            --argjson batch "${BATCH_SIZE:-3}" \
            --argjson dry false \
            '{
              target_tag: $tag,
              soak_seconds: $soak,
              batch_size: $batch,
              dry_run: $dry
            }')
          if [ -n "${CANARY_SLUG:-}" ]; then
            BODY=$(jq '. * {canary_slug: $slug}' --arg slug "$CANARY_SLUG" <<<"$BODY")
          fi
          echo "Calling: POST $CP_URL/cp/admin/tenants/redeploy-fleet"
          echo "  target_tag: $TARGET_TAG"
          echo "  body: $BODY"
          HTTP_RESPONSE=$(mktemp)
          HTTP_CODE_FILE=$(mktemp)
          set +e
          curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
            -m 1200 \
            -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
            -H "Content-Type: application/json" \
            -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
            -d "$BODY" >"$HTTP_CODE_FILE"
          CURL_EXIT=$?
          set -e
          HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
          [ -z "$HTTP_CODE" ] && HTTP_CODE="000"
          echo "HTTP $HTTP_CODE (curl exit $CURL_EXIT)"
          cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
          if [ "$HTTP_CODE" -ge 400 ]; then
            echo "::error::CP redeploy-fleet returned HTTP $HTTP_CODE — refusing to proceed."
            exit 1
          fi
      - name: Summary
        run: |
          {
-            echo "## Canary verified — :latest promoted"
+            echo "## Canary verified — :latest promoted via CP redeploy-fleet"
-            echo
+            echo ""
-            echo "- \`${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${IMAGE_NAME}:latest\`"
+            echo "- **Target tag:** \`staging-${{ needs.canary-smoke.outputs.sha }}\`"
-            echo "- \`${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${TENANT_IMAGE_NAME}:latest\`"
+            echo "- **Registry:** ECR (\`${TENANT_IMAGE_NAME}\`)"
-            echo
+            echo "- **Canary slug:** \`${CANARY_SLUG:-<none>}\` (soak ${SOAK_SECONDS}s)"
-            echo "Prod tenant fleet will pick up the new digest on its next 5-min auto-update cycle."
+            echo "- **Batch size:** ${BATCH_SIZE:-3}"
            echo ""
            echo "CP redeploy-fleet is rolling out the verified image across the prod fleet."
            echo "The fleet's 5-minute health-check loop will pick up the update automatically."
          } >> "$GITHUB_STEP_SUMMARY"
--- a/.github/workflows/check-merge-group-trigger.yml
+++ b/.github/workflows/check-merge-group-trigger.yml
@ -1,123 +0,0 @@
 name: Check merge_group trigger on required workflows
 # Pre-merge guard against the deadlock pattern where a workflow whose
 # check is in `required_status_checks` lacks a `merge_group:` trigger.
 # Without it, GitHub merge queue stalls forever in AWAITING_CHECKS
 # because the required check can't fire on `gh-readonly-queue/...` refs.
 #
 # This workflow:
 #   1. Lists required status checks on the branch protection rule for `staging`
 #   2. For each required check, finds the workflow that produces it (by job
 #      name match)
 #   3. Fails if any such workflow lacks `merge_group:` in its triggers
 #
 # Reasoning for staging-only: main has its own CI gating model (PR review),
 # but staging is what the merge queue runs on, so it's the trigger that
 # matters.
 on:
  pull_request:
    paths:
      - '.github/workflows/**.yml'
      - '.github/workflows/**.yaml'
  push:
    branches: [staging, main]
    paths:
      - '.github/workflows/**.yml'
      - '.github/workflows/**.yaml'
  # Self-listen on merge_group so the linter passes its own queue run.
  merge_group:
    types: [checks_requested]
 jobs:
  check:
    name: Required workflows have merge_group trigger
    runs-on: ubuntu-latest
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Verify merge_group trigger on required-check workflows
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
        shell: bash
        run: |
          set -euo pipefail
          # Branch we care about — the one merge queue runs on.
          BRANCH=staging
          # Pull the list of required status check contexts. If the branch
          # has no protection or no required checks, exit clean — nothing
          # to lint.
          REQUIRED=$(gh api "repos/${REPO}/branches/${BRANCH}/protection/required_status_checks" \
            --jq '.contexts[]' 2>/dev/null || true)
          if [ -z "$REQUIRED" ]; then
            echo "No required status checks on ${BRANCH} — nothing to verify."
            exit 0
          fi
          echo "Required checks on ${BRANCH}:"
          echo "${REQUIRED}" | sed 's/^/  - /'
          echo
          # Build a map: workflow file -> set of job names declared in it.
          # We use yq if available, otherwise grep the `name:` lines under
          # `jobs:`. Stick with grep for portability — runner image always
          # has it; yq isn't in the default image as of 2026-04.
          declare -A workflow_jobs
          shopt -s nullglob
          for wf in .github/workflows/*.yml .github/workflows/*.yaml; do
            [ -f "$wf" ] || continue
            # Extract the workflow name (the `name:` at file root).
            wf_name=$(awk '/^name:[[:space:]]/ {sub(/^name:[[:space:]]+/,""); gsub(/^"|"$/,""); print; exit}' "$wf")
            # Extract job step names from the `jobs:` block. A job step is:
            #   - id under `jobs:` (key with 2-space indent followed by colon)
            #   - the `name:` field inside that job (4-space indent)
            # We collect both because required_status_checks contexts can
            # match either, depending on how the workflow was authored.
            jobs_block=$(awk '/^jobs:/{flag=1; next} flag' "$wf")
            job_names=$(echo "$jobs_block" | awk '/^[[:space:]]{4}name:[[:space:]]/ {sub(/^[[:space:]]+name:[[:space:]]+/,""); gsub(/^["'"'"']|["'"'"']$/,""); print}')
            workflow_jobs["$wf"]="${wf_name}"$'\n'"${job_names}"
          done
          # For each required check, find the workflow that produces it.
          # Then verify that workflow lists merge_group as a trigger.
          FAILED=0
          while IFS= read -r check; do
            [ -z "$check" ] && continue
            owning_wf=""
            for wf in "${!workflow_jobs[@]}"; do
              if echo "${workflow_jobs[$wf]}" | grep -Fxq "$check"; then
                owning_wf="$wf"
                break
              fi
            done
            if [ -z "$owning_wf" ]; then
              echo "::warning::Required check '${check}' has no matching workflow in this repo. Skipping (may be from an external app)."
              continue
            fi
            # Does the workflow's trigger list include merge_group?
            # Match either bare `merge_group:` line or merge_group with
            # subsequent indented config (types: [checks_requested]).
            if grep -qE '^[[:space:]]*merge_group:' "$owning_wf"; then
              echo "OK: '${check}' (in $owning_wf) — has merge_group trigger"
            else
              echo "::error file=${owning_wf}::Required check '${check}' is produced by ${owning_wf}, but the workflow does not declare a 'merge_group:' trigger. With merge queue enabled on ${BRANCH}, this will deadlock the queue (every PR sits AWAITING_CHECKS forever). Add this to the workflow's 'on:' block:"
              echo "::error file=${owning_wf}::  merge_group:"
              echo "::error file=${owning_wf}::    types: [checks_requested]"
              FAILED=1
            fi
          done <<< "$REQUIRED"
          if [ "$FAILED" -ne 0 ]; then
            echo
            echo "::error::Block. See errors above. Reference: $(grep -l 'reference_merge_queue' /dev/null 2>/dev/null || echo 'memory: reference_merge_queue_enablement.md')."
            exit 1
          fi
          echo
          echo "All required workflows on ${BRANCH} declare merge_group triggers."
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -304,13 +304,9 @@ jobs:
    needs: [changes, canvas-build]
    # Only fires on direct pushes to main (i.e. after staging→main promotion).
    if: needs.changes.outputs.canvas == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main'
    permissions:
      # Required to post commit comments via the GitHub API.
      contents: write
    steps:
-      - name: Post deploy reminder as commit comment
+      - name: Write deploy reminder to step summary
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COMMIT_SHA: ${{ github.sha }}
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: |
@ -337,10 +333,13 @@ jobs:
          printf '\n> Posted automatically by CI · commit `%s` · [build log](%s)\n' \
            "$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-reminder.md
-          gh api \
+          # Gitea has no commit-comments API (no equivalent of
-            --method POST \
+          # POST /repos/{owner}/{repo}/commits/{commit_sha}/comments).
-            "repos/${{ github.repository }}/commits/${{ github.sha }}/comments" \
+          # Write to GITHUB_STEP_SUMMARY instead — both GitHub Actions and
-            --field "body=@/tmp/deploy-reminder.md"
+          # Gitea Actions render this as the workflow run's summary page,
          # which is where operators look for post-deploy action items.
          # (#75 / PR-D)
          cat /tmp/deploy-reminder.md >> "$GITHUB_STEP_SUMMARY"
  # Python Lint & Test — required check, always runs. See platform-build
  # for the rationale.
@ -366,7 +365,7 @@ jobs:
          cache: pip
          cache-dependency-path: workspace/requirements.txt
      - if: needs.changes.outputs.python == 'true'
-        run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov
+        run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov sqlalchemy>=2.0.0
      # Coverage flags + fail-under floor moved into workspace/pytest.ini
      # (issue #1817) so local `pytest` and CI use identical config.
      - if: needs.changes.outputs.python == 'true'
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@ -1,136 +0,0 @@
 name: CodeQL
 # Stub workflow — CodeQL Action is structurally incompatible with Gitea
 # Actions (post-2026-05-06 SCM migration off GitHub).
 #
 # Why this is a stub, not a real CodeQL run:
 #
 # 1. github/codeql-action/init@v4 hits api.github.com endpoints
 #    (CodeQL CLI bundle download + query-pack registry + telemetry)
 #    that Gitea 1.22.x does NOT proxy. The act_runner has
 #    GITHUB_SERVER_URL=https://git.moleculesai.app correctly set
 #    (per saved memory feedback_act_runner_github_server_url and
 #    /config.yaml on the operator host), but the Gitea API surface
 #    simply does not implement the codeql-action bundle endpoints.
 #    Observed in run 1d/3101 (2026-05-07): "::error::404 page not
 #    found" inside the Initialize CodeQL step, before any analysis.
 #
 # 2. PR #35 attempted to mark `continue-on-error: true` at the JOB
 #    level (correct YAML structure). Gitea 1.22.6 does NOT propagate
 #    job-level continue-on-error to the commit-status API — every
 #    matrix leg still posts `failure` to the status surface, which
 #    keeps OVERALL=failure on every push to main + staging and
 #    blocks visual auto-promote signals (#156).
 #
 # 3. Hongming policy decision (2026-05-07, task #156): CodeQL is
 #    ADVISORY, not blocking, on Gitea Actions. We do not block PR
 #    merge or staging→main promotion on CodeQL findings until we
 #    have a Gitea-compatible static-analysis pipeline.
 #
 # What this stub preserves:
 #
 # - Workflow name `CodeQL` (referenced by auto-promote-staging.yml
 #   line 67 as a workflow_run gate — must stay stable).
 # - Job name template `Analyze (${{ matrix.language }})` and the
 #   3-leg matrix (go, javascript-typescript, python). Branch
 #   protection / required-check parity (#144) keys on these
 #   exact context names.
 # - merge_group + push + pull_request + schedule triggers, so the
 #   merge-queue check name still resolves (per saved memory
 #   feedback_branch_protection_check_name_parity).
 #
 # Re-enabling real analysis (future work):
 #
 # - Option A: self-hosted Semgrep / OpenGrep via a custom action
 #   that doesn't hit api.github.com. Tracked behind #156 follow-up.
 # - Option B: Sonatype Nexus IQ or similar, called from a step
 #   that uses the Gitea-issued token only.
 # - Option C: re-host this workflow on a small GitHub mirror used
 #   ONLY for SAST (push-mirrored from Gitea). Acceptable trade-off
 #   if/when payment is restored on a non-suspended GitHub org —
 #   but per saved memory feedback_no_single_source_of_truth, we
 #   should design for multi-vendor backup, not GitHub-only SAST.
 #
 # Until one of those lands, this stub keeps commit-status green so
 # the auto-promote chain isn't permanently red on a tool we cannot
 # actually run.
 #
 # Security policy: ADVISORY. We accept the residual risk of un-scanned
 # pushes during this window. Compensating controls in place:
 #   - secret-scan.yml runs on every push (active, blocks on hits)
 #   - block-internal-paths.yml blocks forbidden file paths
 #   - lint-curl-status-capture.yml catches one specific class of bug
 #   - branch-protection-drift.yml + the merge_group required-checks
 #     parity keep the gate surface stable
 # These are not equivalent to CodeQL coverage. Status of the
 # replacement plan is tracked in #156.
 on:
  push:
    branches: [main, staging]
  pull_request:
    branches: [main, staging]
  # Required so the matrix legs emit a real result on the queued
  # commit instead of a false-green when merge queue is enabled.
  # Per saved memory feedback_branch_protection_check_name_parity:
  # path-filtered / matrix workflows MUST emit the protected name
  # via a job that always runs.
  merge_group:
    types: [checks_requested]
  schedule:
    # Weekly heartbeat. Cheap on a stub (the no-op job is ~5s) but
    # keeps the workflow visible in Gitea's Actions UI so the next
    # operator notices it's a stub instead of a missing surface.
    - cron: '30 1 * * 0'
 # Workflow-level concurrency: only one stub run per branch/PR at a
 # time. cancel-in-progress: false because a quick follow-up push
 # shouldn't kill an in-flight run — even though the stub is fast,
 # the contract should match a real CodeQL run for when we re-enable.
 concurrency:
  group: codeql-${{ github.ref }}
  cancel-in-progress: false
 permissions:
  actions: read
  contents: read
  # No security-events: write — we don't call the upload API anyway,
  # GHAS isn't on Gitea.
 jobs:
  analyze:
    # Job NAME shape is load-bearing — auto-promote-staging.yml +
    # branch protection both key on `Analyze (${{ matrix.language }})`.
    # Do NOT rename without coordinating both surfaces.
    name: Analyze (${{ matrix.language }})
    runs-on: ubuntu-latest
    timeout-minutes: 5
    strategy:
      fail-fast: false
      matrix:
        language: [go, javascript-typescript, python]
    steps:
      # Single-step stub: log the policy decision + emit success.
      # Exit 0 explicitly so the commit-status API records `success`
      # for each of the three matrix legs.
      - name: CodeQL stub (advisory, non-blocking on Gitea)
        shell: bash
        run: |
          set -euo pipefail
          cat <<EOF
          CodeQL is currently ADVISORY on Gitea Actions (post-2026-05-06).
          Language matrix leg: ${{ matrix.language }}
          Reason: github/codeql-action/init@v4 calls api.github.com
                  bundle endpoints that Gitea 1.22.x does not implement.
                  Observed: "::error::404 page not found" in the Init
                  CodeQL step on every prior run.
          Policy: per Hongming decision 2026-05-07 (#156), CodeQL is
                  non-blocking until a Gitea-compatible SAST pipeline
                  lands. See workflow file header for replacement
                  options + compensating controls.
          Status: emitting success so auto-promote isn't permanently
                  red on a tool we cannot actually run today.
          EOF
          echo "::notice::CodeQL ${{ matrix.language }} — advisory stub, success."
--- a/.github/workflows/e2e-api.yml
+++ b/.github/workflows/e2e-api.yml
@ -51,7 +51,7 @@ name: E2E API Smoke Test
 #   * Pre-pull `alpine:latest` so the platform-server's provisioner
 #     (`internal/handlers/container_files.go`) can stand up its
 #     ephemeral token-write helper without a daemon.io round-trip.
-#   * Create `molecule-monorepo-net` bridge network if missing so the
+#   * Create `molecule-core-net` bridge network if missing so the
 #     provisioner's container.HostConfig {NetworkMode: ...} attach
 #     succeeds.
 # Item #1 (timeouts) — evidence on recent runs (77/3191, ae/4270, 0e/
@ -163,12 +163,12 @@ jobs:
          # when the image is already present.
          docker pull alpine:latest >/dev/null
          # Provisioner attaches workspace containers to
-          # molecule-monorepo-net (workspace-server/internal/provisioner/
+          # molecule-core-net (workspace-server/internal/provisioner/
          # provisioner.go::DefaultNetwork). The bridge already exists on
          # the operator host's docker daemon — `network create` is
          # idempotent via `|| true`.
-          docker network create molecule-monorepo-net >/dev/null 2>&1 || true
+          docker network create molecule-core-net >/dev/null 2>&1 || true
-          echo "alpine:latest pre-pulled; molecule-monorepo-net ensured."
+          echo "alpine:latest pre-pulled; molecule-core-net ensured."
      - name: Start Postgres (docker)
        if: needs.detect-changes.outputs.api == 'true'
        run: |
--- a/.github/workflows/handlers-postgres-integration.yml
+++ b/.github/workflows/handlers-postgres-integration.yml
@ -34,7 +34,7 @@ name: Handlers Postgres Integration
 # So we sidestep `services:` entirely. The job container still uses
 # host-net (inherited from runner config; required for cache server
 # discovery on the bridge IP 172.18.0.17:42631). We launch a sibling
-# postgres on the existing `molecule-monorepo-net` bridge with a
+# postgres on the existing `molecule-core-net` bridge with a
 # UNIQUE name per run — `pg-handlers-${RUN_ID}-${RUN_ATTEMPT}` — and
 # read its bridge IP via `docker inspect`. A host-net job container
 # can reach a bridge-net container directly via the bridge IP (verified
@ -44,7 +44,7 @@ name: Handlers Postgres Integration
 #   + No host-port collision; N parallel runs share the bridge cleanly
 #   + `if: always()` cleanup runs even on test-step failure
 #   - One more step in the workflow (+~3 lines)
-#   - Requires `molecule-monorepo-net` to exist on the operator host
+#   - Requires `molecule-core-net` to exist on the operator host
 #     (it does; declared in docker-compose.yml + docker-compose.infra.yml)
 #
 # Class B Hongming-owned CICD red sweep, 2026-05-08.
@ -96,7 +96,7 @@ jobs:
      PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
      # Bridge network already exists on the operator host (declared
      # in docker-compose.yml + docker-compose.infra.yml).
-      PG_NETWORK: molecule-monorepo-net
+      PG_NETWORK: molecule-core-net
    defaults:
      run:
        working-directory: workspace-server
--- a/.github/workflows/harness-replays.yml
+++ b/.github/workflows/harness-replays.yml
@ -56,21 +56,40 @@ jobs:
      run: ${{ steps.decide.outputs.run }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
        id: filter
        with:
          filters: |
            run:
              - 'workspace-server/**'
              - 'canvas/**'
              - 'tests/harness/**'
              - '.github/workflows/harness-replays.yml'
      - id: decide
        run: |
          # workflow_dispatch: always run (manual trigger)
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            echo "run=true" >> "$GITHUB_OUTPUT"
            echo "debug=manual-trigger" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          # Determine the base commit to diff against.
          # For pull_request: use base.sha (the merge-base with main/staging).
          # For push: use github.event.before (the previous tip of the branch).
          # Fallback for new branches (all-zeros SHA): run everything.
          if [ "${{ github.event_name }}" = "pull_request" ] && \
             [ -n "${{ github.event.pull_request.base.sha }}" ]; then
            BASE="${{ github.event.pull_request.base.sha }}"
          elif [ -n "${{ github.event.before }}" ] && \
               ! echo "${{ github.event.before }}" | grep -qE '^0+$'; then
            BASE="${{ github.event.before }}"
          else
-            echo "run=${{ steps.filter.outputs.run }}" >> "$GITHUB_OUTPUT"
+            # New branch or github.event.before unavailable — run everything.
            echo "run=true" >> "$GITHUB_OUTPUT"
            echo "debug=new-branch-fallback" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          # GitHub Actions and Gitea Actions both expose github.sha for HEAD.
          DIFF=$(git diff --name-only "$BASE" "${{ github.sha }}" 2>/dev/null)
          echo "debug=diff-base=$BASE diff-files=$DIFF" >> "$GITHUB_OUTPUT"
          if echo "$DIFF" | grep -qE '^workspace-server/|^canvas/|^tests/harness/|^.github/workflows/harness-replays\.yml$'; then
            echo "run=true" >> "$GITHUB_OUTPUT"
          else
            echo "run=false" >> "$GITHUB_OUTPUT"
          fi
  # ONE job that always runs. Real work is gated per-step on
@ -91,10 +110,17 @@ jobs:
        run: |
          echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
          echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."
          echo "::notice::Debug: ${{ needs.detect-changes.outputs.debug }}"
      - if: needs.detect-changes.outputs.run == 'true'
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      # Log what files were detected so future failures include the diff.
      - name: Log detected changes
        if: needs.detect-changes.outputs.run == 'true'
        run: |
          echo "::notice::detect-changes debug: ${{ needs.detect-changes.outputs.debug }}"
      # github-app-auth sibling-checkout removed 2026-05-07 (#157):
      # the plugin was dropped + Dockerfile.tenant no longer COPYs it.
@ -119,6 +145,17 @@ jobs:
      # symptom, different root cause: staging still has the in-image
      # clone path, hits the auth error directly).
      #
      # 2026-05-08 sub-finding (#192): the clone step ALSO fails when
      # any referenced workspace-template repo is private and the
      # AUTO_SYNC_TOKEN bearer (devops-engineer persona) lacks read
      # access. Root cause: 5 of 9 workspace-template repos
      # (openclaw, codex, crewai, deepagents, gemini-cli) had been
      # marked private with no team grant. Resolution: flipped them
      # to public per `feedback_oss_first_repo_visibility_default`
      # (the OSS surface should be public). Layer-3 (customer-private +
      # marketplace third-party repos) tracked separately in
      # internal#102.
      #
      # Token shape matches publish-workspace-server-image.yml: AUTO_SYNC_TOKEN
      # is the devops-engineer persona PAT, NOT the founder PAT (per
      # `feedback_per_agent_gitea_identity_default`). clone-manifest.sh
--- a/.github/workflows/pr-guards.yml
+++ b/.github/workflows/pr-guards.yml
@ -1,63 +0,0 @@
 name: pr-guards
 # PR-time guards. Today the only guard is "disable auto-merge when a
 # new commit is pushed after auto-merge was enabled" — added 2026-04-27
 # after PR #2174 auto-merged with only its first commit because the
 # second commit was pushed after the merge queue had locked the PR's
 # SHA.
 #
 # Why this is inlined (not delegated to molecule-ci's reusable
 # workflow): the reusable workflow uses `gh pr merge --disable-auto`,
 # which calls GitHub's GraphQL API. Gitea has no GraphQL endpoint and
 # returns HTTP 405 on /api/graphql, so the job failed on every Gitea
 # PR push since the 2026-05-06 migration. Gitea also has no `--auto`
 # merge primitive that this job could be acting on, so the right
 # behaviour on Gitea is "no-op + green status" — not a 405.
 #
 # Inlining (vs. an `if:` on the `uses:` line) keeps the job ALWAYS
 # running, which matters for branch protection: required-check names
 # need a job that emits SUCCESS terminal state, not SKIPPED. See
 # `feedback_branch_protection_check_name_parity` and `feedback_pr_merge_safety_guards`.
 #
 # Issue #88 item 1.
 on:
  pull_request:
    types: [synchronize]
 permissions:
  pull-requests: write
 jobs:
  disable-auto-merge-on-push:
    runs-on: ubuntu-latest
    steps:
      # Detect Gitea Actions. act_runner sets GITEA_ACTIONS=true in the
      # step env on every job. Belt-and-suspenders: also check the repo
      # url's host, which is independent of any runner-side env config
      # (covers a future Gitea host where the env var is forgotten).
      - name: Detect runner host
        id: host
        run: |
          if [[ "${GITEA_ACTIONS:-}" == "true" ]] || [[ "${{ github.server_url }}" == *moleculesai.app* ]] || [[ "${{ github.event.repository.html_url }}" == *moleculesai.app* ]]; then
            echo "is_gitea=true" >> "$GITHUB_OUTPUT"
            echo "::notice::Gitea Actions detected — auto-merge gating is not applicable here (Gitea has no --auto merge primitive). Job will no-op."
          else
            echo "is_gitea=false" >> "$GITHUB_OUTPUT"
          fi
      - name: Disable auto-merge (GitHub only)
        if: steps.host.outputs.is_gitea != 'true'
        env:
          GH_TOKEN: ${{ github.token }}
          PR: ${{ github.event.pull_request.number }}
          REPO: ${{ github.repository }}
          NEW_SHA: ${{ github.sha }}
        run: |
          set -eu
          gh pr merge "$PR" --disable-auto -R "$REPO" || true
          gh pr comment "$PR" -R "$REPO" --body "🔒 Auto-merge disabled — new commit (\`${NEW_SHA:0:7}\`) pushed after auto-merge was enabled. The merge queue locks SHAs at entry, so subsequent pushes can race. Verify the new commit and re-enable with \`gh pr merge --auto\`."
      - name: Gitea no-op
        if: steps.host.outputs.is_gitea == 'true'
        run: echo "Gitea Actions — auto-merge gating not applicable; no-op (job intentionally green so branch protection's required-check name lands SUCCESS)."
--- a/.github/workflows/promote-latest.yml
+++ b/.github/workflows/promote-latest.yml
@ -1,85 +0,0 @@
 name: promote-latest
 # Manually retag ghcr.io/molecule-ai/platform:staging-<sha> →  :latest
 # (and the same for the tenant image). Use this to:
 #
 #   1. Promote a :staging-<sha> to prod before the canary fleet is live
 #      (one-off during the initial rollout).
 #   2. Roll back :latest to a prior known-good digest after a bad
 #      promotion slipped past canary (use scripts/rollback-latest.sh
 #      for a local / emergency path; this workflow is for scheduled
 #      or from-browser promotions).
 #
 # Running this workflow needs no extra secrets — GitHub's default
 # GITHUB_TOKEN has write:packages for repo-owned GHCR images, which
 # is all we need for a remote retag via `crane tag`.
 on:
  workflow_dispatch:
    inputs:
      sha:
        description: 'Short sha to promote (e.g. 4c1d56e). Must match an existing :staging-<sha> tag.'
        required: true
        type: string
 permissions:
  contents: read
  packages: write
 env:
  IMAGE_NAME: ghcr.io/molecule-ai/platform
  TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant
 jobs:
  promote:
    runs-on: ubuntu-latest
    steps:
      - uses: imjasonh/setup-crane@6da1ae018866400525525ce74ff892880c099987 # v0.5
      - name: GHCR login
        run: |
          echo "${{ secrets.GITHUB_TOKEN }}" \
            | crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin
      - name: Retag platform image
        run: |
          set -eu
          SRC="${IMAGE_NAME}:staging-${{ inputs.sha }}"
          if ! crane digest "$SRC" >/dev/null 2>&1; then
            echo "::error::$SRC not found in registry — double-check the sha."
            exit 1
          fi
          EXPECTED=$(crane digest "$SRC")
          crane tag "$SRC" latest
          ACTUAL=$(crane digest "${IMAGE_NAME}:latest")
          if [ "$ACTUAL" != "$EXPECTED" ]; then
            echo "::error::retag digest mismatch (expected $EXPECTED, got $ACTUAL)"
            exit 1
          fi
          echo "OK  ${IMAGE_NAME}:latest → $ACTUAL"
      - name: Retag tenant image
        run: |
          set -eu
          SRC="${TENANT_IMAGE_NAME}:staging-${{ inputs.sha }}"
          if ! crane digest "$SRC" >/dev/null 2>&1; then
            echo "::error::$SRC not found — tenant image may not have built for this sha."
            exit 1
          fi
          EXPECTED=$(crane digest "$SRC")
          crane tag "$SRC" latest
          ACTUAL=$(crane digest "${TENANT_IMAGE_NAME}:latest")
          if [ "$ACTUAL" != "$EXPECTED" ]; then
            echo "::error::tenant retag digest mismatch"
            exit 1
          fi
          echo "OK  ${TENANT_IMAGE_NAME}:latest → $ACTUAL"
      - name: Summary
        run: |
          {
            echo "## :latest promoted to staging-${{ inputs.sha }}"
            echo
            echo "Both platform + tenant images retagged. Prod tenants"
            echo "will auto-pull within their 5-min update cycle."
          } >> "$GITHUB_STEP_SUMMARY"
--- a/.github/workflows/publish-canvas-image.yml
+++ b/.github/workflows/publish-canvas-image.yml
@ -54,6 +54,22 @@ jobs:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
      # Health check: verify Docker daemon is accessible before attempting any
      # build steps. This fails loudly at step 1 when the runner's docker.sock
      # is inaccessible rather than silently continuing to the build step
      # where docker build fails deep in ECR auth with a cryptic error.
      - name: Verify Docker daemon access
        run: |
          set -euo pipefail
          echo "::group::Docker daemon health check"
          docker info 2>&1 | head -5 || {
            echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
            echo "::error::Check: (1) daemon running, (2) runner user in docker group, (3) sock perms 660+"
            exit 1
          }
          echo "Docker daemon OK"
          echo "::endgroup::"
      - name: Compute tags
        id: tags
        shell: bash
--- a/.github/workflows/publish-runtime.yml
+++ b/.github/workflows/publish-runtime.yml
@ -1,436 +0,0 @@
 name: publish-runtime
 # Publishes molecule-ai-workspace-runtime to PyPI from monorepo workspace/.
 # Monorepo workspace/ is the only source-of-truth for runtime code; this
 # workflow is the bridge from monorepo edits to the PyPI artifact that
 # the 8 workspace-template-* repos depend on.
 #
 # Triggered by:
 #   - Pushing a tag matching `runtime-vX.Y.Z` (the version is derived from
 #     the tag — `runtime-v0.1.6` publishes `0.1.6`).
 #   - Manual workflow_dispatch with an explicit `version` input (useful for
 #     dev/test releases without tagging the repo).
 #   - Auto: any push to `staging` that touches `workspace/**`. The version
 #     is derived by querying PyPI for the current latest and bumping the
 #     patch component. This closes the human-in-loop gap that caused the
 #     2026-04-27 RuntimeCapabilities ImportError outage — adapter symbol
 #     additions in workspace/adapters/base.py used to require an operator
 #     to remember to publish; now the merge itself triggers the publish.
 #
 # The workflow:
 #   1. Runs scripts/build_runtime_package.py to copy workspace/ →
 #      build/molecule_runtime/ with imports rewritten (`a2a_client` →
 #      `molecule_runtime.a2a_client`).
 #   2. Builds wheel + sdist with `python -m build`.
 #   3. Publishes to PyPI via the PyPA Trusted Publisher action (OIDC).
 #      No static API token is stored — PyPI verifies the workflow's
 #      OIDC claim against the trusted-publisher config registered for
 #      molecule-ai-workspace-runtime (molecule-ai/molecule-core,
 #      publish-runtime.yml, environment pypi-publish).
 #
 # After publish: the 8 template repos pick up the new version on their
 # next image rebuild (their requirements.txt pin
 # `molecule-ai-workspace-runtime>=0.1.0`, so any new release is eligible).
 # To force-pull immediately, bump the pin in each template repo's
 # requirements.txt and merge — that triggers their own publish-image.yml.
 on:
  push:
    tags:
      - "runtime-v*"
    branches:
      - staging
    paths:
      # Auto-publish when staging gets changes that affect what gets
      # published. Path filter ONLY applies to branch pushes — tag pushes
      # still fire regardless.
      #
      # workspace/** is the source-of-truth for runtime code.
      # scripts/build_runtime_package.py is the build script — changes to
      # it (e.g. a fix to the import rewriter or a manifest emit) directly
      # affect what ships in the wheel even if no workspace/ file changes.
      # The 2026-04-27 lib/ subpackage incident missed an auto-publish for
      # exactly this reason — PR #2174 only changed scripts/ and the
      # operator had to remember a manual dispatch.
      - "workspace/**"
      - "scripts/build_runtime_package.py"
  workflow_dispatch:
    inputs:
      version:
        description: "Version to publish (e.g. 0.1.6). Required for manual dispatch."
        required: true
        type: string
 permissions:
  contents: read
 # Serialize publishes so two staging merges landing seconds apart don't
 # both compute "latest+1" and race on PyPI upload. The second one waits.
 concurrency:
  group: publish-runtime
  cancel-in-progress: false
 jobs:
  publish:
    runs-on: ubuntu-latest
    environment: pypi-publish
    permissions:
      contents: read
      id-token: write   # PyPI Trusted Publisher (OIDC) — no PYPI_TOKEN needed
    outputs:
      version: ${{ steps.version.outputs.version }}
      wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: "3.11"
          cache: pip
      - name: Derive version (tag, manual input, or PyPI auto-bump)
        id: version
        run: |
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            VERSION="${{ inputs.version }}"
          elif echo "$GITHUB_REF_NAME" | grep -q "^runtime-v"; then
            # Tag is `runtime-vX.Y.Z` — strip the prefix.
            VERSION="${GITHUB_REF_NAME#runtime-v}"
          else
            # Auto-publish from staging push. Query PyPI for the current
            # latest and bump the patch component. concurrency: group above
            # serializes parallel staging merges so we don't race on the
            # bump. If PyPI is unreachable, fail loud — better to skip a
            # publish than to overwrite an existing version.
            LATEST=$(curl -fsS --retry 3 https://pypi.org/pypi/molecule-ai-workspace-runtime/json \
              | python -c "import sys,json; print(json.load(sys.stdin)['info']['version'])")
            MAJOR=$(echo "$LATEST" | cut -d. -f1)
            MINOR=$(echo "$LATEST" | cut -d. -f2)
            PATCH=$(echo "$LATEST" | cut -d. -f3)
            VERSION="${MAJOR}.${MINOR}.$((PATCH+1))"
            echo "Auto-bumped from PyPI latest $LATEST -> $VERSION"
          fi
          if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+(\.dev[0-9]+|rc[0-9]+|a[0-9]+|b[0-9]+|\.post[0-9]+)?$'; then
            echo "::error::version $VERSION does not match PEP 440"
            exit 1
          fi
          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
          echo "Publishing molecule-ai-workspace-runtime $VERSION"
      - name: Install build tooling
        run: pip install build twine
      - name: Build package from workspace/
        run: |
          python scripts/build_runtime_package.py \
            --version "${{ steps.version.outputs.version }}" \
            --out "${{ runner.temp }}/runtime-build"
      - name: Build wheel + sdist
        working-directory: ${{ runner.temp }}/runtime-build
        run: python -m build
      - name: Capture wheel SHA256 for cascade content-verification
        # Recorded BEFORE upload so the cascade probe can verify the
        # bytes Fastly serves under the new version's URL match what
        # we built. Closes a hole left by #2197: that probe verified
        # pip can resolve the version (catches propagation lag) but
        # not that the wheel content matches (would silently pass a
        # Fastly stale-content scenario where the new version's URL
        # serves an old wheel binary).
        id: wheel_hash
        working-directory: ${{ runner.temp }}/runtime-build
        run: |
          set -eu
          WHEEL=$(ls dist/*.whl 2>/dev/null | head -1)
          if [ -z "$WHEEL" ]; then
            echo "::error::No .whl in dist/ — `python -m build` must have failed silently"
            exit 1
          fi
          HASH=$(sha256sum "$WHEEL" | awk '{print $1}')
          echo "wheel_sha256=${HASH}" >> "$GITHUB_OUTPUT"
          echo "Local wheel SHA256 (pre-upload): ${HASH}"
          echo "Wheel filename: $(basename "$WHEEL")"
      - name: Verify package contents (sanity)
        working-directory: ${{ runner.temp }}/runtime-build
        # Smoke logic lives in scripts/wheel_smoke.py so the same gate runs
        # at both PR-time (runtime-prbuild-compat.yml) and publish-time
        # (here). Splitting the smoke across two heredocs let them drift
        # apart historically — one script keeps them locked.
        run: |
          python -m twine check dist/*
          python -m venv /tmp/smoke
          /tmp/smoke/bin/pip install --quiet dist/*.whl
          /tmp/smoke/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
      - name: Publish to PyPI (Trusted Publisher / OIDC)
        # PyPI side is configured: project molecule-ai-workspace-runtime →
        # publisher molecule-ai/molecule-core, workflow publish-runtime.yml,
        # environment pypi-publish. The action mints a short-lived OIDC
        # token and exchanges it for a PyPI upload credential — no static
        # API token in this repo's secrets.
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          packages-dir: ${{ runner.temp }}/runtime-build/dist/
  cascade:
    # After PyPI accepts the upload, fan out a repository_dispatch to each
    # template repo so they rebuild their image against the new runtime.
    # Each template's `runtime-published.yml` receiver picks up the event,
    # pulls the new PyPI version (their requirements.txt pin is `>=`), and
    # republishes ghcr.io/molecule-ai/workspace-template-<runtime>:latest.
    #
    # Soft-fail per repo: if one template's dispatch fails (perms missing,
    # repo archived, etc.) we still try the others and surface the failures
    # in the workflow summary instead of aborting the whole cascade.
    needs: publish
    runs-on: ubuntu-latest
    steps:
      - name: Wait for PyPI to propagate the new version
        # PyPI accepts the upload, then takes a few seconds to make the
        # new version visible across all THREE surfaces pip touches:
        #   1. /pypi/<pkg>/<ver>/json — metadata endpoint
        #   2. /simple/<pkg>/         — pip's primary download index
        #   3. files.pythonhosted.org — CDN-fronted wheel binary
        # Each has its own cache. The previous check polled only (1)
        # and would let the cascade fire while (2) or (3) still served
        # the previous version, so downstream `pip install` resolved
        # to the old wheel. Docker layer cache then locked that stale
        # resolution in for subsequent rebuilds (the cache trap that
        # bit us five times in one night).
        #
        # Two-stage probe per poll:
        #   (a) `pip install --no-cache-dir PACKAGE==VERSION` — succeeds
        #       only when the version is resolvable. Catches surface (1)
        #       and (2) propagation lag.
        #   (b) `pip download` of the same wheel + SHA256 compare against
        #       the just-built dist's hash. Catches surface (3) lag AND
        #       Fastly serving stale content under the new version's URL
        #       (a separate Fastly-corruption mode that pip-install alone
        #       can't see, since pip install resolves+unpacks against
        #       whatever bytes Fastly returns and never inspects them).
        # Both must pass before the cascade fans out.
        #
        # The venv is reused across polls; only `pip install`/`pip
        # download` run in the loop, with --force-reinstall +
        # --no-cache-dir so the previous poll's cached state doesn't
        # mask propagation lag.
        env:
          RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
          EXPECTED_SHA256: ${{ needs.publish.outputs.wheel_sha256 }}
        run: |
          set -eu
          if [ -z "$EXPECTED_SHA256" ]; then
            echo "::error::publish job did not expose wheel_sha256 — cannot verify wheel content. Refusing to fan out cascade."
            exit 1
          fi
          python -m venv /tmp/propagation-probe
          PROBE=/tmp/propagation-probe/bin
          $PROBE/pip install --upgrade --quiet pip
          # Poll budget: 30 attempts × (~3-5s pip install + ~3s pip
          # download + 4s sleep) ≈ 5-6 min wall on a slow GH runner.
          # Generous vs PyPI's typical few-seconds propagation;
          # failures past this are signal of a real PyPI / Fastly
          # issue, not just lag.
          for i in $(seq 1 30); do
            # Stage (a): can pip resolve and install the version?
            if $PROBE/pip install \
                  --quiet \
                  --no-cache-dir \
                  --force-reinstall \
                  --no-deps \
                  "molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
                  >/dev/null 2>&1; then
              INSTALLED=$($PROBE/pip show molecule-ai-workspace-runtime 2>/dev/null \
                          | awk -F': ' '/^Version:/{print $2}')
              if [ "$INSTALLED" = "$RUNTIME_VERSION" ]; then
                # Stage (b): does Fastly serve the bytes we uploaded?
                # `pip download` writes the actual .whl file to disk so
                # we can sha256sum it (vs `pip install` which unpacks
                # and discards).
                rm -rf /tmp/probe-dl
                mkdir -p /tmp/probe-dl
                if $PROBE/pip download \
                      --quiet \
                      --no-cache-dir \
                      --no-deps \
                      --dest /tmp/probe-dl \
                      "molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
                      >/dev/null 2>&1; then
                  WHEEL=$(ls /tmp/probe-dl/*.whl 2>/dev/null | head -1)
                  if [ -n "$WHEEL" ]; then
                    ACTUAL=$(sha256sum "$WHEEL" | awk '{print $1}')
                    if [ "$ACTUAL" = "$EXPECTED_SHA256" ]; then
                      echo "::notice::✓ pip resolves AND wheel content matches after ${i} poll(s) (sha256=${EXPECTED_SHA256})"
                      exit 0
                    fi
                    # Hash mismatch: PyPI accepted our upload but Fastly
                    # is serving different bytes under the version's URL.
                    # Most often this is propagation lag of the BINARY
                    # surface — the version is resolvable but the wheel
                    # cache hasn't caught up. Retry.
                    echo "::warning::poll ${i}: wheel content mismatch (got ${ACTUAL:0:12}…, want ${EXPECTED_SHA256:0:12}…) — Fastly likely still serving stale binary, retrying"
                  fi
                fi
              fi
            fi
            sleep 4
          done
          echo "::error::pip never resolved molecule-ai-workspace-runtime==${RUNTIME_VERSION} with matching wheel content within ~5 min."
          echo "::error::Expected wheel SHA256: ${EXPECTED_SHA256}"
          echo "::error::Refusing to fan out cascade against stale or corrupt PyPI surfaces."
          exit 1
      - name: Fan out via push to .runtime-version
        env:
          # Gitea PAT with write:repository scope on the 8 cascade-active
          # template repos. Used here for `git push` (NOT for an API
          # dispatch — Gitea 1.22.6 has no repository_dispatch endpoint;
          # empirically verified across 6 candidate paths in molecule-
          # core#20 issuecomment-913). The push trips each template's
          # existing `on: push: branches: [main]` trigger on
          # publish-image.yml, which then reads the updated
          # .runtime-version via its resolve-version job.
          DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
          RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
        run: |
          set +e   # don't abort on a single repo failure — collect them all
          # Soft-skip on workflow_dispatch when the token is missing
          # (operator ad-hoc test); hard-fail on push so unattended
          # publishes can't silently skip the cascade. Same shape as
          # the original v1, intentional split per the schedule-vs-
          # dispatch hardening 2026-04-28.
          if [ -z "$DISPATCH_TOKEN" ]; then
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
              echo "::warning::DISPATCH_TOKEN secret not set — skipping cascade."
              echo "::warning::set it at Settings → Secrets and Variables → Actions, then rerun. Templates will stay on the prior runtime version until either this token is set or each template is rebuilt manually."
              exit 0
            fi
            echo "::error::DISPATCH_TOKEN secret missing — cascade cannot fan out."
            echo "::error::PyPI was published, but the 8 template repos will NOT pick up the new version until this token is restored and a republish dispatches the cascade."
            echo "::error::set it at Settings → Secrets and Variables → Actions; then re-trigger publish-runtime via workflow_dispatch."
            exit 1
          fi
          VERSION="$RUNTIME_VERSION"
          if [ -z "$VERSION" ]; then
            echo "::error::publish job did not expose a version output — cascade cannot fan out"
            exit 1
          fi
          # All 9 workspace templates declared in manifest.json. The list
          # MUST stay aligned with manifest.json's workspace_templates —
          # cascade-list-drift-gate.yml enforces this in CI per the
          # codex-stuck-on-stale-runtime invariant from PR #2556.
          # Long-term goal: derive this list from manifest.json so it
          # can't drift even on a manifest edit (RFC #388 Phase-1).
          #
          # Per-template publish-image.yml presence is checked at
          # cascade-time below: codex doesn't ship one today, so the
          # cascade soft-skips it with an informational message rather
          # than dropping it from this list (which would re-introduce
          # the drift the gate exists to catch).
          GITEA_URL="${GITEA_URL:-https://git.moleculesai.app}"
          TEMPLATES="claude-code hermes openclaw codex langgraph crewai autogen deepagents gemini-cli"
          FAILED=""
          SKIPPED=""
          # Configure git identity once. The persona owning DISPATCH_TOKEN
          # is the same identity that authored this commit on each
          # template; using a generic "publish-runtime cascade" co-author
          # trailer in the message keeps the audit trail honest about the
          # workflow-driven origin.
          git config --global user.name  "publish-runtime cascade"
          git config --global user.email "publish-runtime@moleculesai.app"
          WORKDIR="$(mktemp -d)"
          for tpl in $TEMPLATES; do
            REPO="molecule-ai/molecule-ai-workspace-template-$tpl"
            CLONE="$WORKDIR/$tpl"
            # Pre-check: skip templates without a publish-image.yml.
            # The cascade's job is to trip the template's on-push
            # rebuild — if there's no rebuild workflow, pushing a
            # .runtime-version commit is just noise on the target
            # repo. Use the Gitea contents API (no clone required for
            # the probe). 200 = present; 404 = absent.
            HTTP=$(curl -sS -o /dev/null -w "%{http_code}" \
              -H "Authorization: token $DISPATCH_TOKEN" \
              "$GITEA_URL/api/v1/repos/$REPO/contents/.github/workflows/publish-image.yml")
            if [ "$HTTP" = "404" ]; then
              echo "↷ $tpl has no publish-image.yml — soft-skip (informational; manifest still tracks it)"
              SKIPPED="$SKIPPED $tpl"
              continue
            fi
            if [ "$HTTP" != "200" ]; then
              echo "::warning::$tpl publish-image.yml probe returned HTTP $HTTP — proceeding anyway, push will surface the real failure if any"
            fi
            # Use a per-template attempt loop so a transient race (e.g.
            # human pushing to the same template at the same instant)
            # doesn't lose the cascade. Bounded retries (3) — beyond
            # that we surface the failure and let the operator retry.
            attempt=0
            success=false
            while [ $attempt -lt 3 ]; do
              attempt=$((attempt + 1))
              rm -rf "$CLONE"
              if ! git clone --depth=1 \
                  "https://x-access-token:${DISPATCH_TOKEN}@${GITEA_URL#https://}/$REPO.git" \
                  "$CLONE" >/tmp/clone.log 2>&1; then
                echo "::warning::clone $tpl attempt $attempt failed: $(tail -n3 /tmp/clone.log)"
                sleep 2
                continue
              fi
              cd "$CLONE"
              echo "$VERSION" > .runtime-version
              # Idempotency guard: if the file already matches, this
              # publish is a re-run for a version already cascaded.
              # Don't push a no-op commit (would spuriously re-trip the
              # template's on-push and rebuild for nothing).
              if git diff --quiet -- .runtime-version; then
                echo "✓ $tpl already at $VERSION — no commit needed (idempotent)"
                success=true
                cd - >/dev/null
                break
              fi
              git add .runtime-version
              git commit -m "chore: pin runtime to $VERSION (publish-runtime cascade)" \
                -m "Co-Authored-By: publish-runtime cascade <publish-runtime@moleculesai.app>" \
                >/dev/null
              if git push origin HEAD:main >/tmp/push.log 2>&1; then
                echo "✓ $tpl pushed $VERSION on attempt $attempt"
                success=true
                cd - >/dev/null
                break
              fi
              # Likely a non-fast-forward — pull-rebase and retry.
              # Don't force-push: that would silently overwrite a racing
              # human/cascade commit.
              echo "::warning::push $tpl attempt $attempt failed, pull-rebasing: $(tail -n3 /tmp/push.log)"
              git pull --rebase origin main >/tmp/rebase.log 2>&1 || true
              cd - >/dev/null
            done
            if [ "$success" != "true" ]; then
              FAILED="$FAILED $tpl"
            fi
          done
          rm -rf "$WORKDIR"
          if [ -n "$FAILED" ]; then
            echo "::error::Cascade incomplete after 3 retries each. Failed templates:$FAILED"
            echo "::error::PyPI publish succeeded; failed templates lag the new version. Re-run this workflow_dispatch with the same version to retry only the laggers (idempotent — already-cascaded templates skip)."
            exit 1
          fi
          if [ -n "$SKIPPED" ]; then
            echo "Cascade complete: pinned $VERSION on cascade-active templates. Soft-skipped (no publish-image.yml):$SKIPPED"
          else
            echo "Cascade complete: $VERSION pinned across all manifest workspace_templates."
          fi
--- a/.github/workflows/publish-workspace-server-image.yml
+++ b/.github/workflows/publish-workspace-server-image.yml
@ -1,262 +0,0 @@
 name: publish-workspace-server-image
 # Builds and pushes Docker images to GHCR on staging or main pushes.
 # EC2 tenant instances pull the tenant image from GHCR.
 #
 # Branch / tag policy (see Compute tags step for the per-branch logic):
 #
 #   staging push  → builds image, tags :staging-<sha> + :staging-latest.
 #                   staging-CP pins TENANT_IMAGE=:staging-latest, so it
 #                   picks up staging-branch code automatically. This is
 #                   what makes staging-CP actually test staging-branch
 #                   code instead of "yesterday's main" — pre-fix, this
 #                   workflow only ran on main, so staging tenants
 #                   silently served stale code (#2308 fix RFC #2312
 #                   landed on staging but never reached tenants because
 #                   staging→main was wedged on path-filter parity bugs).
 #
 #   main push     → builds image, tags :staging-<sha> + :staging-latest
 #                   (same as before). canary-verify.yml retags
 #                   :staging-<sha> → :latest after canary tenants
 #                   green-light the digest. The :staging-latest retag
 #                   on main push is intentional: when main lands AFTER a
 #                   staging push, staging-CP gets the post-promote code
 #                   (which equals what it had + any merge resolution),
 #                   so the canary-on-staging-CP step still runs against
 #                   the prod-bound digest.
 #
 # In the steady state both branches refresh :staging-latest; the
 # semantic is "most recent staging-or-main build of tenant code."
 # Drift between the two is bounded by the staging→main auto-promote
 # cadence and is corrected on the next staging push.
 on:
  push:
    branches: [staging, main]
    paths:
      - 'workspace-server/**'
      - 'canvas/**'
      - 'manifest.json'
      - 'scripts/**'
      - '.github/workflows/publish-workspace-server-image.yml'
  workflow_dispatch:
 # Serialize per-branch so two rapid staging pushes don't race the same
 # :staging-latest tag retag. Allow staging and main to run in parallel
 # (different github.ref → different concurrency group) since they
 # produce different :staging-<sha> tags and last-write-wins on
 # :staging-latest is acceptable across branches (the post-promote
 # main code equals current staging code in a healthy flow).
 #
 # cancel-in-progress: false → in-flight builds finish; the next push's
 # build queues. This avoids a partially-pushed image and keeps the
 # canary fleet pin (:staging-<sha>) consistent with what was actually
 # tested at canary-verify time.
 concurrency:
  group: publish-workspace-server-image-${{ github.ref }}
  cancel-in-progress: false
 permissions:
  contents: read
  packages: write
 env:
  IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
  TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
 jobs:
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      # github-app-auth sibling-checkout removed 2026-05-07 (#157):
      # plugin was dropped + workspace-server/Dockerfile no longer
      # COPYs it.
      # ECR auth + buildx setup are now inline in each build step
      # below (Task #173, 2026-05-07).
      #
      # Why moved inline: aws-actions/configure-aws-credentials@v4 +
      # aws-actions/amazon-ecr-login@v2 + docker/setup-buildx-action
      # all left auth state in places that the actual `docker push`
      # couldn't see on Gitea Actions:
      #   - The actions wrote to a step-scoped DOCKER_CONFIG path
      #     that didn't survive into subsequent shell steps.
      #   - Buildx couldn't bridge the runner container ↔
      #     operator-host docker daemon auth gap (401 on the
      #     docker-container driver, "no basic auth credentials"
      #     with the action-driven login).
      #
      # Doing AWS+ECR auth inline (`aws ecr get-login-password |
      # docker login`) in the same shell step as `docker build` +
      # `docker push` is the operator-host manual approach, mapped
      # 1:1 into CI. Auth state is guaranteed to live in the env that
      # `docker push` actually runs from.
      #
      # Post-suspension target is the operator's ECR org
      # (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/*),
      # which already hosts platform-tenant + workspace-template-* +
      # runner-base images. AWS creds come from the
      # AWS_ACCESS_KEY_ID/SECRET secrets bound to the molecule-cp
      # IAM user. Closes #161.
      - name: Compute tags
        id: tags
        run: |
          echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
      # Pre-clone manifest deps before docker build (Task #173 fix).
      #
      # Why pre-clone: post-2026-05-06, every workspace-template-* repo on
      # Gitea (codex, crewai, deepagents, gemini-cli, langgraph) plus all
      # 7 org-template-* repos are private. The pre-fix Dockerfile.tenant
      # ran `git clone` inside an in-image stage, which had no auth path
      # — every CI build failed with "fatal: could not read Username for
      # https://git.moleculesai.app". For weeks, every workspace-server
      # rebuild required a manual operator-host push. Now we clone in the
      # trusted CI context (where AUTO_SYNC_TOKEN is naturally available)
      # and Dockerfile.tenant just COPYs from .tenant-bundle-deps/.
      #
      # Token shape: AUTO_SYNC_TOKEN is the devops-engineer persona PAT
      # (see /etc/molecule-bootstrap/agent-secrets.env). Per saved memory
      # `feedback_per_agent_gitea_identity_default`, every CI surface uses
      # a per-persona token, never the founder PAT. clone-manifest.sh
      # embeds it as basic-auth (oauth2:<token>) for the duration of the
      # clones, then strips .git directories — the token never enters
      # the resulting image.
      #
      # Idempotent: if a re-run finds populated dirs, clone-manifest.sh
      # skips them; safe to retrigger via path-filter or workflow_dispatch.
      - name: Pre-clone manifest deps
        env:
          MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
        run: |
          set -euo pipefail
          if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
            echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
            exit 1
          fi
          mkdir -p .tenant-bundle-deps
          bash scripts/clone-manifest.sh \
            manifest.json \
            .tenant-bundle-deps/workspace-configs-templates \
            .tenant-bundle-deps/org-templates \
            .tenant-bundle-deps/plugins
          # Sanity-check counts so a silent partial clone fails fast
          # instead of producing a half-empty image.
          ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
          org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
          plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
          echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
          # Counts are derived from manifest.json (9 ws / 7 org / 21
          # plugins as of 2026-05-07). If manifest.json grows but the
          # clone step regresses silently, the find above caps at the
          # actual disk state — but clone-manifest.sh's own EXPECTED vs
          # CLONED check (line ~95) is the authoritative fail-fast.
      # Canary-gated release flow:
      #   - This step always publishes :staging-<sha> + :staging-latest.
      #   - On staging push, staging-CP picks up :staging-latest immediately
      #     (its TENANT_IMAGE pin is :staging-latest) — so staging-branch
      #     code reaches staging tenants without waiting for main.
      #   - On main push, canary-verify.yml runs smoke tests against
      #     canary tenants (which pin :staging-<sha>), and on green retags
      #     :staging-<sha> → :latest. Prod tenants pull :latest.
      #   - On red, :latest stays on the prior good digest — prod is safe.
      #
      # Why :staging-latest is retagged on main push too: when main lands
      # after a staging promote, staging-CP gets the post-promote code so
      # the canary-on-staging-CP step still runs against the prod-bound
      # digest. In a healthy flow the post-promote main code == the
      # current staging code, so this is effectively a no-op except for
      # the canary fleet pin handoff.
      #
      # Pre-fix history: this workflow used to only trigger on main. That
      # meant staging-CP served "yesterday's main" indefinitely whenever
      # staging→main was wedged. The 2026-04-30 dogfooding session
      # surfaced this when RFC #2312 (chat upload HTTP-forward) landed on
      # staging but staging tenants kept failing chat upload because they
      # were running pre-RFC code. Adding the staging trigger above closes
      # that gap. Earlier 2026-04-24 incident: a static :staging-<sha> pin
      # drifted 10 days behind staging — same class of bug, different
      # mechanism. ECR repo molecule-ai/platform created 2026-05-07.
      # Build + push platform image with plain `docker` (no buildx).
      # GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
      # returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
      # The OCI revision label below carries the same value for registry
      # tooling; the duplication is intentional.
      - name: Build & push platform image to ECR (staging-<sha> + staging-latest)
        env:
          IMAGE_NAME: ${{ env.IMAGE_NAME }}
          TAG_SHA: staging-${{ steps.tags.outputs.sha }}
          TAG_LATEST: staging-latest
          GIT_SHA: ${{ github.sha }}
          REPO: ${{ github.repository }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: us-east-2
        run: |
          set -euo pipefail
          # ECR auth in-step so config.json is populated in the same
          # shell env that runs `docker push`. ECR get-login-password
          # tokens last 12h, plenty for a single-step build+push.
          ECR_REGISTRY="${IMAGE_NAME%%/*}"
          aws ecr get-login-password --region us-east-2 | \
            docker login --username AWS --password-stdin "${ECR_REGISTRY}"
          docker build \
            --file ./workspace-server/Dockerfile \
            --build-arg GIT_SHA="${GIT_SHA}" \
            --label "org.opencontainers.image.source=https://github.com/${REPO}" \
            --label "org.opencontainers.image.revision=${GIT_SHA}" \
            --label "org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify" \
            --tag "${IMAGE_NAME}:${TAG_SHA}" \
            --tag "${IMAGE_NAME}:${TAG_LATEST}" \
            .
          docker push "${IMAGE_NAME}:${TAG_SHA}"
          docker push "${IMAGE_NAME}:${TAG_LATEST}"
      # Canvas uses same-origin fetches. The tenant Go platform
      # reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL
      # env; the tenant's /canvas/viewport, /approvals/pending,
      # /org/templates etc. live on the tenant platform itself.
      # Both legs share one origin (the tenant subdomain) so
      # PLATFORM_URL="" forces canvas to fetch paths as relative,
      # which land same-origin.
      #
      # Self-hosted / private-label deployments override this at
      # build time with a specific backend (e.g. local dev:
      # NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
      - name: Build & push tenant image to ECR (staging-<sha> + staging-latest)
        env:
          TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
          TAG_SHA: staging-${{ steps.tags.outputs.sha }}
          TAG_LATEST: staging-latest
          GIT_SHA: ${{ github.sha }}
          REPO: ${{ github.repository }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: us-east-2
        run: |
          set -euo pipefail
          # Re-login: the platform-image step's docker login wrote to
          # the same config.json, so this is technically redundant — but
          # making each push step self-contained keeps the workflow
          # robust to step reordering / future extraction.
          ECR_REGISTRY="${TENANT_IMAGE_NAME%%/*}"
          aws ecr get-login-password --region us-east-2 | \
            docker login --username AWS --password-stdin "${ECR_REGISTRY}"
          docker build \
            --file ./workspace-server/Dockerfile.tenant \
            --build-arg NEXT_PUBLIC_PLATFORM_URL= \
            --build-arg GIT_SHA="${GIT_SHA}" \
            --label "org.opencontainers.image.source=https://github.com/${REPO}" \
            --label "org.opencontainers.image.revision=${GIT_SHA}" \
            --label "org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify" \
            --tag "${TENANT_IMAGE_NAME}:${TAG_SHA}" \
            --tag "${TENANT_IMAGE_NAME}:${TAG_LATEST}" \
            .
          docker push "${TENANT_IMAGE_NAME}:${TAG_SHA}"
          docker push "${TENANT_IMAGE_NAME}:${TAG_LATEST}"
--- a/.github/workflows/redeploy-tenants-on-main.yml
+++ b/.github/workflows/redeploy-tenants-on-main.yml
@ -3,9 +3,9 @@ name: redeploy-tenants-on-main
 # Auto-refresh prod tenant EC2s after every main merge.
 #
 # Why this workflow exists: publish-workspace-server-image builds and
-# pushes a new platform-tenant:latest + :<sha> to GHCR on every merge
+# pushes a new platform-tenant :<sha> to ECR on every merge to main,
-# to main, but running tenants pulled their image once at boot and
+# but running tenants pulled their image once at boot and never re-pull.
-# never re-pull. Users see stale code indefinitely.
+# Users see stale code indefinitely.
 #
 # This workflow closes the gap by calling the control-plane admin
 # endpoint that performs a canary-first, batched, health-gated rolling
@ -13,12 +13,18 @@ name: redeploy-tenants-on-main
 # molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
 # (feat/tenant-auto-redeploy, landing alongside this workflow).
 #
 # Registry: ECR (153263036946.dkr.ecr.us-east-2.amazonaws.com/
 # molecule-ai/platform-tenant). GHCR was retired 2026-05-07 during the
 # Gitea suspension migration. The canary-verify.yml promote step now
 # uses the same redeploy-fleet endpoint (fixes the silent-GHCR gap).
 #
 # Runtime ordering:
-#   1. publish-workspace-server-image completes → new :latest in GHCR.
+#   1. publish-workspace-server-image completes → new :staging-<sha> in ECR.
-#   2. This workflow fires via workflow_run, waits 30s for GHCR's
+#   2. This workflow fires via workflow_run, calls redeploy-fleet with
-#      CDN to propagate the new tag to the region the tenants pull from.
+#      target_tag=staging-<sha>. No CDN propagation wait needed —
-#   3. Calls redeploy-fleet with canary_slug=hongming and a 60s
+#      ECR image manifest is consistent immediately after push.
-#      soak. Canary proves the image boots; batches follow.
+#   3. Calls redeploy-fleet with canary_slug (if set) and a soak
 #      period. Canary proves the image boots; batches follow.
 #   4. Any failure aborts the rollout and leaves older tenants on the
 #      prior image — safer default than half-and-half state.
 #
@ -108,13 +114,11 @@ jobs:
    runs-on: ubuntu-latest
    timeout-minutes: 25
    steps:
-      - name: Wait for GHCR tag propagation
+      - name: Note on ECR propagation
-        # GHCR's edge cache takes ~15-30s to consistently serve the new
+        # ECR image manifests are consistent immediately after push — no
-        # manifest after the registry accepts the push. Without this
+        # CDN cache to wait for. The old GHCR-based workflow had a 30s
-        # sleep, the first tenant's docker pull sometimes races and
+        # sleep to avoid race conditions; ECR makes that unnecessary.
-        # fetches the previous digest; sleeping is the cheapest way to
+        run: echo "ECR image available immediately after push — proceeding."
        # reduce that without polling GHCR for the new digest.
        run: sleep 30
      - name: Compute target tag
        id: tag
--- a/.github/workflows/retarget-main-to-staging.yml
+++ b/.github/workflows/retarget-main-to-staging.yml
@ -1,276 +0,0 @@
 name: Retarget main PRs to staging
 # Mechanical enforcement of SHARED_RULES rule 8 ("Staging-first
 # workflow, no exceptions"). When a bot opens a PR against `main`,
 # retarget it to `staging` automatically and leave an explanatory
 # comment. Human / CEO-authored PRs (the staging→main promotion
 # PRs, etc.) are left alone — they're the authorised exception
 # to the rule.
 #
 # ============================================================
 # What this workflow does
 # ============================================================
 #
 # On `pull_request_target` opened/reopened against `main`:
 #   1. If the PR head is `staging`, skip (the auto-promote PRs
 #      MUST stay base=main).
 #   2. If the PR author is a bot, retarget the PR base to
 #      `staging` via Gitea REST `PATCH /pulls/{N}` body
 #      `{"base":"staging"}`.
 #   3. If the retarget returns 422 "pull request already exists
 #      for base branch 'staging'" (issue #1884 case: another PR
 #      on the same head already targets staging), close the
 #      now-redundant main-PR via Gitea REST instead of failing
 #      red.
 #   4. Post an explainer comment on the retargeted PR via
 #      Gitea REST `POST /issues/{N}/comments`.
 #
 # ============================================================
 # Why Gitea REST (and not `gh api / gh pr close / gh pr comment`)
 # ============================================================
 #
 # Pre-2026-05-06 this workflow used `gh api -X PATCH "repos/{owner}/{repo}/pulls/{N}" -f base=staging`
 # plus `gh pr close` and `gh pr comment`. After the GitHub→Gitea
 # cutover those calls fail because:
 #
 #   - `gh` CLI defaults to `api.github.com`. Even with `GH_HOST`
 #     pointing at Gitea, `gh pr close / comment` route through
 #     GraphQL (`/api/graphql`) which Gitea does not expose.
 #     Empirical: every `gh pr *` call returns
 #     `HTTP 405 Method Not Allowed (https://git.moleculesai.app/api/graphql)`
 #     — same root cause as #65 (auto-sync, fixed in PR #66) and
 #     #73/#195 (auto-promote, fixed in PR #78).
 #   - `gh api -X PATCH /pulls/{N}` happens to use a REST path
 #     that Gitea also has, but the `gh` host-resolution layer
 #     and pagination/retry logic don't always hit Gitea cleanly,
 #     and the cost of switching to direct `curl` is one extra
 #     line of code.
 #
 # So this workflow uses direct `curl` calls to Gitea REST. No
 # `gh` CLI dependency, no GraphQL, no flaky host-resolution.
 #
 # ============================================================
 # Identity + token (anti-bot-ring per saved-memory
 # `feedback_per_agent_gitea_identity_default`)
 # ============================================================
 #
 # Pre-fix this workflow used the per-job ephemeral
 # `secrets.GITHUB_TOKEN`. On Gitea Actions that token has
 # narrow scope and unpredictable cross-PR write capability.
 #
 # Post-fix: `secrets.AUTO_SYNC_TOKEN` (the `devops-engineer`
 # Gitea persona). Same persona used by `auto-sync-main-to-staging.yml`
 # (PR #66) and `auto-promote-staging.yml` (PR #78). Token scope:
 # `push: true` repo write, sufficient for PR-edit + close + comment.
 #
 # Why this token does NOT need branch-protection bypass:
 # patching a PR's base ref is a PR-level operation that does not
 # require push perms on either branch (the PR's own commits stay
 # put; only the metadata changes).
 #
 # ============================================================
 # Failure modes & operational notes
 # ============================================================
 #
 # A — PATCH base→staging returns 422 "pull request already exists"
 #     (issue #1884 case):
 #     - Detected by string-match on response body. Workflow
 #       falls through to closing the now-redundant main-PR
 #       (Gitea REST `PATCH /pulls/{N}` with `state: closed`)
 #       and posts an explanation comment. Step summary surfaces.
 #
 # B — `AUTO_SYNC_TOKEN` rotated / wrong scope:
 #     - First REST call returns 401/403. Step summary surfaces.
 #       Re-issue token from `~/.molecule-ai/personas/` on the
 #       operator host and update repo Actions secret.
 #
 # C — PR was deleted between trigger and run:
 #     - REST call returns 404. Workflow exits 0 with a notice
 #       (the rule was already enforced or the PR is gone).
 #
 # D — author is not actually a bot but the filter mis-fires:
 #     - Filter is conservative: only triggers on
 #       `user.type == 'Bot'`, `login` ends with `[bot]`, or
 #       known bot logins (`molecule-ai[bot]`, `app/molecule-ai`).
 #       Human PRs slip through unaffected. If a NEW bot login
 #       starts shipping main-PRs, add it to the filter.
 on:
  pull_request_target:
    types: [opened, reopened]
    branches: [main]
 permissions:
  pull-requests: write
 jobs:
  retarget:
    name: Retarget to staging
    runs-on: ubuntu-latest
    # Only fire for bot-authored PRs. Human CEO PRs (staging→main
    # promotion) are intentional and pass through.
    #
    # Head-ref guard: never retarget a PR whose head IS `staging`
    # — those are the auto-promote staging→main PRs (opened by
    # `devops-engineer` since PR #78 / #195 fix). Retargeting
    # head=staging onto base=staging fails with HTTP 422 "no new
    # commits between base 'staging' and head 'staging'", which
    # would surface as a noisy red workflow run on every
    # auto-promote (caught 2026-05-03 on the GitHub-era PR #2588).
    if: >-
      github.event.pull_request.head.ref != 'staging'
      && (
        github.event.pull_request.user.type == 'Bot'
        || endsWith(github.event.pull_request.user.login, '[bot]')
        || github.event.pull_request.user.login == 'app/molecule-ai'
        || github.event.pull_request.user.login == 'molecule-ai[bot]'
        || github.event.pull_request.user.login == 'devops-engineer'
      )
    steps:
      - name: Retarget PR base to staging via Gitea REST
        id: retarget
        env:
          GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
          GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
          REPO: ${{ github.repository }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
        # Issue #1884 case: when the bot opens a PR against main
        # and there's already another PR on the same head branch
        # targeting staging, Gitea's PATCH returns 422 with a
        # body mentioning "pull request already exists for base
        # branch 'staging'" (the Gitea message wording is
        # slightly different from GitHub's; the substring match
        # below covers both for forward/back compat).
        # The retarget can't proceed — but the right response is
        # to close the now-redundant main-PR, not to fail the
        # workflow noisily. Detect that specific 422 and close
        # instead.
        run: |
          set -euo pipefail
          API="${GITEA_HOST}/api/v1/repos/${REPO}"
          AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json")
          echo "Retargeting PR #${PR_NUMBER} (author: ${PR_AUTHOR}) from main → staging"
          # Curl-status-capture pattern per `feedback_curl_status_capture_pollution`:
          # http_code via -w to its own scalar, body to a tempfile, set +e/-e
          # bracket so curl's non-zero-on-4xx doesn't pollute the script's exit chain.
          BODY_FILE=$(mktemp)
          REQ='{"base":"staging"}'
          set +e
          STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
            -X PATCH -d "${REQ}" \
            -o "${BODY_FILE}" -w "%{http_code}" \
            "${API}/pulls/${PR_NUMBER}")
          CURL_RC=$?
          set -e
          if [ "${CURL_RC}" -ne 0 ]; then
            echo "::error::curl PATCH failed (rc=${CURL_RC})"
            rm -f "${BODY_FILE}"
            exit 1
          fi
          if [ "${STATUS}" = "201" ] || [ "${STATUS}" = "200" ]; then
            NEW_BASE=$(jq -r '.base.ref // "?"' < "${BODY_FILE}")
            rm -f "${BODY_FILE}"
            if [ "${NEW_BASE}" = "staging" ]; then
              echo "::notice::Retargeted PR #${PR_NUMBER} → staging"
              echo "outcome=retargeted" >> "$GITHUB_OUTPUT"
              exit 0
            fi
            echo "::error::PATCH returned ${STATUS} but base.ref is '${NEW_BASE}', not 'staging'"
            exit 1
          fi
          # Specifically match the 422 duplicate-base/head error so
          # any OTHER PATCH failure (auth, deleted PR, etc.) still
          # surfaces as a real workflow failure.
          BODY=$(cat "${BODY_FILE}" || true)
          rm -f "${BODY_FILE}"
          if [ "${STATUS}" = "422" ] && echo "${BODY}" | grep -qE "(pull request already exists for base branch 'staging'|already exists.*base.*staging)"; then
            echo "::notice::PR #${PR_NUMBER}: duplicate target-staging PR exists on same head — closing this main-PR as redundant."
            # Close the now-redundant main-PR via Gitea REST
            # (PATCH state=closed). Post comment explaining
            # rationale BEFORE close so the comment lands on the
            # PR (commenting on a closed PR works on Gitea, but
            # historically caused notification ordering surprises).
            CLOSE_BODY_FILE=$(mktemp)
            CMT_REQ=$(jq -n '{body:"[retarget-bot] Closing — another PR on the same head branch already targets `staging`. This PR is redundant. See issue #1884 for the rationale."}')
            set +e
            CMT_STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
              -X POST -d "${CMT_REQ}" \
              -o "${CLOSE_BODY_FILE}" -w "%{http_code}" \
              "${API}/issues/${PR_NUMBER}/comments")
            set -e
            if [ "${CMT_STATUS}" != "201" ]; then
              echo "::warning::dup-close comment POST returned ${CMT_STATUS}; continuing to close anyway"
              cat "${CLOSE_BODY_FILE}" | head -c 300 || true
            fi
            rm -f "${CLOSE_BODY_FILE}"
            CLOSE_REQ='{"state":"closed"}'
            CLOSE_RESP=$(mktemp)
            set +e
            CL_STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
              -X PATCH -d "${CLOSE_REQ}" \
              -o "${CLOSE_RESP}" -w "%{http_code}" \
              "${API}/pulls/${PR_NUMBER}")
            set -e
            if [ "${CL_STATUS}" = "201" ] || [ "${CL_STATUS}" = "200" ]; then
              echo "::notice::Closed PR #${PR_NUMBER} as redundant"
              echo "outcome=closed-as-duplicate" >> "$GITHUB_OUTPUT"
              rm -f "${CLOSE_RESP}"
              exit 0
            fi
            echo "::error::Failed to close redundant PR: HTTP ${CL_STATUS}"
            cat "${CLOSE_RESP}" | head -c 300 || true
            rm -f "${CLOSE_RESP}"
            exit 1
          fi
          echo "::error::Retarget PATCH failed and was NOT a duplicate-base error: HTTP ${STATUS}"
          echo "${BODY}" | head -c 500 >&2
          exit 1
      - name: Post explainer comment
        if: steps.retarget.outputs.outcome == 'retargeted'
        env:
          GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
          GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
          REPO: ${{ github.repository }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          set -euo pipefail
          API="${GITEA_HOST}/api/v1/repos/${REPO}"
          AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json")
          # PR comments live on the issue endpoint in Gitea
          # (PRs ARE issues — same endpoint, different sub-resources
          # for diffs/files/etc.). The body uses jq to safely
          # encode the multi-line markdown without shell-quote
          # nightmares.
          REQ=$(jq -n '{body:"[retarget-bot] This PR was opened against `main` and has been retargeted to `staging` automatically.\n\n**Why:** per [SHARED_RULES rule 8](https://git.moleculesai.app/molecule-ai/molecule-ai-org-template-molecule-dev/src/branch/main/SHARED_RULES.md), all feature work targets `staging` first; the CEO promotes `staging → main` separately.\n\n**What changed:** just the base branch — no code change. CI will re-run against `staging`. If you get merge conflicts, rebase on `staging`.\n\n**If this PR is the CEO`s staging→main promotion:** the Action skipped you (only bot-authored PRs are retargeted, head=staging is also exempted). If you see this comment on your CEO PR, that`s a bug — please tag @hongmingwang."}')
          BODY_FILE=$(mktemp)
          set +e
          STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
            -X POST -d "${REQ}" \
            -o "${BODY_FILE}" -w "%{http_code}" \
            "${API}/issues/${PR_NUMBER}/comments")
          set -e
          if [ "${STATUS}" = "201" ]; then
            echo "::notice::Posted explainer comment on PR #${PR_NUMBER}"
          else
            echo "::warning::Failed to post explainer (HTTP ${STATUS}) — retarget itself succeeded"
            cat "${BODY_FILE}" | head -c 300 || true
          fi
          rm -f "${BODY_FILE}"
--- a/.github/workflows/secret-pattern-drift.yml
+++ b/.github/workflows/secret-pattern-drift.yml
@ -48,7 +48,7 @@ jobs:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
--- a/README.md
+++ b/README.md
@ -284,7 +284,7 @@ cp .env.example .env
 ./infra/scripts/setup.sh
 # Boots Postgres (:5432), Redis (:6379), Langfuse (:3001),
 # and Temporal (:7233 gRPC, :8233 UI) on the shared
-# `molecule-monorepo-net` Docker network. Temporal runs with
+# `molecule-core-net` Docker network. Temporal runs with
 # no auth on localhost — dev-only; production must gate it.
 #
 # Also populates the template/plugin registry by cloning every repo
--- a/README.zh-CN.md
+++ b/README.zh-CN.md
@ -283,7 +283,7 @@ cp .env.example .env
 ./infra/scripts/setup.sh
 # 启动 Postgres (:5432)、Redis (:6379)、Langfuse (:3001)
 # 以及 Temporal (:7233 gRPC, :8233 UI)，全部挂在共享的
-# `molecule-monorepo-net` Docker 网络上。Temporal 默认无鉴权，
+# `molecule-core-net` Docker 网络上。Temporal 默认无鉴权，
 # 仅用于本地开发；生产环境必须加 mTLS / API Key。
 #
 # 同时会根据 manifest.json 拉取所有模板/插件仓库到
--- a/canvas/.dockerignore
+++ b/canvas/.dockerignore
@ -0,0 +1,10 @@
 # Excluded from `docker build` context. Without this, the COPY . . step in
 # canvas/Dockerfile clobbers the freshly-installed node_modules with the
 # host's (potentially broken / wrong-arch) copy — the @tailwindcss/oxide
 # native binary disagreed and broke `next build`.
 node_modules
 .next
 .git
 *.log
 .env*
 !.env.example
--- a/canvas/Dockerfile
+++ b/canvas/Dockerfile
@ -1,7 +1,11 @@
-FROM node:22-alpine AS builder
+FROM node:22-alpine@sha256:cb15fca92530d7ac113467696cf1001208dac49c3c64355fd1348c11a88ddf8f AS builder
 WORKDIR /app
 COPY package.json package-lock.json* ./
-RUN npm install
+# `npm ci` (not `install`) for lockfile-exact reproducibility.
 # `--include=optional` ensures the platform-specific @tailwindcss/oxide
 # native binary lands — without it, postcss fails with "Cannot read
 # properties of undefined (reading 'All')" at build time.
 RUN npm ci --include=optional
 COPY . .
 ARG NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080
 ARG NEXT_PUBLIC_WS_URL=ws://localhost:8080/ws
@ -11,7 +15,7 @@ ENV NEXT_PUBLIC_WS_URL=$NEXT_PUBLIC_WS_URL
 ENV NEXT_PUBLIC_ADMIN_TOKEN=$NEXT_PUBLIC_ADMIN_TOKEN
 RUN npm run build
-FROM node:22-alpine
+FROM node:22-alpine@sha256:cb15fca92530d7ac113467696cf1001208dac49c3c64355fd1348c11a88ddf8f
 WORKDIR /app
 COPY --from=builder /app/.next/standalone ./
 COPY --from=builder /app/.next/static ./.next/static
--- a/canvas/next.config.ts
+++ b/canvas/next.config.ts
@ -17,6 +17,24 @@ import { dirname, join } from "node:path";
 // update one heuristic. Production is unaffected: `output: "standalone"`
 // bakes resolved env into the build, and the marker file isn't shipped.
 loadMonorepoEnv();
 // Boot-time matched-pair guard for ADMIN_TOKEN / NEXT_PUBLIC_ADMIN_TOKEN.
 // When ADMIN_TOKEN is set on the workspace-server (server-side bearer
 // gate, wsauth_middleware.go ~L245), the canvas MUST send the matching
 // NEXT_PUBLIC_ADMIN_TOKEN as `Authorization: Bearer ...` on every API
 // call. If only one is set, every workspace API call 401s silently —
 // the canvas hydrates with empty data and the user sees a broken page
 // with no console hint about the auth-config mismatch.
 //
 // Pre-fix the matched-pair contract was descriptive only (a comment in
 // .env): future devs/agents could re-misconfigure with one of the two
 // unset and silently 401. Closes the post-PR-#174 self-review gap.
 //
 // Warn-only (not exit) — production canvas Docker images bake these
 // vars into the build at image-build time, and a missed pair there
 // would still emit the warning at runtime via the standalone server's
 // startup. Killing the process on misconfiguration would turn a
 // recoverable auth issue into a hard crashloop.
 checkAdminTokenPair();
 const nextConfig: NextConfig = {
  output: "standalone",
@ -57,6 +75,43 @@ function loadMonorepoEnv() {
  );
 }
 // Boot-time matched-pair guard. Runs after .env has been loaded so the
 // check sees the post-load state. The two env vars must be set or
 // unset together; one-without-the-other is the silent-401 footgun.
 //
 // Treats empty string ("") as unset. An explicitly-empty `KEY=` in
 // .env counts as set-to-empty in `process.env`, but for auth purposes
 // an empty bearer token is equivalent to no token — so both
 // `ADMIN_TOKEN=` and an unset ADMIN_TOKEN are equivalent relative to
 // the matched-pair invariant.
 //
 // Returns void; side effect is the console.error warning. Kept as a
 // separate function (exported) so a future test can reset env, call
 // this, and assert on captured stderr.
 export function checkAdminTokenPair(): void {
  const serverSet = !!process.env.ADMIN_TOKEN;
  const clientSet = !!process.env.NEXT_PUBLIC_ADMIN_TOKEN;
  if (serverSet === clientSet) return;
  // Distinct messages so the operator can tell which half is missing
  // — the fix is symmetric (set the other one) but the diagnostic
  // mentions which side is currently set so they don't have to grep.
  if (serverSet && !clientSet) {
    // eslint-disable-next-line no-console
    console.error(
      "[next.config] ADMIN_TOKEN is set but NEXT_PUBLIC_ADMIN_TOKEN is not — " +
        "canvas will 401 against workspace-server because the bearer header " +
        "is never attached. Set both to the same value, or unset both.",
    );
  } else {
    // eslint-disable-next-line no-console
    console.error(
      "[next.config] NEXT_PUBLIC_ADMIN_TOKEN is set but ADMIN_TOKEN is not — " +
        "workspace-server will reject the bearer because no AdminAuth gate " +
        "is configured. Set both to the same value, or unset both.",
    );
  }
 }
 function findMonorepoRoot(start: string): string | null {
  let dir = start;
  for (let i = 0; i < 6; i++) {
--- a/canvas/package-lock.json
+++ b/canvas/package-lock.json
@ -119,6 +119,7 @@
      "integrity": "sha512-9NhCeYjq9+3uxgdtp20LSiJXJvN0FeCtNGpJxuMFZ1Kv3cWUNb6DOhJwUvcVCzKGR66cw4njwM6hrJLqgOwbcw==",
      "dev": true,
      "license": "MIT",
      "peer": true,
      "dependencies": {
        "@babel/helper-validator-identifier": "^7.28.5",
        "js-tokens": "^4.0.0",
@ -299,7 +300,6 @@
        }
      ],
      "license": "MIT",
      "peer": true,
      "engines": {
        "node": ">=20.19.0"
      },
@ -348,7 +348,6 @@
        }
      ],
      "license": "MIT",
      "peer": true,
      "engines": {
        "node": ">=20.19.0"
      }
@ -360,7 +359,6 @@
      "dev": true,
      "license": "MIT",
      "optional": true,
      "peer": true,
      "dependencies": {
        "@emnapi/wasi-threads": "1.2.1",
        "tslib": "^2.4.0"
@ -372,7 +370,6 @@
      "integrity": "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==",
      "license": "MIT",
      "optional": true,
      "peer": true,
      "dependencies": {
        "tslib": "^2.4.0"
      }
@ -1129,7 +1126,6 @@
      "integrity": "sha512-PG6q63nQg5c9rIi4/Z5lR5IVF7yU5MqmKaPOe0HSc0O2cX1fPi96sUQu5j7eo4gKCkB2AnNGoWt7y4/Xx3Kcqg==",
      "devOptional": true,
      "license": "Apache-2.0",
      "peer": true,
      "dependencies": {
        "playwright": "1.59.1"
      },
@ -2410,7 +2406,8 @@
      "resolved": "https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz",
      "integrity": "sha512-rfT93uj5s0PRL7EzccGMs3brplhcrghnDoV26NqKhCAS1hVo+WdNsPvE/yb6ilfr5hi2MEk6d5EWJTKdxg8jVw==",
      "dev": true,
-      "license": "MIT"
+      "license": "MIT",
      "peer": true
    },
    "node_modules/@types/chai": {
      "version": "5.2.3",
@ -2533,7 +2530,6 @@
      "integrity": "sha512-+qIYRKdNYJwY3vRCZMdJbPLJAtGjQBudzZzdzwQYkEPQd+PJGixUL5QfvCLDaULoLv+RhT3LDkwEfKaAkgSmNQ==",
      "dev": true,
      "license": "MIT",
      "peer": true,
      "dependencies": {
        "undici-types": "~7.19.0"
      }
@ -2543,7 +2539,6 @@
      "resolved": "https://registry.npmjs.org/@types/react/-/react-19.2.14.tgz",
      "integrity": "sha512-ilcTH/UniCkMdtexkoCN0bI7pMcJDvmQFPvuPvmEaYA/NSfFTAgdUSLAoVjaRJm7+6PvcM+q1zYOwS4wTYMF9w==",
      "license": "MIT",
      "peer": true,
      "dependencies": {
        "csstype": "^3.2.2"
      }
@ -2554,7 +2549,6 @@
      "integrity": "sha512-jp2L/eY6fn+KgVVQAOqYItbF0VY/YApe5Mz2F0aykSO8gx31bYCZyvSeYxCHKvzHG5eZjc+zyaS5BrBWya2+kQ==",
      "devOptional": true,
      "license": "MIT",
      "peer": true,
      "peerDependencies": {
        "@types/react": "^19.2.0"
      }
@ -2603,7 +2597,6 @@
      "integrity": "sha512-38C0/Ddb7HcRG0Z4/DUem8x57d2p9jYgp18mkaYswEOQBGsI1CG4f/hjm0ZCeaJfWhSZ4k7jgs29V1Zom7Ki9A==",
      "dev": true,
      "license": "MIT",
      "peer": true,
      "dependencies": {
        "@bcoe/v8-coverage": "^1.0.2",
        "@vitest/utils": "4.1.5",
@ -2814,6 +2807,7 @@
      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
      "dev": true,
      "license": "MIT",
      "peer": true,
      "engines": {
        "node": ">=8"
      }
@ -2824,6 +2818,7 @@
      "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==",
      "dev": true,
      "license": "MIT",
      "peer": true,
      "engines": {
        "node": ">=10"
      },
@ -3116,7 +3111,6 @@
      "resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz",
      "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==",
      "license": "ISC",
      "peer": true,
      "engines": {
        "node": ">=12"
      }
@ -3259,7 +3253,8 @@
      "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.5.16.tgz",
      "integrity": "sha512-X7BJ2yElsnOJ30pZF4uIIDfBEVgF4XEBxL9Bxhy6dnrm5hkzqmsWHGTiHqRiITNhMyFLyAiWndIJP7Z1NTteDg==",
      "dev": true,
-      "license": "MIT"
+      "license": "MIT",
      "peer": true
    },
    "node_modules/enhanced-resolve": {
      "version": "5.21.0",
@ -3605,7 +3600,8 @@
      "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
      "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
      "dev": true,
-      "license": "MIT"
+      "license": "MIT",
      "peer": true
    },
    "node_modules/jsdom": {
      "version": "29.1.1",
@ -3613,7 +3609,6 @@
      "integrity": "sha512-ECi4Fi2f7BdJtUKTflYRTiaMxIB0O6zfR1fX0GXpUrf6flp8QIYn1UT20YQqdSOfk2dfkCwS8LAFoJDEppNK5Q==",
      "dev": true,
      "license": "MIT",
      "peer": true,
      "dependencies": {
        "@asamuzakjp/css-color": "^5.1.11",
        "@asamuzakjp/dom-selector": "^7.1.1",
@ -3936,6 +3931,7 @@
      "integrity": "sha512-h5bgJWpxJNswbU7qCrV0tIKQCaS3blPDrqKWx+QxzuzL1zGUzij9XCWLrSLsJPu5t+eWA/ycetzYAO5IOMcWAQ==",
      "dev": true,
      "license": "MIT",
      "peer": true,
      "bin": {
        "lz-string": "bin/bin.js"
      }
@ -5010,7 +5006,6 @@
      "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==",
      "dev": true,
      "license": "MIT",
      "peer": true,
      "engines": {
        "node": ">=12"
      },
@ -5098,6 +5093,7 @@
      "integrity": "sha512-Qb1gy5OrP5+zDf2Bvnzdl3jsTf1qXVMazbvCoKhtKqVs4/YK4ozX4gKQJJVyNe+cajNPn0KoC0MC3FUmaHWEmQ==",
      "dev": true,
      "license": "MIT",
      "peer": true,
      "dependencies": {
        "ansi-regex": "^5.0.1",
        "ansi-styles": "^5.0.0",
@ -5132,7 +5128,6 @@
      "resolved": "https://registry.npmjs.org/react/-/react-19.2.5.tgz",
      "integrity": "sha512-llUJLzz1zTUBrskt2pwZgLq59AemifIftw4aB7JxOqf1HY2FDaGDxgwpAPVzHU1kdWabH7FauP4i1oEeer2WCA==",
      "license": "MIT",
      "peer": true,
      "engines": {
        "node": ">=0.10.0"
      }
@ -5142,7 +5137,6 @@
      "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.2.5.tgz",
      "integrity": "sha512-J5bAZz+DXMMwW/wV3xzKke59Af6CHY7G4uYLN1OvBcKEsWOs4pQExj86BBKamxl/Ik5bx9whOrvBlSDfWzgSag==",
      "license": "MIT",
      "peer": true,
      "dependencies": {
        "scheduler": "^0.27.0"
      },
@ -5155,7 +5149,8 @@
      "resolved": "https://registry.npmjs.org/react-is/-/react-is-17.0.2.tgz",
      "integrity": "sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w==",
      "dev": true,
-      "license": "MIT"
+      "license": "MIT",
      "peer": true
    },
    "node_modules/react-markdown": {
      "version": "10.1.0",
@ -5603,8 +5598,7 @@
      "version": "4.2.4",
      "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.2.4.tgz",
      "integrity": "sha512-HhKppgO81FQof5m6TEnuBWCZGgfRAWbaeOaGT00KOy/Pf/j6oUihdvBpA7ltCeAvZpFhW3j0PTclkxsd4IXYDA==",
-      "license": "MIT",
+      "license": "MIT"
      "peer": true
    },
    "node_modules/tapable": {
      "version": "2.3.3",
@ -5946,7 +5940,6 @@
      "integrity": "sha512-rZuUu9j6J5uotLDs+cAA4O5H4K1SfPliUlQwqa6YEwSrWDZzP4rhm00oJR5snMewjxF5V/K3D4kctsUTsIU9Mw==",
      "dev": true,
      "license": "MIT",
      "peer": true,
      "dependencies": {
        "lightningcss": "^1.32.0",
        "picomatch": "^4.0.4",
@ -6040,7 +6033,6 @@
      "integrity": "sha512-9Xx1v3/ih3m9hN+SbfkUyy0JAs72ap3r7joc87XL6jwF0jGg6mFBvQ1SrwaX+h8BlkX6Hz9shdd1uo6AF+ZGpg==",
      "dev": true,
      "license": "MIT",
      "peer": true,
      "dependencies": {
        "@vitest/expect": "4.1.5",
        "@vitest/mocker": "4.1.5",
--- a/canvas/src/app/globals.css
+++ b/canvas/src/app/globals.css
@ -274,4 +274,17 @@ body {
  .react-flow__node {
    animation: none !important;
  }
  /* React Flow Controls toolbar buttons — WCAG 2.4.7 focus-visible */
  .react-flow__controls button:focus-visible {
    outline: 2px solid var(--accent, #3b5bdb);
    outline-offset: 2px;
  }
  /* React Flow Minimap nodes — WCAG 2.4.7 focus-visible */
  .react-flow__minimap:focus-visible,
  .react-flow__minimap svg:focus-visible {
    outline: 2px solid var(--accent, #3b5bdb);
    outline-offset: 2px;
  }
 }
--- a/canvas/src/app/layout.tsx
+++ b/canvas/src/app/layout.tsx
@ -1,6 +1,22 @@
 import type { Metadata } from "next";
 import { Inter, JetBrains_Mono } from "next/font/google";
 import { cookies, headers } from "next/headers";
 import "./globals.css";
 // Self-hosted at build time → CSP-safe (font-src 'self' covers them
 // because Next.js serves the .woff2 from /_next/static). Exposed as
 // CSS variables so the mobile palette can reference them without
 // importing this module.
 const interFont = Inter({
  subsets: ["latin"],
  display: "swap",
  variable: "--font-inter",
 });
 const monoFont = JetBrains_Mono({
  subsets: ["latin"],
  display: "swap",
  variable: "--font-jetbrains",
 });
 import { AuthGate } from "@/components/AuthGate";
 import { CookieConsent } from "@/components/CookieConsent";
 import { PurchaseSuccessModal } from "@/components/PurchaseSuccessModal";
@ -79,7 +95,7 @@ export default async function RootLayout({
          dangerouslySetInnerHTML={{ __html: themeBootScript }}
        />
      </head>
-      <body className="bg-surface text-ink">
+      <body className={`bg-surface text-ink ${interFont.variable} ${monoFont.variable}`}>
        <ThemeProvider initialTheme={theme}>
          {/* AuthGate is a client component; it checks the session on mount
              and bounces anonymous users to the control plane's login page
--- a/canvas/src/app/orgs/page.tsx
+++ b/canvas/src/app/orgs/page.tsx
@ -354,7 +354,7 @@ function OrgCTA({ org }: { org: Org }) {
    );
  }
  // provisioning / unknown — non-interactive
-  return <span className="text-sm text-ink-soft">{org.status}…</span>;
+  return <span className="text-sm text-ink-mid">{org.status}…</span>;
 }
 function EmptyState({ banner }: { banner?: React.ReactNode }) {
@ -420,7 +420,7 @@ function CreateOrgForm({ onCreated }: { onCreated: (slug: string) => void }) {
          aria-describedby="org-slug-hint"
          className="mt-1 w-full rounded border border-line bg-surface-card px-3 py-2 text-sm text-ink"
        />
-        <p id="org-slug-hint" className="mt-1 text-xs text-ink-soft">
+        <p id="org-slug-hint" className="mt-1 text-xs text-ink-mid">
          Lowercase letters, numbers, and hyphens only. Cannot be changed later.
        </p>
      </div>
--- a/canvas/src/app/page.tsx
+++ b/canvas/src/app/page.tsx
@ -4,6 +4,7 @@ import { useEffect, useState } from "react";
 import { Canvas } from "@/components/Canvas";
 import { Legend } from "@/components/Legend";
 import { CommunicationOverlay } from "@/components/CommunicationOverlay";
 import { MobileApp } from "@/components/mobile/MobileApp";
 import { Spinner } from "@/components/Spinner";
 import { connectSocket, disconnectSocket } from "@/store/socket";
 import { useCanvasStore } from "@/store/canvas";
@ -14,6 +15,23 @@ export default function Home() {
  const hydrationError = useCanvasStore((s) => s.hydrationError);
  const setHydrationError = useCanvasStore((s) => s.setHydrationError);
  const [hydrating, setHydrating] = useState(true);
  // < 640px viewport renders the dedicated mobile shell instead of the
  // desktop canvas. Tri-state: `null` until matchMedia has resolved,
  // then `true|false`. While null we keep the existing loading spinner
  // up — that way mobile devices never flash the desktop tree (which
  // they would if we defaulted to `false` and only flipped post-mount).
  const [isMobile, setIsMobile] = useState<boolean | null>(null);
  useEffect(() => {
    if (typeof window === "undefined" || !window.matchMedia) {
      setIsMobile(false);
      return;
    }
    const mq = window.matchMedia("(max-width: 639px)");
    const update = () => setIsMobile(mq.matches);
    update();
    mq.addEventListener("change", update);
    return () => mq.removeEventListener("change", update);
  }, []);
  // Distinct from hydrationError: platform-down is its own UX path
  // (different copy, different action — the user's next step is to
  // check local services, not to retry the API call). Tracked
@ -51,12 +69,15 @@ export default function Home() {
    };
  }, []);
-  if (hydrating) {
+  // Hold the spinner while data hydrates OR while the viewport
  // resolution hasn't settled yet (avoids a desktop-tree flash on
  // mobile devices between SSR-paint and matchMedia).
  if (hydrating || isMobile === null) {
    return (
      <div className="fixed inset-0 flex items-center justify-center bg-surface">
        <div role="status" aria-live="polite" className="flex flex-col items-center gap-3">
          <Spinner size="lg" />
-          <span className="text-xs text-ink-soft">Loading canvas...</span>
+          <span className="text-xs text-ink-mid">Loading canvas...</span>
        </div>
      </div>
    );
@ -66,6 +87,32 @@ export default function Home() {
    return <PlatformDownDiagnostic />;
  }
  if (isMobile) {
    return (
      <>
        <MobileApp />
        {hydrationError && (
          <div
            role="alert"
            data-testid="hydration-error"
            className="fixed inset-0 flex flex-col items-center justify-center bg-surface text-ink-mid gap-4 z-[9999] px-6"
          >
            <p className="text-ink-mid text-sm text-center">{hydrationError}</p>
            <button
              onClick={() => {
                setHydrationError(null);
                window.location.reload();
              }}
              className="px-4 py-2 bg-accent-strong hover:bg-accent text-white rounded-md text-sm"
            >
              Retry
            </button>
          </div>
        )}
      </>
    );
  }
  return (
    <>
      <Canvas />
@ -119,11 +166,11 @@ function PlatformDownDiagnostic() {
        Most common cause on a dev host: one of those services stopped.
      </p>
      <div className="bg-surface-sunken/80 border border-line/50 rounded-lg px-4 py-3 max-w-lg w-full">
-        <div className="text-[10px] uppercase tracking-wider text-ink-soft mb-2">Try first</div>
+        <div className="text-[10px] uppercase tracking-wider text-ink-mid mb-2">Try first</div>
        <pre className="text-[12px] text-ink-mid font-mono whitespace-pre-wrap leading-relaxed">{`brew services start postgresql@14
 brew services start redis`}</pre>
      </div>
-      <p className="text-[11px] text-ink-soft max-w-lg text-center">
+      <p className="text-[11px] text-ink-mid max-w-lg text-center">
        If both are running, check <code className="font-mono">/tmp/molecule-server.log</code> for
        the underlying error. If you&apos;re on hosted SaaS, this is a platform incident — try again in a moment.
      </p>
--- a/canvas/src/app/pricing/page.tsx
+++ b/canvas/src/app/pricing/page.tsx
@ -55,13 +55,13 @@ export default function PricingPage() {
          </a>
          .
        </p>
-        <p className="mt-6 text-sm text-ink-soft">
+        <p className="mt-6 text-sm text-ink-mid">
          Prices shown in USD. Flat-rate per org — no per-seat fees on any paid tier.
          Enterprise / self-hosted licensing available — contact us.
        </p>
      </section>
-      <footer className="mx-auto mt-20 max-w-5xl border-t border-line px-6 py-6 text-center text-sm text-ink-soft">
+      <footer className="mx-auto mt-20 max-w-5xl border-t border-line px-6 py-6 text-center text-sm text-ink-mid">
        <p>
          © {new Date().getFullYear()} Molecule AI, Inc. ·{" "}
          <a href="/legal/terms" className="hover:text-ink-mid">
--- a/canvas/src/components/AuditTrailPanel.tsx
+++ b/canvas/src/components/AuditTrailPanel.tsx
@ -127,7 +127,7 @@ export function AuditTrailPanel({ workspaceId }: Props) {
  if (loading) {
    return (
      <div className="flex items-center justify-center h-32">
-        <span className="text-xs text-ink-soft">Loading audit trail…</span>
+        <span className="text-xs text-ink-mid">Loading audit trail…</span>
      </div>
    );
  }
@ -142,10 +142,10 @@ export function AuditTrailPanel({ workspaceId }: Props) {
            key={f.id}
            onClick={() => setFilter(f.id)}
            aria-pressed={filter === f.id}
-            className={`px-2 py-1 text-[10px] rounded-md font-medium transition-all shrink-0 ${
+            className={`px-2 py-1 text-[10px] rounded-md font-medium transition-all shrink-0 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface ${
              filter === f.id
                ? "bg-surface-card text-ink ring-1 ring-zinc-600"
-                : "text-ink-soft hover:text-ink-mid hover:bg-surface-card/60"
+                : "text-ink-mid hover:text-ink-mid hover:bg-surface-card/60"
            }`}
          >
            {f.label}
@ -155,7 +155,7 @@ export function AuditTrailPanel({ workspaceId }: Props) {
        <button
          type="button"
          onClick={loadEntries}
-          className="px-2 py-1 text-[10px] bg-surface-card hover:bg-surface-card text-ink-mid rounded transition-colors shrink-0"
+          className="px-2 py-1 text-[10px] bg-surface-card hover:bg-surface-card text-ink-mid rounded transition-colors shrink-0 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
          aria-label="Refresh audit trail"
        >
          ↻
@ -174,9 +174,9 @@ export function AuditTrailPanel({ workspaceId }: Props) {
        {entries.length === 0 ? (
          /* Empty state */
          <div className="flex flex-col items-center justify-center py-16 gap-3 text-center">
-            <span className="text-4xl text-ink-soft" aria-hidden="true">⊟</span>
+            <span className="text-4xl text-ink-mid" aria-hidden="true">⊟</span>
            <p className="text-sm font-medium text-ink-mid">No audit events yet</p>
-            <p className="text-[11px] text-ink-soft max-w-[200px] leading-relaxed">
+            <p className="text-[11px] text-ink-mid max-w-[200px] leading-relaxed">
              Delegation, decision, gate, and human-in-the-loop events will appear here.
            </p>
          </div>
@ -195,7 +195,7 @@ export function AuditTrailPanel({ workspaceId }: Props) {
                  type="button"
                  onClick={loadMore}
                  disabled={loadingMore}
-                  className="px-4 py-2 text-[11px] bg-surface-card hover:bg-surface-card disabled:opacity-50 disabled:cursor-not-allowed text-ink-mid rounded-lg transition-colors"
+                  className="px-4 py-2 text-[11px] bg-surface-card hover:bg-surface-card disabled:opacity-50 disabled:cursor-not-allowed text-ink-mid rounded-lg transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
                >
                  {loadingMore ? "Loading…" : "Load more"}
                </button>
@ -203,7 +203,7 @@ export function AuditTrailPanel({ workspaceId }: Props) {
            )}
            {/* Entry count footer */}
-            <p className="mt-3 text-center text-[9px] text-ink-soft">
+            <p className="mt-3 text-center text-[9px] text-ink-mid">
              {entries.length} event{entries.length !== 1 ? "s" : ""} loaded
              {cursor ? " · more available" : " · all loaded"}
            </p>
@ -265,7 +265,7 @@ export function AuditEntryRow({ entry, now }: AuditEntryRowProps) {
        )}
        {/* Relative timestamp */}
-        <span className="shrink-0 text-[9px] text-ink-soft">
+        <span className="shrink-0 text-[9px] text-ink-mid">
          {formatAuditRelativeTime(entry.created_at, now)}
        </span>
      </div>
--- a/canvas/src/components/BundleDropZone.tsx
+++ b/canvas/src/components/BundleDropZone.tsx
@ -43,7 +43,9 @@ export function BundleDropZone() {
  const handleDragOver = useCallback((e: React.DragEvent) => {
    e.preventDefault();
    e.stopPropagation();
-    if (e.dataTransfer.types.includes("Files")) {
+    // Guard against jsdom (no File API / dataTransfer.types) and other
    // environments where dataTransfer may be null/undefined.
    if (e.dataTransfer?.types?.includes("Files")) {
      setIsDragging(true);
    }
  }, []);
@ -58,6 +60,7 @@ export function BundleDropZone() {
    e.preventDefault();
    e.stopPropagation();
    setIsDragging(false);
    if (!e.dataTransfer?.files?.length) return;
    const file = Array.from(e.dataTransfer.files).find(
      (f) => f.name.endsWith(".bundle.json")
    );
@ -125,7 +128,7 @@ export function BundleDropZone() {
          <div className="bg-surface-sunken/95 border border-accent/50 rounded-2xl px-8 py-6 shadow-2xl text-center">
            <div className="text-3xl mb-2" aria-hidden="true">📦</div>
            <div className="text-sm font-semibold text-ink">Drop Bundle to Import</div>
-            <div className="text-xs text-ink-soft mt-1">.bundle.json files only</div>
+            <div className="text-xs text-ink-mid mt-1">.bundle.json files only</div>
          </div>
        </div>
      )}
--- a/canvas/src/components/Canvas.tsx
+++ b/canvas/src/components/Canvas.tsx
@ -1,6 +1,6 @@
 "use client";
-import { useCallback, useMemo } from "react";
+import { useCallback, useEffect, useMemo, useRef } from "react";
 import {
  ReactFlow,
  ReactFlowProvider,
@ -187,6 +187,23 @@ function CanvasInner() {
  // Pan-to-node / zoom-to-team CustomEvent listeners + viewport save.
  const { onMoveEnd } = useCanvasViewport();
  // Screen-reader announcements — read liveAnnouncement from the store and
  // immediately clear it so the same announcement doesn't re-fire on
  // re-render. Using a ref avoids a setState loop while keeping the
  // effect reactive to new announcement strings.
  const liveAnnouncement = useCanvasStore((s) => s.liveAnnouncement);
  const clearAnnouncement = useCanvasStore((s) => s.setLiveAnnouncement);
  const prevAnnouncement = useRef("");
  useEffect(() => {
    if (liveAnnouncement && liveAnnouncement !== prevAnnouncement.current) {
      prevAnnouncement.current = liveAnnouncement;
      // Small delay so the DOM update lands before clearing, giving
      // screen readers time to pick up the new text.
      const timer = setTimeout(() => clearAnnouncement(""), 500);
      return () => clearTimeout(timer);
    }
  }, [liveAnnouncement, clearAnnouncement]);
  // Delete-confirmation lives in the store so the dialog survives ContextMenu
  // unmounting — the prior local-in-ContextMenu state raced with the menu's
  // outside-click handler.
@ -291,7 +308,9 @@ function CanvasInner() {
            showInteractive={false}
          />
          <MiniMap
-            className="!bg-surface-sunken/90 !border-line/50 !rounded-lg !shadow-xl !shadow-black/20"
+            // hidden < sm: minimap eats ~30% of a phone screen and
            // overlaps with the New Workspace FAB at bottom-right.
            className="!bg-surface-sunken/90 !border-line/50 !rounded-lg !shadow-xl !shadow-black/20 !hidden sm:!block"
            // Mask dims off-viewport areas; tint matches the surface so
            // the dimming doesn't show as a black bar in light mode.
            maskColor={resolvedTheme === "dark" ? "rgba(0, 0, 0, 0.7)" : "rgba(232, 226, 211, 0.7)"}
@ -326,11 +345,21 @@ function CanvasInner() {
          <DropTargetBadge />
        </ReactFlow>
-        {/* Screen-reader live region: announces workspace count on canvas load or change */}
+        {/* Screen-reader live region — announces workspace count on initial load and
-        <div role="status" aria-live="polite" className="sr-only">
+            live status updates from WebSocket events (online, offline, provisioning, etc.).
-          {nodes.filter((n) => !n.parentId).length === 0
+            The liveAnnouncement text is cleared after the screen reader has had time
            to read it so the same message doesn't re-announce on re-render. */}
        <div
          role="status"
          aria-live="polite"
          aria-atomic="true"
          className="sr-only"
        >
          {liveAnnouncement || (
            nodes.filter((n) => !n.parentId).length === 0
              ? "No workspaces on canvas"
-            : `${nodes.filter((n) => !n.parentId).length} workspace${nodes.filter((n) => !n.parentId).length !== 1 ? "s" : ""} on canvas`}
+              : `${nodes.filter((n) => !n.parentId).length} workspace${nodes.filter((n) => !n.parentId).length !== 1 ? "s" : ""} on canvas`
          )}
        </div>
        {nodes.length === 0 && <EmptyState />}
--- a/canvas/src/components/CommunicationOverlay.tsx
+++ b/canvas/src/components/CommunicationOverlay.tsx
@ -209,7 +209,7 @@ export function CommunicationOverlay() {
        type="button"
        onClick={() => setVisible(true)}
        aria-label="Show communications panel"
-        className="fixed top-16 right-4 z-30 px-3 py-1.5 bg-surface-sunken/90 border border-line/50 rounded-lg text-[10px] text-ink-mid hover:text-ink transition-colors"
+        className="fixed top-16 right-4 z-30 px-3 py-1.5 bg-surface-sunken/90 border border-line/50 rounded-lg text-[10px] text-ink-mid hover:text-ink transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
      >
        <span aria-hidden="true">↗↙ </span>{comms.length > 0 ? `${comms.length} comms` : "Communications"}
      </button>
@ -226,7 +226,7 @@ export function CommunicationOverlay() {
          type="button"
          onClick={() => setVisible(false)}
          aria-label="Close communications panel"
-          className="text-ink-soft hover:text-ink-mid text-xs"
+          className="text-ink-mid hover:text-ink-mid text-xs focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
        >
          <span aria-hidden="true">✕</span>
        </button>
@ -268,7 +268,7 @@ export function CommunicationOverlay() {
                </div>
              </div>
              {c.summary && (
-                <div className="text-ink-soft truncate mt-0.5 pl-4">{c.summary}</div>
+                <div className="text-ink-mid truncate mt-0.5 pl-4">{c.summary}</div>
              )}
              {c.durationMs && (
                <div className="text-ink-mid pl-4">{c.durationMs}ms</div>
--- a/canvas/src/components/ConfirmDialog.tsx
+++ b/canvas/src/components/ConfirmDialog.tsx
@ -105,8 +105,12 @@ export function ConfirmDialog({
  // (e.g. parents with transform, filter, will-change that break position:fixed).
  return createPortal(
    <div className="fixed inset-0 z-[9999] flex items-center justify-center">
-      {/* Backdrop */}
+      {/* Backdrop — interactive dismiss area; accessible name for screen readers (WCAG 4.1.2) */}
-      <div className="absolute inset-0 bg-black/60 backdrop-blur-sm" onClick={onCancel} />
+      <div
        className="absolute inset-0 bg-black/60 backdrop-blur-sm cursor-pointer"
        aria-label="Dismiss dialog"
        onClick={onCancel}
      />
      {/* Dialog — role="dialog" + aria-modal prevent interaction with background */}
      <div
--- a/canvas/src/components/ConsoleModal.tsx
+++ b/canvas/src/components/ConsoleModal.tsx
@ -90,7 +90,11 @@ export function ConsoleModal({ workspaceId, workspaceName, open, onClose }: Prop
  return createPortal(
    <div className="fixed inset-0 z-[9999] flex items-center justify-center">
-      <div aria-hidden="true" className="absolute inset-0 bg-black/70 backdrop-blur-sm" onClick={onClose} />
+      <div
        className="absolute inset-0 bg-black/70 backdrop-blur-sm cursor-pointer"
        onClick={onClose}
        aria-label="Close terminal"
      />
      <div
        role="dialog"
        aria-modal="true"
@ -103,7 +107,7 @@ export function ConsoleModal({ workspaceId, workspaceName, open, onClose }: Prop
              EC2 console output
            </h3>
            {workspaceName && (
-              <div className="text-[11px] text-ink-soft mt-0.5 truncate max-w-[600px]">
+              <div className="text-[11px] text-ink-mid mt-0.5 truncate max-w-[600px]">
                {workspaceName}
              </div>
            )}
@ -124,7 +128,7 @@ export function ConsoleModal({ workspaceId, workspaceName, open, onClose }: Prop
        <div className="flex-1 overflow-auto bg-black/80 p-4">
          {loading && (
-            <div className="text-[12px] text-ink-soft" data-testid="console-loading">
+            <div className="text-[12px] text-ink-mid" data-testid="console-loading">
              Loading console output…
            </div>
          )}
@ -165,7 +169,7 @@ export function ConsoleModal({ workspaceId, workspaceName, open, onClose }: Prop
                  showToast("Copy requires HTTPS — please select and copy manually", "info");
                }
              }}
-              className="px-3 py-1.5 text-[11px] text-ink-mid hover:text-ink bg-surface-card hover:bg-surface-elevated border border-line hover:border-line-soft rounded-lg transition-colors focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/60 focus-visible:ring-offset-2 focus-visible:ring-offset-surface"
+              className="px-3 py-1.5 text-[11px] text-ink-mid hover:text-ink bg-surface-card hover:bg-surface-elevated border border-line hover:border-line-soft rounded-lg transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
            >
              Copy
            </button>
--- a/canvas/src/components/ContextMenu.tsx
+++ b/canvas/src/components/ContextMenu.tsx
@ -311,7 +311,7 @@ export function ContextMenu() {
            aria-hidden="true"
            className={`w-1.5 h-1.5 rounded-full ${statusDotClass(contextMenu.nodeData.status)}`}
          />
-          <span className="text-[10px] text-ink-soft">{contextMenu.nodeData.status}</span>
+          <span className="text-[10px] text-ink-mid">{contextMenu.nodeData.status}</span>
        </div>
      </div>
--- a/canvas/src/components/ConversationTraceModal.tsx
+++ b/canvas/src/components/ConversationTraceModal.tsx
@ -13,7 +13,8 @@ interface Props {
  onClose: () => void;
 }
-function extractMessageText(body: Record<string, unknown> | null): string {
+/** Exported for unit testing — see ConversationTraceModal.test.ts */
 export function extractMessageText(body: Record<string, unknown> | null): string {
  if (!body) return "";
  try {
    // Simple task format from MCP server: {task: "..."}
@ -106,7 +107,7 @@ export function ConversationTraceModal({ open, workspaceId: _workspaceId, onClos
                <Dialog.Title className="text-sm font-semibold text-ink">
                  Conversation Trace
                </Dialog.Title>
-                <p className="text-[10px] text-ink-soft mt-0.5">
+                <p className="text-[10px] text-ink-mid mt-0.5">
                  {entries.length} events across all workspaces
                </p>
              </div>
@ -114,7 +115,7 @@ export function ConversationTraceModal({ open, workspaceId: _workspaceId, onClos
                <button
                  type="button"
                  aria-label="Close conversation trace"
-                  className="text-ink-soft hover:text-ink-mid text-lg px-2"
+                  className="text-ink-mid hover:text-ink-mid text-lg px-2 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
                >
                  ✕
                </button>
@ -124,13 +125,13 @@ export function ConversationTraceModal({ open, workspaceId: _workspaceId, onClos
            {/* Timeline */}
            <div className="flex-1 overflow-y-auto px-5 py-4">
              {loading && (
-                <div className="text-xs text-ink-soft text-center py-8">
+                <div className="text-xs text-ink-mid text-center py-8">
                  Loading trace from all workspaces...
                </div>
              )}
              {!loading && entries.length === 0 && (
-                <div className="text-xs text-ink-soft text-center py-8">
+                <div className="text-xs text-ink-mid text-center py-8">
                  No activity found
                </div>
              )}
@ -250,7 +251,7 @@ export function ConversationTraceModal({ open, workspaceId: _workspaceId, onClos
                          {/* Message content — show request and/or response */}
                          {requestText && (
                            <div className="mt-1.5 bg-surface/60 border border-line/50 rounded-lg px-3 py-2 max-h-32 overflow-y-auto">
-                              <div className="text-[8px] text-ink-soft uppercase mb-1">
+                              <div className="text-[8px] text-ink-mid uppercase mb-1">
                                {isSend ? "Task" : "Request"}
                              </div>
                              <div className="text-[10px] text-ink-mid whitespace-pre-wrap break-words leading-relaxed">
@ -285,7 +286,7 @@ export function ConversationTraceModal({ open, workspaceId: _workspaceId, onClos
              <Dialog.Close asChild>
                <button
                  type="button"
-                  className="px-4 py-1.5 text-[12px] bg-surface-card hover:bg-surface-card text-ink-mid rounded-lg transition-colors"
+                  className="px-4 py-1.5 text-[12px] bg-surface-card hover:bg-surface-card text-ink-mid rounded-lg transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
                >
                  Close
                </button>
--- a/canvas/src/components/CreateWorkspaceDialog.tsx
+++ b/canvas/src/components/CreateWorkspaceDialog.tsx
@ -338,7 +338,7 @@ export function CreateWorkspaceButton() {
          <Dialog.Title className="text-base font-semibold text-ink mb-1">
            Create Workspace
          </Dialog.Title>
-          <p className="text-xs text-ink-soft mb-5">
+          <p className="text-xs text-ink-mid mb-5">
            Add a new workspace node to the canvas
          </p>
@ -376,7 +376,7 @@ export function CreateWorkspaceButton() {
              />
              <div className="text-xs">
                <div className="text-ink font-medium">External agent (bring your own compute)</div>
-                <div className="text-ink-soft mt-0.5">
+                <div className="text-ink-mid mt-0.5">
                  Skip the container. We&apos;ll return a workspace_id + auth token + ready-to-paste snippet so an agent running on your laptop / server / CI can register via A2A.
                </div>
              </div>
@ -411,7 +411,7 @@ export function CreateWorkspaceButton() {
                    tabIndex={tier === t.value ? 0 : -1}
                    onClick={() => setTier(t.value)}
                    onKeyDown={(e) => handleRadioKeyDown(e, idx)}
-                    className={`py-2 rounded-lg text-center transition-colors ${
+                    className={`py-2 rounded-lg text-center transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 ${
                      tier === t.value
                        ? "bg-accent-strong/20 border border-accent/50 text-accent"
                        : "bg-surface-card/60 border border-line/40 text-ink-mid hover:text-ink-mid hover:border-line"
@ -456,7 +456,7 @@ export function CreateWorkspaceButton() {
              <p className="text-[11px] font-semibold text-violet-400 uppercase tracking-wide">
                Hermes Provider
              </p>
-              <p className="text-[11px] text-ink-soft -mt-1">
+              <p className="text-[11px] text-ink-mid -mt-1">
                Choose the AI provider and paste your API key. The key is
                stored as an encrypted workspace secret.
              </p>
@ -534,7 +534,7 @@ export function CreateWorkspaceButton() {
                    (m) => <option key={m} value={m} />,
                  )}
                </datalist>
-                <p className="text-[10px] text-ink-soft mt-1">
+                <p className="text-[10px] text-ink-mid mt-1">
                  Slug determines which provider hermes routes to at install time.
                </p>
              </div>
@ -626,7 +626,7 @@ function InputField({
        className={`w-full bg-surface-card/60 border border-line/50 rounded-lg px-3 py-2 text-sm text-ink placeholder-ink-soft focus:outline-none focus:border-accent/60 focus:ring-1 focus:ring-accent/20 transition-colors ${mono ? "font-mono text-xs" : ""}`}
      />
      {helper && (
-        <p className="mt-1 text-xs text-ink-soft">{helper}</p>
+        <p className="mt-1 text-xs text-ink-mid">{helper}</p>
      )}
    </div>
  );
--- a/canvas/src/components/DeleteCascadeConfirmDialog.tsx
+++ b/canvas/src/components/DeleteCascadeConfirmDialog.tsx
@ -81,7 +81,11 @@ export function DeleteCascadeConfirmDialog({
  return createPortal(
    <div className="fixed inset-0 z-[9999] flex items-center justify-center">
      {/* Backdrop */}
-      <div aria-hidden="true" className="absolute inset-0 bg-black/60 backdrop-blur-sm" onClick={onCancel} />
+      <div
        className="absolute inset-0 bg-black/60 backdrop-blur-sm cursor-pointer"
        onClick={onCancel}
        aria-label="Dismiss dialog"
      />
      {/* Dialog */}
      <div
--- a/canvas/src/components/EmptyState.tsx
+++ b/canvas/src/components/EmptyState.tsx
@ -129,11 +129,11 @@ export function EmptyState() {
                      T{t.tier}
                    </span>
                  </div>
-                  <p className="text-[11px] text-ink-soft line-clamp-2 leading-relaxed">
+                  <p className="text-[11px] text-ink-mid line-clamp-2 leading-relaxed">
                    {t.description || "No description"}
                  </p>
                  {t.skill_count > 0 && (
-                    <p className="text-[9px] text-ink-soft mt-1.5">
+                    <p className="text-[9px] text-ink-mid mt-1.5">
                      {t.skill_count} skill{t.skill_count !== 1 ? "s" : ""}
                      {t.model ? ` · ${t.model}` : ""}
                    </p>
@ -174,10 +174,10 @@ export function EmptyState() {
        <div className="mt-5 pt-4 border-t border-line/50">
          <div className="flex items-center justify-center gap-6 text-[10px] text-ink-mid">
            <span>Drag to nest workspaces into teams</span>
-            <span className="text-ink-soft">|</span>
+            <span className="text-ink-mid">|</span>
            <span>Right-click for actions</span>
-            <span className="text-ink-soft">|</span>
+            <span className="text-ink-mid">|</span>
-            <span>Press <kbd className="px-1 py-0.5 bg-surface-card rounded text-ink-soft font-mono">&#8984;K</kbd> to search</span>
+            <span>Press <kbd className="px-1 py-0.5 bg-surface-card rounded text-ink-mid font-mono">&#8984;K</kbd> to search</span>
          </div>
        </div>
      </div>
--- a/canvas/src/components/ErrorBoundary.tsx
+++ b/canvas/src/components/ErrorBoundary.tsx
@ -83,7 +83,7 @@ export class ErrorBoundary extends React.Component<
              <button
                type="button"
                onClick={this.handleReload}
-                className="rounded-lg bg-accent-strong hover:bg-accent px-5 py-2 text-sm font-medium text-white transition-colors"
+                className="rounded-lg bg-accent-strong hover:bg-accent px-5 py-2 text-sm font-medium text-white transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-2 focus-visible:ring-offset-surface"
              >
                Reload
              </button>
@ -93,7 +93,7 @@ export class ErrorBoundary extends React.Component<
                  e.preventDefault();
                  this.handleReport();
                }}
-                className="rounded-lg border border-line hover:border-line px-5 py-2 text-sm font-medium text-ink-mid hover:text-ink transition-colors"
+                className="rounded-lg border border-line hover:border-line px-5 py-2 text-sm font-medium text-ink-mid hover:text-ink transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-2 focus-visible:ring-offset-surface"
              >
                Report
              </a>
--- a/canvas/src/components/ExternalConnectModal.tsx
+++ b/canvas/src/components/ExternalConnectModal.tsx
@ -198,10 +198,10 @@ export function ExternalConnectModal({ info, onClose }: Props) {
                role="tab"
                aria-selected={tab === t}
                onClick={() => setTab(t)}
-                className={`px-3 py-2 text-sm border-b-2 -mb-px transition-colors ${
+                className={`px-3 py-2 text-sm border-b-2 -mb-px transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface ${
                  tab === t
                    ? "border-accent text-ink"
-                    : "border-transparent text-ink-soft hover:text-ink-mid"
+                    : "border-transparent text-ink-mid hover:text-ink-mid"
                }`}
              >
                {t === "claude"
@ -309,7 +309,7 @@ export function ExternalConnectModal({ info, onClose }: Props) {
            <button
              type="button"
              onClick={onClose}
-              className="px-4 py-2 text-sm rounded-lg bg-surface-card hover:bg-surface-card text-ink"
+              className="px-4 py-2 text-sm rounded-lg bg-surface-card hover:bg-surface-card text-ink focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
            >
              I&apos;ve saved it — close
            </button>
@ -335,11 +335,11 @@ function SnippetBlock({
  return (
    <div>
      <div className="flex items-center justify-between pb-1">
-        <span className="text-xs text-ink-soft">{label}</span>
+        <span className="text-xs text-ink-mid">{label}</span>
        <button
          type="button"
          onClick={onCopy}
-          className="text-xs px-2 py-1 rounded bg-accent-strong/80 hover:bg-accent text-white"
+          className="text-xs px-2 py-1 rounded bg-accent-strong/80 hover:bg-accent text-white focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
        >
          {copied ? "Copied!" : "Copy"}
        </button>
@ -366,7 +366,7 @@ function Field({
 }) {
  return (
    <div className="flex items-center gap-2">
-      <span className="text-xs text-ink-soft w-36 shrink-0">{label}</span>
+      <span className="text-xs text-ink-mid w-36 shrink-0">{label}</span>
      <code
        className={`flex-1 text-xs bg-surface border border-line rounded px-2 py-1 text-ink break-all ${mono ? "font-mono" : ""}`}
      >
@ -376,7 +376,7 @@ function Field({
        type="button"
        onClick={onCopy}
        disabled={!value}
-        className="text-xs px-2 py-1 rounded bg-surface-card hover:bg-surface-card text-ink disabled:opacity-40"
+        className="text-xs px-2 py-1 rounded bg-surface-card hover:bg-surface-card text-ink disabled:opacity-40 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
      >
        {copied ? "Copied!" : "Copy"}
      </button>
--- a/canvas/src/components/KeyboardShortcutsDialog.tsx
+++ b/canvas/src/components/KeyboardShortcutsDialog.tsx
@ -0,0 +1,236 @@
 "use client";
 import { useEffect, useRef, useState } from "react";
 import { createPortal } from "react-dom";
 interface ShortcutGroup {
  title: string;
  shortcuts: Array<{ keys: string[]; description: string }>;
 }
 const SHORTCUT_GROUPS: ShortcutGroup[] = [
  {
    title: "Canvas",
    shortcuts: [
      {
        keys: ["Esc"],
        description: "Close context menu, clear selection, or deselect",
      },
      {
        keys: ["↑↓←→"],
        description: "Nudge selected node 10px; hold Shift for 50px",
      },
      {
        keys: ["Cmd", "↑↓←→"],
        description: "Resize selected node (↑↓ height, ←→ width); hold Shift for fine control (2px)",
      },
      {
        keys: ["Enter"],
        description: "Descend into selected node's first child",
      },
      {
        keys: ["Shift", "Enter"],
        description: "Ascend to selected node's parent",
      },
      {
        keys: ["Cmd", "]"],
        description: "Bring selected node forward in z-order",
      },
      {
        keys: ["Cmd", "["],
        description: "Send selected node backward in z-order",
      },
      {
        keys: ["Z"],
        description: "Zoom to fit the selected team and its sub-workspaces",
      },
    ],
  },
  {
    title: "Navigation",
    shortcuts: [
      {
        keys: ["⌘K"],
        description: "Open workspace search",
      },
      {
        keys: ["Palette"],
        description: "Open the template palette to deploy a new workspace",
      },
      {
        keys: ["Dbl-click"],
        description: "Zoom canvas to fit a team node and all its sub-workspaces",
      },
      {
        keys: ["Right-click"],
        description: "Open the workspace context menu",
      },
    ],
  },
  {
    title: "Agent",
    shortcuts: [
      {
        keys: ["Chat"],
        description: "Send a message or resume a running task",
      },
      {
        keys: ["Config"],
        description: "Edit skills, model, secrets, and runtime settings",
      },
      {
        keys: ["Audit"],
        description: "View the activity ledger for the selected workspace",
      },
    ],
  },
 ];
 interface Props {
  open: boolean;
  onClose: () => void;
 }
 export function KeyboardShortcutsDialog({ open, onClose }: Props) {
  const dialogRef = useRef<HTMLDivElement>(null);
  const [mounted, setMounted] = useState(false);
  useEffect(() => {
    setMounted(true);
  }, []);
  // Move focus into the dialog when it opens (WCAG 2.1 SC 2.4.3)
  useEffect(() => {
    if (!open || !mounted) return;
    const raf = requestAnimationFrame(() => {
      dialogRef.current?.querySelector<HTMLElement>("button")?.focus();
    });
    return () => cancelAnimationFrame(raf);
  }, [open, mounted]);
  // Keyboard: Escape closes, Tab is trapped
  useEffect(() => {
    if (!open) return;
    const handler = (e: KeyboardEvent) => {
      if (e.key === "Escape") {
        onClose();
        return;
      }
      if (e.key === "Tab" && dialogRef.current) {
        const focusable = Array.from(
          dialogRef.current.querySelectorAll<HTMLElement>(
            'button, [href], input, select, textarea, [tabindex]:not([tabindex="-1"])'
          )
        ).filter((el) => !el.hasAttribute("disabled"));
        if (focusable.length === 0) {
          e.preventDefault();
          return;
        }
        const first = focusable[0];
        const last = focusable[focusable.length - 1];
        if (e.shiftKey) {
          if (document.activeElement === first) {
            e.preventDefault();
            last.focus();
          }
        } else {
          if (document.activeElement === last) {
            e.preventDefault();
            first.focus();
          }
        }
      }
    };
    window.addEventListener("keydown", handler);
    return () => window.removeEventListener("keydown", handler);
  }, [open, onClose]);
  if (!open || !mounted) return null;
  return createPortal(
    <div className="fixed inset-0 z-[9999] flex items-center justify-center">
      {/* Backdrop */}
      <div
        className="absolute inset-0 bg-black/60 backdrop-blur-sm cursor-pointer"
        onClick={onClose}
        aria-label="Close keyboard shortcuts dialog"
      />
      {/* Dialog */}
      <div
        ref={dialogRef}
        role="dialog"
        aria-modal="true"
        aria-labelledby="keyboard-shortcuts-title"
        className="relative bg-surface border border-line rounded-xl shadow-2xl shadow-black/60 max-w-[480px] w-full mx-4 overflow-hidden max-h-[80vh] flex flex-col"
      >
        {/* Header */}
        <div className="flex items-center justify-between px-5 py-4 border-b border-line shrink-0">
          <h2
            id="keyboard-shortcuts-title"
            className="text-sm font-semibold text-ink"
          >
            Keyboard Shortcuts
          </h2>
          <button
            type="button"
            onClick={onClose}
            aria-label="Close keyboard shortcuts"
            className="w-7 h-7 flex items-center justify-center rounded-lg text-ink-mid hover:text-ink hover:bg-surface-sunken transition-colors focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/40"
          >
            ×
          </button>
        </div>
        {/* Content */}
        <div className="overflow-y-auto p-5 space-y-5">
          {SHORTCUT_GROUPS.map((group) => (
            <div key={group.title}>
              <h3 className="text-[10px] font-semibold uppercase tracking-[0.2em] text-ink-mid mb-2.5">
                {group.title}
              </h3>
              <div className="space-y-2">
                {group.shortcuts.map((shortcut, i) => (
                  <div
                    key={i}
                    className="flex items-center justify-between gap-4"
                  >
                    <span className="text-[13px] text-ink-mid">
                      {shortcut.description}
                    </span>
                    <kbd className="flex items-center gap-0.5 shrink-0">
                      {shortcut.keys.map((k, j) => (
                        <span key={j} className="flex items-center gap-0.5">
                          {j > 0 && (
                            <span className="text-[9px] text-ink-mid mx-0.5">
                              +
                            </span>
                          )}
                          <span className="inline-flex items-center rounded-md border border-line/70 bg-surface-sunken/70 px-2 py-0.5 text-[11px] font-medium text-ink tabular-nums font-mono">
                            {k}
                          </span>
                        </span>
                      ))}
                    </kbd>
                  </div>
                ))}
              </div>
            </div>
          ))}
        </div>
        {/* Footer */}
        <div className="px-5 py-3 border-t border-line bg-surface-sunken/30 shrink-0">
          <p className="text-[10px] text-ink-mid text-center">
            Press{" "}
            <kbd className="inline-flex items-center rounded border border-line/70 bg-surface-sunken/70 px-1.5 py-0.5 text-[10px] font-medium text-ink font-mono">
              Esc
            </kbd>{" "}
            to close
          </p>
        </div>
      </div>
    </div>,
    document.body
  );
 }
--- a/canvas/src/components/Legend.tsx
+++ b/canvas/src/components/Legend.tsx
@ -77,7 +77,7 @@ export function Legend() {
        onClick={openLegend}
        aria-label="Show legend"
        title="Show legend"
-        className={`fixed bottom-6 ${leftClass} z-30 flex items-center gap-1.5 rounded-full bg-surface-sunken/95 border border-line/50 px-3 py-1.5 text-[11px] font-semibold text-ink-mid uppercase tracking-wider shadow-xl shadow-black/30 backdrop-blur-sm hover:text-ink hover:border-line focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/60 focus-visible:ring-offset-2 focus-visible:ring-offset-surface transition-[left,colors] duration-200`}
+        className={`fixed bottom-6 ${leftClass} z-30 flex items-center gap-1.5 rounded-full bg-surface-sunken/95 border border-line/50 px-3 py-1.5 text-[11px] font-semibold text-ink-mid uppercase tracking-wider shadow-xl shadow-black/30 backdrop-blur-sm hover:text-ink hover:border-line focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-2 focus-visible:ring-offset-surface transition-[left,colors] duration-200`}
      >
        <span aria-hidden="true" className="text-[10px]">ⓘ</span>
        Legend
@ -86,7 +86,10 @@ export function Legend() {
  }
  return (
-    <div className={`fixed bottom-6 ${leftClass} z-30 bg-surface-sunken/95 border border-line/50 rounded-xl px-4 py-3 shadow-xl shadow-black/30 backdrop-blur-sm max-w-[280px] transition-[left] duration-200`}>
+    <div
      data-testid="legend-panel"
      className={`fixed bottom-6 ${leftClass} z-30 bg-surface-sunken/95 border border-line/50 rounded-xl px-4 py-3 shadow-xl shadow-black/30 backdrop-blur-sm max-w-[280px] transition-[left] duration-200`}
    >
      <div className="flex items-start justify-between mb-2">
        <div className="text-[11px] font-semibold text-ink-mid uppercase tracking-wider">Legend</div>
        <button
@ -97,7 +100,7 @@ export function Legend() {
          // 24×24 touch target (was ~10×16, well under WCAG 2.5.5 min).
          // Negative margin keeps the visual position the same as before
          // — only the hit area + focus ring are larger.
-          className="-mt-1.5 -mr-1.5 w-6 h-6 inline-flex items-center justify-center rounded text-[14px] leading-none text-ink-soft hover:text-ink hover:bg-surface-card/40 focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/60 transition-colors"
+          className="-mt-1.5 -mr-1.5 w-6 h-6 inline-flex items-center justify-center rounded text-[14px] leading-none text-ink-mid hover:text-ink hover:bg-surface-card/40 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 transition-colors"
        >
          ×
        </button>
@ -105,7 +108,7 @@ export function Legend() {
      {/* Status */}
      <div className="mb-2">
-        <div className="text-[11px] text-ink-soft font-medium mb-1">Status</div>
+        <div className="text-[11px] text-ink-mid font-medium mb-1">Status</div>
        <div className="flex flex-wrap gap-x-3 gap-y-1">
          {LEGEND_STATUSES.map((s) => (
            <StatusItem key={s} color={STATUS_CONFIG[s].dot} label={STATUS_CONFIG[s].label} />
@ -115,7 +118,7 @@ export function Legend() {
      {/* Tiers */}
      <div className="mb-2">
-        <div className="text-[11px] text-ink-soft font-medium mb-1">Tier</div>
+        <div className="text-[11px] text-ink-mid font-medium mb-1">Tier</div>
        <div className="flex flex-wrap gap-x-3 gap-y-1">
          {LEGEND_TIERS.map(({ tier, label }) => (
            <TierItem key={tier} tier={tier} label={label} color={TIER_CONFIG[tier].border} />
@ -125,7 +128,7 @@ export function Legend() {
      {/* Communication */}
      <div>
-        <div className="text-[11px] text-ink-soft font-medium mb-1">Communication</div>
+        <div className="text-[11px] text-ink-mid font-medium mb-1">Communication</div>
        <div className="flex flex-wrap gap-x-3 gap-y-1">
          <CommItem icon="↗" color="text-cyan-400" label="A2A Out" />
          <CommItem icon="↙" color="text-accent" label="A2A In" />
--- a/canvas/src/components/MemoryInspectorPanel.tsx
+++ b/canvas/src/components/MemoryInspectorPanel.tsx
@ -288,7 +288,7 @@ export function MemoryInspectorPanel({ workspaceId }: Props) {
  if (loading && entries.length === 0 && !error && !pluginUnavailable) {
    return (
      <div className="flex items-center justify-center h-32">
-        <span className="text-xs text-ink-soft">Loading memories…</span>
+        <span className="text-xs text-ink-mid">Loading memories…</span>
      </div>
    );
  }
@ -311,7 +311,7 @@ export function MemoryInspectorPanel({ workspaceId }: Props) {
      {/* Namespace dropdown */}
      <div className="px-4 pt-3 pb-2 border-b border-line/40 shrink-0 space-y-2">
        <div className="flex items-center gap-2">
-          <label htmlFor="namespace-dropdown" className="text-[10px] text-ink-soft shrink-0">
+          <label htmlFor="namespace-dropdown" className="text-[10px] text-ink-mid shrink-0">
            Namespace:
          </label>
          <select
@ -337,7 +337,7 @@ export function MemoryInspectorPanel({ workspaceId }: Props) {
            height="12"
            viewBox="0 0 16 16"
            fill="none"
-            className="absolute left-2.5 text-ink-soft pointer-events-none shrink-0"
+            className="absolute left-2.5 text-ink-mid pointer-events-none shrink-0"
            aria-hidden="true"
          >
            <circle cx="7" cy="7" r="4.5" stroke="currentColor" strokeWidth="1.5" />
@ -360,7 +360,7 @@ export function MemoryInspectorPanel({ workspaceId }: Props) {
                setDebouncedQuery('');
              }}
              aria-label="Clear search"
-              className="absolute right-2 text-ink-soft hover:text-ink transition-colors text-sm leading-none"
+              className="absolute right-2 text-ink-mid hover:text-ink transition-colors text-sm leading-none focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
            >
              ×
            </button>
@ -370,7 +370,7 @@ export function MemoryInspectorPanel({ workspaceId }: Props) {
      {/* Toolbar */}
      <div className="px-4 py-2.5 border-b border-line/40 flex items-center justify-between shrink-0">
-        <span className="text-[11px] text-ink-soft">
+        <span className="text-[11px] text-ink-mid">
          {debouncedQuery
            ? `${entries.length} result${entries.length !== 1 ? 's' : ''}`
            : entries.length === 1
@ -381,7 +381,7 @@ export function MemoryInspectorPanel({ workspaceId }: Props) {
          type="button"
          onClick={loadEntries}
          disabled={pluginUnavailable}
-          className="px-2 py-1 text-[11px] bg-surface-card hover:bg-surface-card text-ink-mid rounded transition-colors disabled:opacity-50 disabled:cursor-not-allowed"
+          className="px-2 py-1 text-[11px] bg-surface-card hover:bg-surface-card text-ink-mid rounded transition-colors disabled:opacity-50 disabled:cursor-not-allowed focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
          aria-label="Refresh memories"
        >
          ↻ Refresh
@ -446,11 +446,11 @@ function EmptyState({
    // mirror it so the operator sees both signals.
    return (
      <div className="flex flex-col items-center justify-center py-16 gap-3 text-center">
-        <span className="text-4xl text-ink-soft" aria-hidden="true">
+        <span className="text-4xl text-ink-mid" aria-hidden="true">
          ◇
        </span>
        <p className="text-sm font-medium text-ink-mid">Memory plugin disabled</p>
-        <p className="text-[11px] text-ink-soft max-w-[220px] leading-relaxed">
+        <p className="text-[11px] text-ink-mid max-w-[220px] leading-relaxed">
          See banner above for the operator-side fix.
        </p>
      </div>
@ -459,11 +459,11 @@ function EmptyState({
  if (query) {
    return (
      <div className="flex flex-col items-center justify-center py-16 gap-3 text-center">
-        <span className="text-4xl text-ink-soft" aria-hidden="true">
+        <span className="text-4xl text-ink-mid" aria-hidden="true">
          ◇
        </span>
        <p className="text-sm font-medium text-ink-mid">No memories match your search</p>
-        <p className="text-[11px] text-ink-soft max-w-[200px] leading-relaxed">
+        <p className="text-[11px] text-ink-mid max-w-[200px] leading-relaxed">
          Try a different query or clear the search.
        </p>
      </div>
@ -471,11 +471,11 @@ function EmptyState({
  }
  return (
    <div className="flex flex-col items-center justify-center py-16 gap-3 text-center">
-      <span className="text-4xl text-ink-soft" aria-hidden="true">
+      <span className="text-4xl text-ink-mid" aria-hidden="true">
        ◇
      </span>
      <p className="text-sm font-medium text-ink-mid">No memories yet</p>
-      <p className="text-[11px] text-ink-soft max-w-[220px] leading-relaxed">
+      <p className="text-[11px] text-ink-mid max-w-[220px] leading-relaxed">
        Agents commit memories via MCP tools (commit_memory, commit_summary). They
        appear here once written.
      </p>
@ -515,7 +515,7 @@ function MemoryEntryRow({ entry, onDelete }: MemoryEntryRowProps) {
      {/* Header row */}
      <button
        type="button"
-        className="w-full flex items-center gap-2 px-3 py-2.5 text-left hover:bg-surface-card/30 transition-colors"
+        className="w-full flex items-center gap-2 px-3 py-2.5 text-left hover:bg-surface-card/30 transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
        onClick={() => setExpanded((prev) => !prev)}
        aria-expanded={expanded}
        aria-controls={bodyId}
@ -558,7 +558,7 @@ function MemoryEntryRow({ entry, onDelete }: MemoryEntryRowProps) {
        {/* Namespace tag */}
        <span
-          className="text-[9px] shrink-0 font-mono text-ink-soft truncate max-w-[100px]"
+          className="text-[9px] shrink-0 font-mono text-ink-mid truncate max-w-[100px]"
          title={entry.namespace}
        >
          {entry.namespace}
@ -598,10 +598,10 @@ function MemoryEntryRow({ entry, onDelete }: MemoryEntryRowProps) {
        )}
-        <span className="text-[9px] text-ink-soft shrink-0">
+        <span className="text-[9px] text-ink-mid shrink-0">
          {formatRelativeTime(entry.created_at)}
        </span>
-        <span className="text-[9px] text-ink-soft shrink-0" aria-hidden="true">
+        <span className="text-[9px] text-ink-mid shrink-0" aria-hidden="true">
          {expanded ? '▼' : '▶'}
        </span>
      </button>
@ -618,7 +618,7 @@ function MemoryEntryRow({ entry, onDelete }: MemoryEntryRowProps) {
            {entry.content}
          </pre>
          <div className="flex items-center justify-between gap-2">
-            <span className="text-[9px] text-ink-soft">
+            <span className="text-[9px] text-ink-mid">
              Created: {new Date(entry.created_at).toLocaleString()}
              {entry.expires_at && ` · Expires: ${new Date(entry.expires_at).toLocaleString()}`}
            </span>
@ -629,7 +629,7 @@ function MemoryEntryRow({ entry, onDelete }: MemoryEntryRowProps) {
                onDelete();
              }}
              aria-label="Forget memory"
-              className="text-[10px] px-2 py-0.5 bg-red-950/40 hover:bg-red-900/50 border border-red-900/30 rounded text-bad transition-colors shrink-0"
+              className="text-[10px] px-2 py-0.5 bg-red-950/40 hover:bg-red-900/50 border border-red-900/30 rounded text-bad transition-colors shrink-0 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-red-400 focus-visible:ring-offset-1"
            >
              Forget
            </button>
--- a/canvas/src/components/MissingKeysModal.tsx
+++ b/canvas/src/components/MissingKeysModal.tsx
@ -421,7 +421,7 @@ function ProviderPickerModal({
                    <div className="text-[11px] text-ink-mid font-medium">
                      {getKeyLabel(entry.key)}
                    </div>
-                    <div className="text-[9px] font-mono text-ink-soft">{entry.key}</div>
+                    <div className="text-[9px] font-mono text-ink-mid">{entry.key}</div>
                  </div>
                  {entry.saved && (
                    <span className="text-[9px] text-good bg-emerald-900/30 px-1.5 py-0.5 rounded flex items-center gap-1">
@ -675,7 +675,7 @@ function AllKeysModal({
                  <div className="text-[11px] text-ink-mid font-medium">
                    {getKeyLabel(entry.key)}
                  </div>
-                  <div className="text-[9px] font-mono text-ink-soft">{entry.key}</div>
+                  <div className="text-[9px] font-mono text-ink-mid">{entry.key}</div>
                </div>
                {entry.saved && (
                  <span className="text-[9px] text-good bg-emerald-900/30 px-1.5 py-0.5 rounded flex items-center gap-1">
@ -706,7 +706,7 @@ function AllKeysModal({
                    type="button"
                    onClick={() => handleSaveKey(index)}
                    disabled={!entry.value.trim() || entry.saving}
-                    className="px-3 py-1.5 bg-accent-strong hover:bg-accent text-[11px] rounded text-white disabled:opacity-30 transition-colors shrink-0"
+                    className="px-3 py-1.5 bg-accent-strong hover:bg-accent text-[11px] rounded text-white disabled:opacity-30 transition-colors shrink-0 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
                  >
                    {entry.saving ? "..." : "Save"}
                  </button>
@ -730,7 +730,7 @@ function AllKeysModal({
              <button
                type="button"
                onClick={onOpenSettings}
-                className="text-[11px] text-accent hover:text-accent transition-colors"
+                className="text-[11px] text-accent hover:text-accent transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
              >
                Open Settings Panel
              </button>
@ -740,7 +740,7 @@ function AllKeysModal({
            <button
              type="button"
              onClick={onCancel}
-              className="px-3.5 py-1.5 text-[12px] text-ink-mid hover:text-ink bg-surface-card hover:bg-surface-card border border-line rounded-lg transition-colors"
+              className="px-3.5 py-1.5 text-[12px] text-ink-mid hover:text-ink bg-surface-card hover:bg-surface-card border border-line rounded-lg transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
            >
              Cancel Deploy
            </button>
@ -748,7 +748,7 @@ function AllKeysModal({
              type="button"
              onClick={handleAddKeysAndDeploy}
              disabled={!allSaved || anySaving}
-              className="px-3.5 py-1.5 text-[12px] bg-accent-strong hover:bg-accent text-white rounded-lg transition-colors disabled:opacity-40"
+              className="px-3.5 py-1.5 text-[12px] bg-accent-strong hover:bg-accent text-white rounded-lg transition-colors disabled:opacity-40 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
            >
              {anySaving ? "Saving..." : allSaved ? "Deploy" : "Add Keys"}
            </button>
--- a/canvas/src/components/OnboardingWizard.tsx
+++ b/canvas/src/components/OnboardingWizard.tsx
@ -210,7 +210,7 @@ export function OnboardingWizard() {
              // Was hover:bg-surface-card on top of bg-surface-card —
              // silent no-op hover. Lift to surface-elevated, matching
              // the Cancel pattern in ConfirmDialog.
-              className="px-3 py-1.5 bg-surface-card hover:bg-surface-elevated hover:text-ink rounded-lg text-[11px] text-ink-mid transition-colors focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/40 focus-visible:ring-offset-2 focus-visible:ring-offset-surface-sunken"
+              className="px-3 py-1.5 bg-surface-card hover:bg-surface-elevated hover:text-ink rounded-lg text-[11px] text-ink-mid transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
            >
              Next
            </button>
--- a/canvas/src/components/OrgImportPreflightModal.tsx
+++ b/canvas/src/components/OrgImportPreflightModal.tsx
@ -247,7 +247,7 @@ export function OrgImportPreflightModal({
          <h2 id="org-preflight-title" className="text-sm font-semibold text-ink">
            Deploy {orgName}
          </h2>
-          <p className="mt-0.5 text-[11px] text-ink-soft">
+          <p className="mt-0.5 text-[11px] text-ink-mid">
            {workspaceCount} workspace{workspaceCount === 1 ? "" : "s"}.
            Review the credentials needed before import.
          </p>
@ -308,7 +308,7 @@ export function OrgImportPreflightModal({
              type="button"
              onClick={onProceed}
              disabled={!canProceed}
-              className="px-4 py-1.5 text-[11px] font-semibold rounded bg-accent hover:bg-accent-strong text-white disabled:bg-surface-card disabled:text-white-soft disabled:cursor-not-allowed"
+              className="px-4 py-1.5 text-[11px] font-semibold rounded bg-accent hover:bg-accent-strong text-white disabled:bg-surface-card disabled:text-white-soft disabled:cursor-not-allowed focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
            >
              Import
            </button>
@ -400,7 +400,7 @@ function StrictEnvRow({
    <li className="flex items-center gap-2 rounded bg-surface-sunken/70 border border-line px-2 py-1.5">
      <code
        className={`text-[11px] font-mono flex-1 ${
-          configured ? "text-ink-soft line-through" : "text-ink"
+          configured ? "text-ink-mid line-through" : "text-ink"
        }`}
      >
        {envKey}
@ -428,7 +428,7 @@ function StrictEnvRow({
            type="button"
            onClick={() => onSave(envKey)}
            disabled={d?.saving || !d?.value.trim()}
-            className="px-2 py-1 text-[10px] rounded bg-accent hover:bg-accent-strong text-white disabled:opacity-40 disabled:cursor-not-allowed"
+            className="px-2 py-1 text-[10px] rounded bg-accent hover:bg-accent-strong text-white disabled:opacity-40 disabled:cursor-not-allowed focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
          >
            {d?.saving ? "…" : "Save"}
          </button>
@ -492,7 +492,7 @@ function AnyOfEnvGroup({
            >
              <code
                className={`text-[11px] font-mono flex-1 ${
-                  isConfigured ? "text-ink-soft line-through" : "text-ink"
+                  isConfigured ? "text-ink-mid line-through" : "text-ink"
                }`}
              >
                {m}
@ -520,7 +520,7 @@ function AnyOfEnvGroup({
                    type="button"
                    onClick={() => onSave(m)}
                    disabled={d?.saving || !d?.value.trim()}
-                    className="px-2 py-1 text-[10px] rounded bg-accent hover:bg-accent-strong text-white disabled:opacity-40 disabled:cursor-not-allowed"
+                    className="px-2 py-1 text-[10px] rounded bg-accent hover:bg-accent-strong text-white disabled:opacity-40 disabled:cursor-not-allowed focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
                  >
                    {d?.saving ? "…" : "Save"}
                  </button>
--- a/Show More
+++ b/Show More