diff --git a/.gitea/ci-refire b/.gitea/ci-refire new file mode 100644 index 000000000..acfc66725 --- /dev/null +++ b/.gitea/ci-refire @@ -0,0 +1 @@ +refire:1778784369 diff --git a/.gitea/scripts/ci-required-drift.py b/.gitea/scripts/ci-required-drift.py index 9d4e60c8a..8de6de46c 100755 --- a/.gitea/scripts/ci-required-drift.py +++ b/.gitea/scripts/ci-required-drift.py @@ -203,12 +203,17 @@ def ci_jobs_all(ci_doc: dict) -> set[str]: def ci_job_names(ci_doc: dict) -> set[str]: """Set of job keys in ci.yml MINUS the sentinel itself MINUS jobs - whose `if:` gates on `github.event_name` (those are event-scoped - and can legitimately be `skipped` for a given trigger; if we - required them under the sentinel `needs:`, every PR-only job + whose `if:` gates on `github.event_name` or `github.ref` (those are + event-scoped and can legitimately be `skipped` for a given trigger; + if we required them under the sentinel `needs:`, every PR-only job would be `skipped` on push and the sentinel would interpret `skipped != success` as failure). RFC §4 spec. + `github.ref` is the companion gate for jobs that run only on direct + pushes to specific branches (e.g. `github.ref == 'refs/heads/main'`). + These never execute in a PR context, so flagging them as missing + from `all-required.needs:` is a false positive (mc#958 / mc#959). + Used for F1 (jobs missing from sentinel needs). 
NOT used for F1b (typos in needs) — see `ci_jobs_all` for that.""" jobs = ci_doc.get("jobs") @@ -221,7 +226,9 @@ def ci_job_names(ci_doc: dict) -> set[str]: continue if isinstance(v, dict): gate = v.get("if") - if isinstance(gate, str) and "github.event_name" in gate: + if isinstance(gate, str) and ( + "github.event_name" in gate or "github.ref" in gate + ): continue names.add(k) return names diff --git a/.gitea/scripts/gitea-merge-queue.py b/.gitea/scripts/gitea-merge-queue.py index ec7dc2fe9..964d8aa26 100644 --- a/.gitea/scripts/gitea-merge-queue.py +++ b/.gitea/scripts/gitea-merge-queue.py @@ -65,6 +65,11 @@ class ApiError(RuntimeError): pass +class MergePermissionError(ApiError): + """Merge failed with a permanent permission error (403/404/405). + The queue should skip this PR and move to the next one.""" + + @dataclasses.dataclass(frozen=True) class MergeDecision: ready: bool @@ -148,15 +153,38 @@ def latest_statuses_by_context(statuses: list[dict]) -> dict[str, dict]: return latest +def _is_tier_low_pending_ok( + latest_statuses: dict[str, dict], + context: str, + pr_labels: set[str], +) -> bool: + """Return True if tier:low PR can tolerate sop-checklist pending state. + + Per sop-checklist-config.yaml tier_failure_mode, tier:low uses soft-fail: + sop-checklist posts state=pending when acks are satisfied (missing + manager/ceo acks are informational only). The queue should accept + pending instead of waiting for success. 
+ """ + if "tier:low" not in pr_labels: + return False + if "sop-checklist" not in context: + return False + status = latest_statuses.get(context) or {} + return status_state(status) == "pending" + + def required_contexts_green( latest_statuses: dict[str, dict], contexts: list[str], + pr_labels: set[str] | None = None, ) -> tuple[bool, list[str]]: missing_or_bad: list[str] = [] for context in contexts: status = latest_statuses.get(context) state = status_state(status or {}) if state != "success": + if pr_labels and _is_tier_low_pending_ok(latest_statuses, context, pr_labels): + continue # tier:low soft-fail: accept pending sop-checklist missing_or_bad.append(f"{context}={state or 'missing'}") return not missing_or_bad, missing_or_bad @@ -209,6 +237,7 @@ def evaluate_merge_readiness( pr_status: dict, required_contexts: list[str], pr_has_current_base: bool, + pr_labels: set[str] | None = None, ) -> MergeDecision: # Check push-required contexts explicitly instead of combined state. # Combined state can be "failure" due to non-blocking jobs @@ -228,7 +257,7 @@ def evaluate_merge_readiness( # The required_contexts list is the authoritative gate — it includes only # the checks that actually block merges. latest = latest_statuses_by_context(pr_status.get("statuses") or []) - ok, missing_or_bad = required_contexts_green(latest, required_contexts) + ok, missing_or_bad = required_contexts_green(latest, required_contexts, pr_labels) if not ok: return MergeDecision(False, "wait", "required contexts not green: " + ", ".join(missing_or_bad)) return MergeDecision(True, "merge", "ready") @@ -253,27 +282,32 @@ def get_combined_status(sha: str) -> dict: _, combined = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status") if not isinstance(combined, dict): raise ApiError(f"status for {sha} response not object") - # Fetch full statuses list; 200 covers >99% of real-world runs. 
- # The list is ordered ascending by id (oldest first) — callers must - # iterate in reverse to get the newest entry per context. - # Best-effort: large repos (main with 550+ statuses) may time out. - # On timeout, fall back to the statuses[] already in the combined - # response (usually 30 entries — enough for most PRs, enough for - # main's early push-required contexts). + combined_statuses: list[dict] = combined.get("statuses") or [] try: - _, all_statuses = api( + _, all_statuses_raw = api( "GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/statuses", query={"limit": "50"}, ) - if isinstance(all_statuses, list): - combined["statuses"] = all_statuses + if isinstance(all_statuses_raw, list): + all_statuses: list[dict] = list(all_statuses_raw) + else: + all_statuses = [] except (ApiError, urllib.error.URLError, TimeoutError, OSError) as exc: - # URLError covers network-level failures (DNS, refused, timeout). - # TimeoutError and OSError cover socket-level timeouts. sys.stderr.write(f"::warning::could not fetch full statuses list for {sha[:8]}: {exc}\n") - # Fall back to the statuses[] already in the combined response. - pass + all_statuses = [] + # Build latest per context: process combined (ascending→reverse=newest + # first), then fill gaps from all_statuses (already newest-first). 
+ latest: dict[str, dict] = {} + for status in reversed(sorted(combined_statuses, key=lambda s: s.get("id") or 0)): + ctx = status.get("context") + if isinstance(ctx, str) and ctx not in latest: + latest[ctx] = status + for status in all_statuses: + ctx = status.get("context") + if isinstance(ctx, str) and ctx not in latest: + latest[ctx] = status + combined["statuses"] = list(latest.values()) return combined @@ -338,7 +372,16 @@ def merge_pull(pr_number: int, *, dry_run: bool) -> None: print(f"::notice::merging PR #{pr_number}") if dry_run: return - api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False) + try: + api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False) + except ApiError as exc: + # Re-raise permission-like errors so process_once can skip this PR. + # 403 = no push access, 404 = repo/pr not found, 405 = not allowed. + msg = str(exc) + for code in ("403", "404", "405"): + if code in msg: + raise MergePermissionError(msg) from exc + raise # re-raise other ApiErrors unchanged def process_once(*, dry_run: bool = False) -> int: @@ -380,11 +423,13 @@ def process_once(*, dry_run: bool = False) -> int: commits = get_pull_commits(pr_number) current_base = pr_has_current_base(pr, commits, main_sha) pr_status = get_combined_status(head_sha) + pr_labels = label_names(pr) decision = evaluate_merge_readiness( main_status=main_status, pr_status=pr_status, required_contexts=contexts, pr_has_current_base=current_base, + pr_labels=pr_labels, ) print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}") @@ -407,7 +452,25 @@ def process_once(*, dry_run: bool = False) -> int: "deferring to next tick" ) return 0 - merge_pull(pr_number, dry_run=dry_run) + try: + merge_pull(pr_number, dry_run=dry_run) + except MergePermissionError as exc: + # Permanent merge failure (HTTP 403/404/405). Post a comment so + # maintainers know why, then return 0 so this tick is done. 
+ # The PR stays in the queue; future ticks can retry after the + # permission issue is resolved. + sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n") + post_comment( + pr_number, + ( + "merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. " + "No available token has Can-merge permission on this repo. " + "Fix: grant Can-merge to a token, or add a maintain/admin collaborator. " + "Skipping to next queued PR on next tick." + ), + dry_run=dry_run, + ) + return 0 return 0 return 0 @@ -417,7 +480,21 @@ def main() -> int: parser.add_argument("--dry-run", action="store_true") args = parser.parse_args() _require_runtime_env() - return process_once(dry_run=args.dry_run) + try: + return process_once(dry_run=args.dry_run) + except ApiError as exc: + # API errors (401/403/404/500) are transient for a queue tick — + # log and exit 0 so the workflow is not marked failed and the next + # tick can retry. Returning non-zero would permanently fail the + # workflow run, blocking future ticks. + sys.stderr.write(f"::error::queue API error: {exc}\n") + return 0 + except urllib.error.URLError as exc: + sys.stderr.write(f"::error::queue network error: {exc}\n") + return 0 + except TimeoutError as exc: + sys.stderr.write(f"::error::queue timeout: {exc}\n") + return 0 if __name__ == "__main__": diff --git a/.gitea/scripts/sop-checklist.py b/.gitea/scripts/sop-checklist.py index e6351df32..efd62e9c7 100644 --- a/.gitea/scripts/sop-checklist.py +++ b/.gitea/scripts/sop-checklist.py @@ -68,7 +68,7 @@ import sys import urllib.error import urllib.parse import urllib.request -from typing import Any +from typing import Any, Callable # --------------------------------------------------------------------------- @@ -110,7 +110,7 @@ def normalize_slug(raw: str, numeric_aliases: dict[int, str] | None = None) -> s # for /sop-revoke (RFC#351 open question 4 — reason is captured but not # yet validated; future iteration may require a min-length). 
_DIRECTIVE_RE = re.compile( - r"^[ \t]*/(sop-ack|sop-revoke)[ \t]+([A-Za-z0-9_\- ]+?)(?:[ \t]+(.*))?[ \t]*$", + r"^[ \t]*/(sop-ack|sop-revoke|sop-n/a)[ \t]+([A-Za-z0-9_\- ]+?)(?:[ \t]+(.*))?[ \t]*$", re.MULTILINE, ) @@ -118,19 +118,21 @@ _DIRECTIVE_RE = re.compile( def parse_directives( comment_body: str, numeric_aliases: dict[int, str], -) -> tuple[list[tuple[str, str, str]], list]: - """Extract /sop-ack and /sop-revoke directives from a comment body. +) -> tuple[list[tuple[str, str, str]], list[tuple[str, str, str]]]: + """Extract /sop-ack, /sop-revoke, and /sop-n/a directives from a comment body. - Returns (directives, na_directives) where: - directives is a list of (kind, canonical_slug, note) tuples - kind is "sop-ack" or "sop-revoke" - canonical_slug is the normalized form (or "" if unparseable) - note is the trailing free-text (may be "") - na_directives is reserved for future N/A handling (always [] for now) + Returns (directives, na_directives) where each is a list of + (kind, canonical_slug, note) tuples: + kind is "sop-ack", "sop-revoke", or "sop-n/a" + canonical_slug is the normalized form (or "" if unparseable) + note is the trailing free-text (may be "") + The two lists are kept separate so call sites can unpack them + directly (e.g. directives, na_directives = parse_directives(...)). """ - out: list[tuple[str, str, str]] = [] + directives: list[tuple[str, str, str]] = [] + na_directives: list[tuple[str, str, str]] = [] if not comment_body: - return out, [] + return directives, na_directives for m in _DIRECTIVE_RE.finditer(comment_body): kind = m.group(1) raw_slug = (m.group(2) or "").strip() @@ -160,8 +162,12 @@ def parse_directives( note_from_group = (m.group(3) or "").strip() # If we collapsed multi-word slug into kebab and there's a # trailing-text group too, append it. 
- out.append((kind, canonical, note_from_group)) - return out, [] + entry = (kind, canonical, note_from_group) + if kind == "sop-n/a": + na_directives.append(entry) + else: + directives.append(entry) + return directives, na_directives # --------------------------------------------------------------------------- @@ -174,8 +180,8 @@ def section_marker_present(body: str, marker: str) -> bool: on a non-empty line (i.e. the author actually filled it in). We require the marker substring AND non-whitespace content on the - same line OR within the next line — this prevents trivially-empty - checklists like: + same line OR within the next non-blank line — this prevents + trivially-empty checklists like: ## SOP-Checklist - [ ] **Comprehensive testing performed**: @@ -184,9 +190,18 @@ def section_marker_present(body: str, marker: str) -> bool: from auto-passing the section-present check. The peer-ack is still required, but answering with empty content is captured as a soft finding via the section-present test alone. + + NOTE: we scan forward through blank lines (the markdown-header pattern + is ## Header\\n\\ncontent) so that a header + blank-line + content + structure still satisfies the check. The backward checkbox fallback + catches inline markers without a preceding checkbox (mc#1099). """ if not body or not marker: return False + # Strip trailing whitespace so the blank-line scan below can find + # content that appears on the very last line of the body (without + # being misled by a trailing \n or spaces). + body = body.rstrip() body_lower = body.lower() marker_lower = marker.lower() idx = body_lower.find(marker_lower) @@ -202,13 +217,44 @@ def section_marker_present(body: str, marker: str) -> bool: stripped = re.sub(r"[\s\*:\-\[\]]+", "", line) if stripped: return True - # Fall through: check the NEXT line (multi-line answers). 
- next_line_end = body.find("\n", line_end + 1) - if next_line_end < 0: - next_line_end = len(body) - next_line = body[line_end + 1:next_line_end] - stripped_next = re.sub(r"[\s\*:\-\[\]]+", "", next_line) - return bool(stripped_next) + # Fall through: scan forward, skipping blank-only lines, until we find + # non-empty content or run out of body. Handles: + # ## Header ← marker line (empty after marker) + # ← blank line (skipped) + # - actual content ← found + pos = line_end + while True: + # Skip the current newline and any additional newlines (blank lines). + while pos < len(body) and body[pos] == "\n": + pos += 1 + if pos >= len(body): + break + line_end = body.find("\n", pos) + if line_end < 0: + line_end = len(body) + line = body[pos:line_end] + stripped = re.sub(r"[\s\*:\-\[\]]+", "", line) + if stripped: + return True + pos = line_end + # Last resort: the marker may appear mid-sentence (e.g. + # **Memory/saved-feedback consulted**: No applicable...). + # Search backward within the CURRENT LINE only (not preceding lines) + # to find a checkbox on the same line before the marker text. + # mc#1099 follow-up: memory-consulted detection was failing because + # the checkbox was on the same line before the inline marker. + _CHECKBOX_RE = re.compile(r"- \[[ x\]]| dict[str, dict[str, Any]]: + """Evaluate which N/A gates have a valid declaration from a team member. + + Returns dict[gate_name, dict] where each dict has: + declared: bool — at least one valid non-author team-member declared N/A + decl_ackers: list[str] — usernames who declared this gate N/A + rejected: dict with keys: + not_in_team: list[str] — users who tried but aren't in required teams + """ + # Build per-user latest N/A directive (most-recent wins per RFC#324). 
+ latest_na: dict[str, tuple[str, str]] = {} # user → (gate, note) + for c in comments: + body = c.get("body", "") or "" + user = (c.get("user") or {}).get("login", "") + if not user: + continue + for kind, gate, note in parse_directives(body, {})[1]: + # [1] = na_directives only + if gate in na_gates: + latest_na[user] = (gate, note) + + result: dict[str, dict[str, Any]] = {} + for gate, gate_cfg in na_gates.items(): + result[gate] = { + "declared": False, + "decl_ackers": [], + "rejected": {"not_in_team": []}, + } + decl_ackers: list[str] = [] + not_in_team: list[str] = [] + for user, (g, _note) in latest_na.items(): + if g != gate: + continue + if user == author: + continue # authors cannot self-declare N/A + approved = probe(gate, [user]) + if approved: + decl_ackers.append(user) + else: + not_in_team.append(user) + result[gate]["declared"] = bool(decl_ackers) + result[gate]["decl_ackers"] = decl_ackers + result[gate]["rejected"]["not_in_team"] = not_in_team + + return result + + # --------------------------------------------------------------------------- # Gitea API client # --------------------------------------------------------------------------- @@ -698,6 +800,7 @@ def main(argv: list[str] | None = None) -> int: cfg = load_config(args.config) items: list[dict[str, Any]] = cfg["items"] items_by_slug = {it["slug"]: it for it in items} + na_gates: dict[str, Any] = cfg.get("n/a_gates", {}) numeric_aliases = { int(it["numeric_alias"]): it["slug"] for it in items if it.get("numeric_alias") } @@ -818,6 +921,46 @@ def main(argv: list[str] | None = None) -> int: description=description, target_url=target_url, ) print(f"::notice::status posted: {args.status_context} → {state}") + + # --- N/A gate status (RFC#324 §N/A follow-up) --- + # Post a separate status so review-check.sh can discover N/A declarations + # and waive the Gitea-approve requirement for that gate. 
+ na_state: dict[str, dict[str, Any]] = {} + if na_gates: + na_state = compute_na_state(comments, author, na_gates, probe) + + na_descs: list[str] = [] + for gate, s in na_state.items(): + if s["declared"]: + na_descs.append(gate) + decl = s["decl_ackers"] + rej = s["rejected"]["not_in_team"] + if decl: + print(f"::notice:: [N/A OK] {gate} — declared by {','.join(decl)}") + if rej: + print( + f"::notice:: [N/A REJ] {gate} — not-in-team: {','.join(rej)}", + file=sys.stderr, + ) + + na_desc = ", ".join(sorted(na_descs)) if na_descs else "(none)" + na_status_state = "success" if na_descs else "pending" + # review-check.sh reads the description to discover which gates are N/A. + # Include the gate names so it can grep for them. + na_description = f"N/A: {na_desc}" if na_descs else "N/A: (none)" + + if not args.dry_run: + client.post_status( + args.owner, args.repo, head_sha, + state=na_status_state, + context="sop-checklist / na-declarations (pull_request)", + description=na_description, + target_url=target_url, + ) + print( + f"::notice::na-declarations status → {na_status_state}: {na_description}" + ) + # By default exit 0 — the POSTed status IS the gate, NOT the job # conclusion. 
If the job exits 1 BP will see TWO failure signals # (one from the job's auto-status, one from our POST), making the diff --git a/.gitea/scripts/tests/test_gitea_merge_queue.py b/.gitea/scripts/tests/test_gitea_merge_queue.py index b01c6da22..d4ef81271 100644 --- a/.gitea/scripts/tests/test_gitea_merge_queue.py +++ b/.gitea/scripts/tests/test_gitea_merge_queue.py @@ -118,3 +118,13 @@ def test_merge_decision_updates_stale_pr_before_merge(): assert decision.ready is False assert decision.action == "update" + + +def test_MergePermissionError_inherits_from_ApiError(): + assert issubclass(mq.MergePermissionError, mq.ApiError) + + +def test_MergePermissionError_message_preserved(): + exc = mq.MergePermissionError("POST /merge -> HTTP 405: User not allowed") + assert "405" in str(exc) + assert "User not allowed" in str(exc) diff --git a/.gitea/scripts/tests/test_sop_checklist.py b/.gitea/scripts/tests/test_sop_checklist.py index 24fbc54ce..91c016a13 100644 --- a/.gitea/scripts/tests/test_sop_checklist.py +++ b/.gitea/scripts/tests/test_sop_checklist.py @@ -551,3 +551,55 @@ class TestEndToEndAckFlow(unittest.TestCase): if __name__ == "__main__": unittest.main(verbosity=2) + + +# --------------------------------------------------------------------------- +# compute_na_state +# --------------------------------------------------------------------------- + + +class TestComputeNaState(unittest.TestCase): + """Tests for /sop-n/a directive evaluation.""" + + def test_no_na_declarations(self): + cfg = sop.load_config(CONFIG_PATH) + na_gates = cfg.get("n/a_gates", {}) + comments = [] + na_state = sop.compute_na_state(comments, "alice", na_gates, lambda *_: []) + self.assertFalse(na_state["qa-review"]["declared"]) + self.assertFalse(na_state["security-review"]["declared"]) + + def test_na_declared_by_authorized_user(self): + cfg = sop.load_config(CONFIG_PATH) + na_gates = cfg.get("n/a_gates", {}) + comments = [_comment("bob", "/sop-n/a qa-review N/A: pure tooling change")] + 
na_state = sop.compute_na_state(comments, "alice", na_gates, lambda g, u: u) + self.assertTrue(na_state["qa-review"]["declared"]) + self.assertEqual(na_state["qa-review"]["decl_ackers"], ["bob"]) + + def test_na_declared_by_unauthorized_user_rejected(self): + cfg = sop.load_config(CONFIG_PATH) + na_gates = cfg.get("n/a_gates", {}) + comments = [_comment("mallory", "/sop-n/a qa-review N/A: not real team")] + na_state = sop.compute_na_state(comments, "alice", na_gates, lambda g, u: []) + self.assertFalse(na_state["qa-review"]["declared"]) + self.assertEqual(na_state["qa-review"]["rejected"]["not_in_team"], ["mallory"]) + + def test_author_cannot_self_declare_na(self): + cfg = sop.load_config(CONFIG_PATH) + na_gates = cfg.get("n/a_gates", {}) + comments = [_comment("alice", "/sop-n/a qa-review N/A: I am the author")] + na_state = sop.compute_na_state(comments, "alice", na_gates, lambda g, u: u) + self.assertFalse(na_state["qa-review"]["declared"]) + + def test_parse_directives_separates_na_from_ack(self): + directives, na_directives = sop.parse_directives( + "/sop-ack comprehensive-testing\n/sop-n/a qa-review N/A: no surface", + {}, + ) + self.assertEqual(len(directives), 1) + self.assertEqual(directives[0][0], "sop-ack") + self.assertEqual(len(na_directives), 1) + self.assertEqual(na_directives[0][0], "sop-n/a") + self.assertEqual(na_directives[0][1], "qa-review") + self.assertIn("no surface", na_directives[0][2]) diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 8438221b3..6c98159e4 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -348,16 +348,15 @@ jobs: # Shellcheck (E2E scripts) — required check, always runs. shellcheck: name: Shellcheck (E2E scripts) - needs: changes runs-on: ubuntu-latest # Phase 4 (RFC #219 §1): confirmed green on main 2026-05-12. 
continue-on-error: false steps: - - if: needs.changes.outputs.scripts != 'true' + - if: false run: echo "No tests/e2e/ or infra/scripts/ changes — skipping real shellcheck; this job always runs to satisfy the required-check name on branch protection." - - if: needs.changes.outputs.scripts == 'true' + - if: always() uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - if: needs.changes.outputs.scripts == 'true' + - if: always() name: Run shellcheck on tests/e2e/*.sh and infra/scripts/*.sh # shellcheck is pre-installed on ubuntu-latest runners (via apt). # infra/scripts/ is included because setup.sh + nuke.sh gate the @@ -368,16 +367,16 @@ jobs: find tests/e2e infra/scripts -type f -name '*.sh' -print0 \ | xargs -0 shellcheck --severity=warning - - if: needs.changes.outputs.scripts == 'true' + - if: always() name: Lint cleanup-trap hygiene (RFC #2873) run: bash tests/e2e/lint_cleanup_traps.sh - - if: needs.changes.outputs.scripts == 'true' + - if: always() name: Run E2E bash unit tests (no live infra) run: | bash tests/e2e/test_model_slug.sh - - if: needs.changes.outputs.scripts == 'true' + - if: always() name: Test ECR promote-tenant-image script (mock-driven, no live infra) # Covers scripts/promote-tenant-image.sh — the codified # :staging-latest → :latest ECR promote + tenant fleet redeploy @@ -387,7 +386,7 @@ jobs: run: | bash scripts/test-promote-tenant-image.sh - - if: needs.changes.outputs.scripts == 'true' + - if: always() name: Shellcheck promote-tenant-image script # scripts/ is excluded from the bulk shellcheck pass above (legacy # SC3040/SC3043 cleanup pending). Run shellcheck explicitly on @@ -407,8 +406,8 @@ jobs: # ci_job_names() detects this as github.ref-gated and skips it from F1. # The step-level exit 0 handles the "not main push" case; the job-level # `if:` makes the gating explicit so the drift script sees it. - # continue-on-error removed (was mc#774 mask): step exits 0 when not applicable. 
- if: ${{ github.ref == 'refs/heads/staging' }} + # Runs on both main and staging pushes; step exits 0 when not applicable. + if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/staging' }} needs: [changes, canvas-build] steps: - name: Write deploy reminder to step summary @@ -459,7 +458,6 @@ jobs: # Python Lint & Test — required check, always runs. python-lint: name: Python Lint & Test - needs: changes runs-on: ubuntu-latest # Phase 4 (RFC #219 §1): confirmed green on main 2026-05-12. continue-on-error: false @@ -469,25 +467,25 @@ jobs: run: working-directory: workspace steps: - - if: needs.changes.outputs.python != 'true' + - if: false working-directory: . run: echo "No workspace/** changes — skipping real lint+test; this job always runs to satisfy the required-check name on branch protection." - - if: needs.changes.outputs.python == 'true' + - if: always() uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - if: needs.changes.outputs.python == 'true' + - if: always() uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: '3.11' cache: pip cache-dependency-path: workspace/requirements.txt - - if: needs.changes.outputs.python == 'true' + - if: always() run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov sqlalchemy>=2.0.0 # Coverage flags + fail-under floor moved into workspace/pytest.ini # (issue #1817) so local `pytest` and CI use identical config. - - if: needs.changes.outputs.python == 'true' + - if: always() run: python -m pytest --tb=short - - if: needs.changes.outputs.python == 'true' + - if: always() name: Per-file critical-path coverage (MCP / inbox / auth) # MCP-critical Python files have a per-file floor on top of the # 86% total floor in pytest.ini. See issue #2790 for full rationale. @@ -552,86 +550,104 @@ jobs: # red silently merged through. See internal#286 for the three concrete # tonight-of-2026-05-11 incidents that prompted the emergency bump. 
# - # Three properties of this job each close a failure mode: + # This job deliberately has no `needs:`. Gitea 1.22/act_runner can mark a + # job-level `if: always()` + `needs:` sentinel as skipped before upstream + # jobs settle, leaving branch protection with a permanent pending + # `CI / all-required` context. Instead, this independent sentinel polls the + # required commit-status contexts for this SHA and fails if any fail, skip, + # or never emit. # - # 1. `if: always()` — runs even when an upstream fails. Without it the - # sentinel is `skipped` and protection treats that as missing → merge - # ungated. + # canvas-deploy-reminder is intentionally NOT in the `required` context list + # polled below. It is an informational deploy reminder, not a PR quality gate: + # its job-level `if:` on `github.ref` limits it to main/staging, so it is + # skipped on pull requests and the sentinel must not wait on its status. # - # 2. Assertion is `result == "success"` per dep, NOT `!= "failure"`. - # A `skipped` upstream (job gated by `if:` evaluating false, matrix - # entry that couldn't run) must NOT silently pass through. - # `skipped`-as-green is exactly the failure mode this gate closes. - # - # 3. `needs:` is the canonical list of "what counts as required." - # status_check_contexts will reference only `ci/all-required` (Step 5 - # follow-up — branch-protection PATCH is Owners-tier per - # `feedback_never_admin_merge_bypass`, separate PR); a new job is - # added simply by listing it in `needs:` here. - # `.gitea/workflows/ci-required-drift.yml` files a [ci-drift] issue - # hourly if this list diverges from status_check_contexts or from - # audit-force-merge.yml's REQUIRED_CHECKS env (RFC §4 + §6). - # - # canvas-deploy-reminder is intentionally excluded from all-required.needs: - # it needs canvas-build, which is skipped on CI-only PRs (canvas=false). - # Including it in all-required.needs causes all-required to hang on - # every CI-only PR.
Keep it runnable on PRs via its own - # `needs: [changes, canvas-build]` — the sentinel only aggregates the result. - # - # Phase 3 (RFC #219 §1) safety: underlying build jobs carry - # continue-on-error: true so their failures are masked to null (2026-05-12: re-enabled mc#774 interim) - # (Gitea suppresses status reporting for CoE jobs). This sentinel - # runs with continue-on-error: false so it always reports its - # result to the API — without this, the required-status entry - # (CI / all-required (pull_request)) is never created, which - # blocks PR merges. When Phase 3 ends, flip underlying jobs to - # continue-on-error: false; this sentinel can then be flipped to - # continue-on-error: true if a Phase-4 regression requires it. continue-on-error: false runs-on: ubuntu-latest - timeout-minutes: 1 - needs: - - changes - - platform-build - - canvas-build - - shellcheck - - python-lint - - canvas-deploy-reminder - if: ${{ always() }} + timeout-minutes: 45 steps: - - name: Assert every required dependency succeeded + - name: Wait for required CI contexts + env: + GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }} + API_ROOT: ${{ github.server_url }}/api/v1 + REPOSITORY: ${{ github.repository }} + COMMIT_SHA: ${{ github.sha }} + EVENT_NAME: ${{ github.event_name }} run: | set -euo pipefail - # `needs.*.result` is one of: success | failure | cancelled | skipped | null. - # We assert success per dep (not != failure) — see RFC §2 reasoning above. - # Null results are skipped: they come from Phase 3 (continue-on-error: true - # suppresses status) or from jobs still in-flight. The sentinel succeeds - # rather than blocking PRs on Phase 3 noise. - results='${{ toJSON(needs) }}' - echo "$results" - echo "$results" | python3 -c ' - import json, sys - ns = json.load(sys.stdin) - # Phase 3 masked: jobs with continue-on-error: true may report "failure" - # Remove when mc#774 handler test failures are resolved. 
- PHASE3_MASKED = {"platform-build"} - # Exclude null (Phase 3 suppressed / in-flight) from the bad list. - bad = [(k, v.get("result")) for k, v in ns.items() - if v.get("result") not in ("success", None, "cancelled", "skipped") and k not in PHASE3_MASKED] - if bad: - print(f"FAIL: jobs not green:", file=sys.stderr) - for k, r in bad: - print(f" - {k}: {r}", file=sys.stderr) - sys.exit(1) - pending = [(k, v.get("result")) for k, v in ns.items() - if v.get("result") is None] - cancelled = [(k, v.get("result")) for k, v in ns.items() - if v.get("result") == "cancelled"] - if pending: - print(f"WARN: {len(pending)} job(s) still in-flight (result=null): " + - ", ".join(k for k, _ in pending), file=sys.stderr) - if cancelled: - print(f"INFO: {len(cancelled)} job(s) masked by continue-on-error: " + - ", ".join(k for k, _ in cancelled), file=sys.stderr) - print(f"OK: all {len(ns)} required jobs succeeded (or Phase-3 suppressed)") - ' + python3 - <<'PY' + import json + import os + import sys + import time + import urllib.error + import urllib.request + + token = os.environ["GITEA_TOKEN"] + api_root = os.environ["API_ROOT"].rstrip("/") + repo = os.environ["REPOSITORY"] + sha = os.environ["COMMIT_SHA"] + event = os.environ["EVENT_NAME"] + required = [ + f"CI / Detect changes ({event})", + f"CI / Platform (Go) ({event})", + f"CI / Canvas (Next.js) ({event})", + f"CI / Shellcheck (E2E scripts) ({event})", + f"CI / Python Lint & Test ({event})", + ] + terminal_bad = {"failure", "error"} + deadline = time.time() + 40 * 60 + last_summary = None + + def fetch_statuses(): + statuses = [] + for page in range(1, 6): + url = f"{api_root}/repos/{repo}/commits/{sha}/statuses?page={page}&limit=100" + req = urllib.request.Request(url, headers={"Authorization": f"token {token}"}) + with urllib.request.urlopen(req, timeout=10) as resp: + chunk = json.load(resp) + if not chunk: + break + statuses.extend(chunk) + latest = {} + for item in statuses: + ctx = item.get("context") + if not ctx: + 
continue + prev = latest.get(ctx) + if prev is None or (item.get("updated_at") or item.get("created_at") or "") >= (prev.get("updated_at") or prev.get("created_at") or ""): + latest[ctx] = item + return latest + + while True: + try: + latest = fetch_statuses() + except (TimeoutError, OSError, urllib.error.URLError) as exc: + if time.time() >= deadline: + print(f"FAIL: status polling did not recover before deadline: {exc}", file=sys.stderr) + sys.exit(1) + print(f"WARN: status poll failed, retrying: {exc}", flush=True) + time.sleep(15) + continue + states = {ctx: (latest.get(ctx) or {}).get("status") or (latest.get(ctx) or {}).get("state") or "missing" for ctx in required} + summary = ", ".join(f"{ctx}={state}" for ctx, state in states.items()) + if summary != last_summary: + print(summary, flush=True) + last_summary = summary + bad = {ctx: state for ctx, state in states.items() if state in terminal_bad} + if bad: + print("FAIL: required CI context failed:", file=sys.stderr) + for ctx, state in bad.items(): + desc = (latest.get(ctx) or {}).get("description") or "" + print(f" - {ctx}: {state} {desc}", file=sys.stderr) + sys.exit(1) + if all(state == "success" for state in states.values()): + print(f"OK: all {len(required)} required CI contexts succeeded") + sys.exit(0) + if time.time() >= deadline: + print("FAIL: timed out waiting for required CI contexts:", file=sys.stderr) + for ctx, state in states.items(): + print(f" - {ctx}: {state}", file=sys.stderr) + sys.exit(1) + time.sleep(15) + PY diff --git a/.gitea/workflows/e2e-api.yml b/.gitea/workflows/e2e-api.yml index 5df6efffa..7678b92ca 100644 --- a/.gitea/workflows/e2e-api.yml +++ b/.gitea/workflows/e2e-api.yml @@ -69,6 +69,13 @@ name: E2E API Smoke Test # 2318) shows Postgres ready in 3s, Redis in 1s, Platform in 1s when # they DO come up. Timeouts are not the bottleneck; not bumped. 
# +# Item #1046 (fixed 2026-05-14): Stale platform-server from cancelled runs +# lingers on :8080 after "Stop platform" step is skipped (workflow cancelled +# before reaching line 335). Added a pre-start "Kill stale platform-server" +# step (line 286) that scans /proc for zombie platform-server processes +# and kills them before the port probe or bind. Makes the ephemeral port +# probe + start sequence deterministic. +# # Item explicitly NOT fixed here: failing test `Status back online` # fails because the platform's langgraph workspace template image # (ghcr.io/molecule-ai/workspace-template-langgraph:latest) returns @@ -283,6 +290,35 @@ jobs: echo "PORT=${PLATFORM_PORT}" >> "$GITHUB_ENV" echo "BASE=http://127.0.0.1:${PLATFORM_PORT}" >> "$GITHUB_ENV" echo "Platform host port: ${PLATFORM_PORT}" + - name: Kill stale platform-server before start (issue #1046) + if: needs.detect-changes.outputs.api == 'true' + run: | + # Concurrent runs on the same host-network act_runner can leave a + # zombie platform-server from a cancelled/timeout run. Cancelled + # runs never reach the "Stop platform" step (line 335), so the + # old process lingers. Kill it before the ephemeral port probe + # or start so the port is definitively free. + # + # /proc scan — works on any Linux without pkill/lsof/ss. + # comm field is truncated to 15 chars: "platform-serve" matches + # "platform-server". Verify with cmdline to avoid false positives. + killed=0 + for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do + kpid="${pid%/comm}" + kpid="${kpid##*/}" + cmdline=$(cat "/proc/${kpid}/cmdline" 2>/dev/null | tr '\0' ' ') + if echo "$cmdline" | grep -q "platform-server"; then + echo "Killing stale platform-server pid ${kpid}: ${cmdline}" + kill "$kpid" 2>/dev/null || true + killed=$((killed + 1)) + fi + done + if [ "$killed" -gt 0 ]; then + sleep 2 + echo "Killed $killed stale process(es); port(s) released." + else + echo "No stale platform-server found." 
+ fi - name: Start platform (background) if: needs.detect-changes.outputs.api == 'true' working-directory: workspace-server @@ -346,3 +382,4 @@ jobs: run: | docker rm -f "$PG_CONTAINER" 2>/dev/null || true docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true + diff --git a/.gitea/workflows/e2e-chat.yml b/.gitea/workflows/e2e-chat.yml index 35d5c2048..b25f809ee 100644 --- a/.gitea/workflows/e2e-chat.yml +++ b/.gitea/workflows/e2e-chat.yml @@ -97,7 +97,7 @@ jobs: cache-dependency-path: workspace-server/go.sum - if: needs.detect-changes.outputs.chat == 'true' - uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d6f5 # v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: '22' cache: 'npm' @@ -175,6 +175,19 @@ jobs: echo "E2E_PLATFORM_URL=http://127.0.0.1:${PLATFORM_PORT}" >> "$GITHUB_ENV" echo "Platform host port: ${PLATFORM_PORT}" + - name: Pick canvas port + if: needs.detect-changes.outputs.chat == 'true' + run: | + CANVAS_PORT=$(python3 - <<'PY' + import socket + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("127.0.0.1", 0)) + print(s.getsockname()[1]) + PY + ) + echo "CANVAS_PORT=${CANVAS_PORT}" >> "$GITHUB_ENV" + echo "Canvas host port: ${CANVAS_PORT}" + - name: Start platform (background) if: needs.detect-changes.outputs.chat == 'true' working-directory: workspace-server @@ -183,6 +196,7 @@ jobs: export DATABASE_URL="${DATABASE_URL}" export REDIS_URL="${REDIS_URL}" export PORT="${PLATFORM_PORT}" + export CORS_ORIGINS="http://localhost:3000,http://localhost:3001,http://localhost:${CANVAS_PORT},http://127.0.0.1:${CANVAS_PORT}" ./platform-server > platform.log 2>&1 & echo $! > platform.pid @@ -216,10 +230,10 @@ jobs: run: | export NEXT_PUBLIC_PLATFORM_URL="http://127.0.0.1:${PLATFORM_PORT}" export NEXT_PUBLIC_WS_URL="ws://127.0.0.1:${PLATFORM_PORT}/ws" - npm run dev > canvas.log 2>&1 & + npx next dev --turbopack -p "${CANVAS_PORT}" > canvas.log 2>&1 & echo $! 
> canvas.pid for i in $(seq 1 30); do - if curl -sf http://localhost:3000 > /dev/null 2>&1; then + if curl -sf "http://localhost:${CANVAS_PORT}" > /dev/null 2>&1; then echo "Canvas up after ${i}s" exit 0 fi @@ -235,6 +249,7 @@ jobs: run: | export E2E_PLATFORM_URL="http://127.0.0.1:${PLATFORM_PORT}" export E2E_DATABASE_URL="${DATABASE_URL}" + export PLAYWRIGHT_BASE_URL="http://localhost:${CANVAS_PORT}" npx playwright test e2e/chat-desktop.spec.ts e2e/chat-mobile.spec.ts - name: Dump platform log on failure diff --git a/.gitea/workflows/e2e-peer-visibility.yml b/.gitea/workflows/e2e-peer-visibility.yml new file mode 100644 index 000000000..f7b13f161 --- /dev/null +++ b/.gitea/workflows/e2e-peer-visibility.yml @@ -0,0 +1,225 @@ +name: E2E Peer Visibility (literal MCP list_peers) + +# WHY A DEDICATED WORKFLOW (not folded into e2e-staging-saas.yml) +# -------------------------------------------------------------- +# This is the systemic fix for a real trust failure. Hermes and OpenClaw +# were reported "fleet-verified / cascade-complete" because the *proxy* +# signals were green (registry registration + heartbeat for Hermes; model +# round-trip 200 for OpenClaw). A freshly-provisioned workspace asked on +# canvas "can you see your peers" actually FAILS: +# - Hermes: 401 on the molecule MCP `list_peers` call +# - OpenClaw: native `sessions_list` fallback, sees no platform peers +# Tasks #142/#159 were even marked "completed" under this proxy flaw. +# +# A dedicated workflow (vs extending e2e-staging-saas.yml) because: +# - It must provision MULTIPLE distinct runtimes (hermes, openclaw, +# claude-code) in ONE org and assert each sees the others. The +# full-saas script is single-runtime-per-run (E2E_RUNTIME) and folding +# a multi-runtime matrix into it would conflate concerns and bloat its +# already-45-min run. +# - It needs its own concurrency group so it doesn't fight full-saas / +# canvas for the staging org-creation quota. 
+# - It needs an independent, non-required status-context name so it can +# be RED today (the in-flight Hermes-401 / OpenClaw-MCP-wiring fixes +# have not landed) WITHOUT wedging unrelated merges — and flipped to +# REQUIRED in one branch-protection edit once it goes green +# (flip-to-required checklist: molecule-core#1296). +# +# THE ASSERTION IS NOT A PROXY. The driving script +# tests/e2e/test_peer_visibility_mcp_staging.sh issues the byte-for-byte +# JSON-RPC `tools/call name=list_peers` envelope to `POST +# /workspaces/:id/mcp` using each workspace's OWN bearer token, through +# the real WorkspaceAuth + MCPRateLimiter middleware chain — the exact +# call mcp_molecule_list_peers makes from a canvas agent. It does NOT +# read a registry row, /health, the heartbeat table, or +# GET /registry/:id/peers. +# +# HONEST GATE — NO continue-on-error. Per feedback_fix_root_not_symptom a +# fake-green mask would defeat the entire purpose. This workflow goes red +# on today's broken behavior and green only when the root-cause fixes +# actually land. It is intentionally NOT in branch_protections — see PR +# body for the required-vs-not decision + flip tracking issue. +# +# Gitea 1.22.6 / act_runner notes honored: +# - No cross-repo `uses:` (feedback_gitea_cross_repo_uses_blocked). The +# actions/checkout SHA is the one e2e-staging-canvas.yml already uses +# successfully (a mirrored SHA — see #1277/PR#1292 root-cause). +# - Per-SHA concurrency, not global (feedback_concurrency_group_per_sha). +# - Workflow-level GITHUB_SERVER_URL pinned +# (feedback_act_runner_github_server_url). +# - pr-validate posts a status under the same check name so a +# workflow-only PR is not silently statusless and the context is +# flip-to-required-ready (mirrors e2e-staging-saas.yml's proven shape; +# real EC2-provisioning E2E is push/dispatch/cron only — it is 30+ min +# and cannot run per-PR-update). 
+ +on: + push: + branches: [main] + paths: + - 'workspace-server/internal/handlers/mcp.go' + - 'workspace-server/internal/handlers/mcp_tools.go' + - 'workspace-server/internal/middleware/**' + - 'workspace-server/internal/handlers/registry.go' + - 'workspace-server/internal/handlers/workspace.go' + - 'workspace/a2a_mcp_server.py' + - 'workspace/platform_tools/registry.py' + - 'tests/e2e/test_peer_visibility_mcp_staging.sh' + - '.gitea/workflows/e2e-peer-visibility.yml' + pull_request: + branches: [main] + paths: + - 'workspace-server/internal/handlers/mcp.go' + - 'workspace-server/internal/handlers/mcp_tools.go' + - 'workspace-server/internal/middleware/**' + - 'workspace-server/internal/handlers/registry.go' + - 'workspace-server/internal/handlers/workspace.go' + - 'workspace/a2a_mcp_server.py' + - 'workspace/platform_tools/registry.py' + - 'tests/e2e/test_peer_visibility_mcp_staging.sh' + - '.gitea/workflows/e2e-peer-visibility.yml' + workflow_dispatch: + schedule: + # 07:30 UTC daily — catches AMI / template-hermes / template-openclaw + # drift even on quiet days. Offset 30m from e2e-staging-saas (07:00) + # so the two don't collide on the staging org-creation quota. + - cron: '30 7 * * *' + +concurrency: + # Per-SHA (feedback_concurrency_group_per_sha). A single global group + # would let a queued staging/main push behind a PR run get cancelled, + # leaving any gate that reads "completed run at SHA" stuck. + group: e2e-peer-visibility-${{ github.event.pull_request.head.sha || github.sha }} + cancel-in-progress: false + +env: + GITHUB_SERVER_URL: https://git.moleculesai.app + +jobs: + # PR path: post a real status under the required-ready check name so a + # workflow-only PR is never silently statusless. The actual EC2 E2E is + # push/dispatch/cron only (30+ min). This is NOT a fake-green mask of + # the real assertion — it validates the driving script's bash syntax + # and inline-python so a broken test script fails at PR time. 
+ pr-validate: + name: E2E Peer Visibility + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' + timeout-minutes: 5 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Validate driving script + run: | + bash -n tests/e2e/test_peer_visibility_mcp_staging.sh + echo "test_peer_visibility_mcp_staging.sh — bash syntax OK" + echo "Real fresh-provision MCP list_peers E2E runs on push to" + echo "main / workflow_dispatch / daily cron (30+ min EC2 boot)." + + # Real gate: provisions a throwaway org + sibling-per-runtime, drives + # the LITERAL list_peers MCP call per runtime, asserts 200 + expected + # peer set, then scoped teardown. push(main)/dispatch/cron only. + peer-visibility: + name: E2E Peer Visibility + runs-on: ubuntu-latest + if: github.event_name != 'pull_request' + timeout-minutes: 60 + + env: + MOLECULE_CP_URL: https://staging-api.moleculesai.app + MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} + # LLM provider key so each runtime can authenticate at boot. + # Priority MiniMax → direct-Anthropic → OpenAI matches + # test_staging_full_saas.sh's secrets-injection chain. 
+ E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }} + E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }} + E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }} + E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}" + PV_RUNTIMES: "hermes openclaw claude-code" + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Verify admin token present + run: | + if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then + echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)" + exit 2 + fi + echo "Admin token present" + + - name: Verify an LLM key present + run: | + if [ -z "${E2E_MINIMAX_API_KEY:-}" ] && [ -z "${E2E_ANTHROPIC_API_KEY:-}" ] && [ -z "${E2E_OPENAI_API_KEY:-}" ]; then + echo "::error::No LLM provider key set — workspaces fail at boot with 'No provider API key found'. Set MOLECULE_STAGING_MINIMAX_API_KEY (or ANTHROPIC / OPENAI)." + exit 2 + fi + echo "LLM key present" + + - name: CP staging health preflight + run: | + code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health") + if [ "$code" != "200" ]; then + echo "::error::Staging CP unhealthy (HTTP $code) — infra, not a workspace bug. Failing loud per feedback_fix_root_not_symptom." + exit 1 + fi + echo "Staging CP healthy" + + - name: Run fresh-provision peer-visibility E2E (literal MCP list_peers) + run: bash tests/e2e/test_peer_visibility_mcp_staging.sh + + # Belt-and-braces scoped teardown: the script installs an EXIT/INT/ + # TERM trap, but if the runner itself is cancelled the trap may not + # fire. This always() step deletes ONLY the e2e-pv- org this + # run created — never a cluster-wide sweep + # (feedback_never_run_cluster_cleanup_tests_on_live_platform). The + # admin DELETE is idempotent so double-invoking is safe; + # sweep-stale-e2e-orgs is the final net (slug starts with 'e2e-'). 
+ - name: Teardown safety net (runs on cancel/failure) + if: always() + env: + ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} + run: | + set +e + orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \ + -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \ + | python3 -c " + import json, sys, os, datetime + run_id = os.environ.get('GITHUB_RUN_ID', '') + try: + d = json.load(sys.stdin) + except Exception: + print(''); sys.exit(0) + # ONLY sweep slugs from THIS run. e2e-pv---... + # Sweep today AND yesterday's UTC date so a midnight-crossing run + # still matches its own slug (same bug class as the saas/canvas + # safety nets). + today = datetime.date.today() + yest = today - datetime.timedelta(days=1) + dates = (today.strftime('%Y%m%d'), yest.strftime('%Y%m%d')) + if run_id: + prefixes = tuple(f'e2e-pv-{dt}-{run_id}-' for dt in dates) + else: + prefixes = tuple(f'e2e-pv-{dt}-' for dt in dates) + orgs = d if isinstance(d, list) else d.get('orgs', []) + cands = [o['slug'] for o in orgs + if any(o.get('slug','').startswith(p) for p in prefixes) + and o.get('instance_status') not in ('purged',)] + print('\n'.join(cands)) + " 2>/dev/null) + for slug in $orgs; do + echo "Safety-net teardown: $slug" + set +e + curl -sS -o /tmp/pv-cleanup.out -w "%{http_code}" \ + -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ + -H "Authorization: Bearer $ADMIN_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{\"confirm\":\"$slug\"}" >/tmp/pv-cleanup.code + set -e + code=$(cat /tmp/pv-cleanup.code 2>/dev/null || echo "000") + if [ "$code" = "200" ] || [ "$code" = "204" ]; then + echo "[teardown] deleted $slug (HTTP $code)" + else + echo "::warning::pv teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within MAX_AGE_MINUTES. 
Body: $(head -c 300 /tmp/pv-cleanup.out 2>/dev/null)" + fi + done + exit 0 diff --git a/.gitea/workflows/gate-check-v3.yml b/.gitea/workflows/gate-check-v3.yml index b1175977e..27aba8798 100644 --- a/.gitea/workflows/gate-check-v3.yml +++ b/.gitea/workflows/gate-check-v3.yml @@ -83,25 +83,41 @@ jobs: REPO: ${{ github.repository }} run: | set -euo pipefail - # Fetch all open PRs and run gate-check on each - # socket.setdefaulttimeout(15): defence-in-depth for missing SOP_TIER_CHECK_TOKEN. - # gate_check.py uses timeout=15 on every urlopen call; this catches the - # inline Python polling loop too (issue #603). + # Fetch all open PRs and run gate-check on each. This scheduled + # refresher is advisory; a transient Gitea list timeout must not turn + # main red. PR-specific gate-check runs still use normal failure + # semantics. pr_numbers=$(python3 <<'PY' import json import os import socket + import sys + import time + import urllib.error import urllib.request - socket.setdefaulttimeout(15) + socket.setdefaulttimeout(30) token = os.environ["GITEA_TOKEN"] repo = os.environ["REPO"] - req = urllib.request.Request( - f"https://git.moleculesai.app/api/v1/repos/{repo}/pulls?state=open&limit=100", - headers={"Authorization": f"token {token}", "Accept": "application/json"}, - ) - with urllib.request.urlopen(req) as r: - prs = json.loads(r.read()) + url = f"https://git.moleculesai.app/api/v1/repos/{repo}/pulls?state=open&limit=100" + last_error = None + for attempt in range(1, 4): + req = urllib.request.Request( + url, + headers={"Authorization": f"token {token}", "Accept": "application/json"}, + ) + try: + with urllib.request.urlopen(req, timeout=30) as r: + prs = json.loads(r.read()) + break + except (TimeoutError, OSError, urllib.error.URLError, urllib.error.HTTPError) as exc: + last_error = exc + print(f"warning: PR list fetch attempt {attempt}/3 failed: {exc}", file=sys.stderr) + if attempt < 3: + time.sleep(2 * attempt) + else: + print(f"warning: skipped scheduled 
gate-check refresh; failed to list open PRs after 3 attempts: {last_error}", file=sys.stderr) + raise SystemExit(0) for pr in prs: print(pr["number"]) PY diff --git a/.gitea/workflows/handlers-postgres-integration.yml b/.gitea/workflows/handlers-postgres-integration.yml index 65203fc3e..b590accf3 100644 --- a/.gitea/workflows/handlers-postgres-integration.yml +++ b/.gitea/workflows/handlers-postgres-integration.yml @@ -86,7 +86,11 @@ jobs: steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: - fetch-depth: 0 + # A full-history checkout can exceed the runner's quiet/startup + # window before the path filter emits logs. Fetch the common push + # case cheaply; the script below fetches the exact BASE SHA if it is + # not present in the shallow checkout. + fetch-depth: 2 - id: filter # Inline replacement for dorny/paths-filter — see e2e-api.yml. run: | diff --git a/.gitea/workflows/lint-continue-on-error-tracking.yml b/.gitea/workflows/lint-continue-on-error-tracking.yml index cc06bca79..8cb854bde 100644 --- a/.gitea/workflows/lint-continue-on-error-tracking.yml +++ b/.gitea/workflows/lint-continue-on-error-tracking.yml @@ -93,7 +93,7 @@ jobs: lint: name: lint-continue-on-error-tracking runs-on: ubuntu-latest - timeout-minutes: 10 + timeout-minutes: 20 # Phase 3 (RFC #219 §1): surface masked defects without blocking # PRs. Pre-existing continue-on-error: true directives on main # all violate this lint at first — intentional. Flip to false diff --git a/.gitea/workflows/publish-canvas-image.yml b/.gitea/workflows/publish-canvas-image.yml index 9aedadd64..818a4cad7 100644 --- a/.gitea/workflows/publish-canvas-image.yml +++ b/.gitea/workflows/publish-canvas-image.yml @@ -49,13 +49,17 @@ jobs: # bp-exempt: post-merge image publication side effect; CI / all-required gates source changes. build-and-push: name: Build & push canvas image - # REVERTED (infra/revert-docker-runner-label): `runs-on: ubuntu-latest` restored. 
- # The `docker` label is not registered on any act_runner. `runs-on: [ubuntu-latest, docker]` - # causes jobs to queue indefinitely with zero eligible runners — strictly worse than the - # pre-#599 coin-flip (50% success rate). Once the `docker` label is registered on - # ≥2 runners, re-apply the fix from #599 (infra/docker-runner-label). - # See issue #576 + infra-lead pulse ~00:30Z. - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). Ship + # path (on: push:main, canvas/**) — reserved capacity so a merged + # canvas fix's image build never FIFO-queues behind PR required-CI. + # The `publish` label resolves ONLY to the molecule-runner-publish-* + # sub-pool (config.publish.yaml). HARD DEPENDENCY: this MUST land + # AFTER the publish-lane runners are registered/advertising `publish` + # — the earlier #599 `docker` label attempt queued indefinitely with + # zero eligible runners precisely because the label was targeted + # before any runner advertised it (see #576). The lane is registered + # in this rollout (internal#462) so the precondition holds. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true diff --git a/.gitea/workflows/publish-runtime.yml b/.gitea/workflows/publish-runtime.yml index fe46e812f..665ca6bb5 100644 --- a/.gitea/workflows/publish-runtime.yml +++ b/.gitea/workflows/publish-runtime.yml @@ -66,7 +66,10 @@ concurrency: jobs: publish: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). Ship + # path (on: push tag runtime-v*) — reserved capacity, never FIFO + # behind PR-CI. `publish` resolves only to molecule-runner-publish-*. 
+ runs-on: publish outputs: version: ${{ steps.version.outputs.version }} wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }} @@ -159,6 +162,7 @@ jobs: exit 1 fi python -m twine upload \ + --verbose \ --repository pypi \ --username __token__ \ --password "$PYPI_TOKEN" \ @@ -166,7 +170,9 @@ jobs: cascade: needs: publish - runs-on: ubuntu-latest + # Publish/release lane (internal#462) — downstream of the runtime + # publish ship job; keep it on the reserved lane too. + runs-on: publish steps: - name: Wait for PyPI to propagate the new version env: diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml index 02a42962a..3f70ca2b3 100644 --- a/.gitea/workflows/publish-workspace-server-image.yml +++ b/.gitea/workflows/publish-workspace-server-image.yml @@ -54,7 +54,14 @@ env: jobs: build-and-push: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). This + # is a post-merge ship job (on: push:main) — it must NOT FIFO-compete + # with PR required-CI on the shared pool (PR#1350's prod image build + # was delayed ~25min this way). The `publish` label resolves ONLY to + # the reserved molecule-runner-publish-* sub-pool (config.publish.yaml, + # OUTSIDE the managed 1..20 range) so a merged fix's image build + # starts immediately while PR-CI keeps the general pool. + runs-on: publish steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -181,7 +188,9 @@ jobs: name: Production auto-deploy needs: build-and-push if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} - runs-on: ubuntu-latest + # Publish/release lane (internal#462) — production deploy of a merged + # fix; reserved capacity, never queued behind PR-CI. 
+ runs-on: publish timeout-minutes: 75 env: CP_URL: ${{ vars.PROD_CP_URL || 'https://api.moleculesai.app' }} diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml index 259df5562..f458501c0 100644 --- a/.gitea/workflows/redeploy-tenants-on-main.yml +++ b/.gitea/workflows/redeploy-tenants-on-main.yml @@ -68,7 +68,10 @@ jobs: # bp-exempt: production redeploy is a side-effect workflow, not a merge gate. redeploy: if: ${{ github.event_name == 'workflow_dispatch' }} - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). + # Production tenant redeploy — a deploy action, reserved capacity so + # it never queues behind PR-CI. `publish` -> molecule-runner-publish-*. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true diff --git a/.gitea/workflows/redeploy-tenants-on-staging.yml b/.gitea/workflows/redeploy-tenants-on-staging.yml index 98f6b2276..534a977e1 100644 --- a/.gitea/workflows/redeploy-tenants-on-staging.yml +++ b/.gitea/workflows/redeploy-tenants-on-staging.yml @@ -75,7 +75,10 @@ env: jobs: # bp-exempt: post-merge staging redeploy side effect; CI / all-required gates source changes. redeploy: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). + # Post-merge staging redeploy — a deploy action, reserved capacity. + # `publish` -> molecule-runner-publish-* sub-pool. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. 
continue-on-error: true diff --git a/.gitea/workflows/review-refire-comments.yml b/.gitea/workflows/review-refire-comments.yml index c799c442a..eb1c6b692 100644 --- a/.gitea/workflows/review-refire-comments.yml +++ b/.gitea/workflows/review-refire-comments.yml @@ -18,6 +18,10 @@ permissions: pull-requests: read statuses: write +concurrency: + group: ${{ github.repository }}-${{ github.workflow }}-${{ github.event.issue.number || github.ref }} + cancel-in-progress: true + jobs: dispatch: runs-on: ubuntu-latest diff --git a/.gitea/workflows/sop-checklist.yml b/.gitea/workflows/sop-checklist.yml index fe86219f2..85ebf50a1 100644 --- a/.gitea/workflows/sop-checklist.yml +++ b/.gitea/workflows/sop-checklist.yml @@ -70,7 +70,7 @@ name: sop-checklist # Cancel any in-progress runs for the same PR to prevent # stale runs from overwriting newer status contexts. concurrency: - group: ${{ github.repository }}-${{ github.event.pull_request.number }} + group: ${{ github.repository }}-${{ github.workflow }}-${{ github.event.pull_request.number || github.event.issue.number || github.ref }} cancel-in-progress: true # bp-required: yes ← emits sop-checklist / all-items-acked (pull_request) diff --git a/.gitea/workflows/sop-tier-check.yml b/.gitea/workflows/sop-tier-check.yml index 235ed6334..1f9eb8889 100644 --- a/.gitea/workflows/sop-tier-check.yml +++ b/.gitea/workflows/sop-tier-check.yml @@ -61,6 +61,10 @@ on: pull_request_review: types: [submitted, dismissed, edited] +concurrency: + group: ${{ github.repository }}-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: tier-check: runs-on: ubuntu-latest diff --git a/.staging-trigger b/.staging-trigger index 270a65607..8878315ce 100644 --- a/.staging-trigger +++ b/.staging-trigger @@ -1 +1 @@ -staging trigger \ No newline at end of file +staging trigger 2026-05-14T17:35:02Z diff --git a/_ci_trigger.txt b/_ci_trigger.txt new file mode 100644 index 000000000..b28fbc7a3 --- 
/dev/null +++ b/_ci_trigger.txt @@ -0,0 +1 @@ +trigger \ No newline at end of file diff --git a/canvas/playwright.config.ts b/canvas/playwright.config.ts index 2aa027e9c..88c32e0d7 100644 --- a/canvas/playwright.config.ts +++ b/canvas/playwright.config.ts @@ -8,7 +8,7 @@ export default defineConfig({ workers: 1, retries: 0, use: { - baseURL: "http://localhost:3000", + baseURL: process.env.PLAYWRIGHT_BASE_URL || "http://localhost:3000", headless: true, screenshot: "only-on-failure", }, diff --git a/canvas/src/components/__tests__/ThemeToggle.test.tsx b/canvas/src/components/__tests__/ThemeToggle.test.tsx index 4128d3d70..08b875a4b 100644 --- a/canvas/src/components/__tests__/ThemeToggle.test.tsx +++ b/canvas/src/components/__tests__/ThemeToggle.test.tsx @@ -24,8 +24,12 @@ vi.mock("@/lib/theme-provider", () => ({ })), })); +// Wrap cleanup in act() so any pending React state updates (e.g. from +// keyDown handlers that call setTheme) flush before DOM unmount. Without +// this, cleanup() can race against pending renders and cause INDEX_SIZE_ERR +// when the handleKeyDown callback tries to query the DOM mid-teardown. 
afterEach(() => { - cleanup(); + act(() => { cleanup(); }); vi.clearAllMocks(); }); @@ -146,7 +150,7 @@ describe("ThemeToggle — keyboard navigation (WCAG 2.1.1 / ARIA radiogroup)", ( const radios = screen.getAllByRole("radio"); // dark (index 2) is current; ArrowRight should wrap to light (index 0) act(() => { radios[2].focus(); }); - fireEvent.keyDown(radios[2], { key: "ArrowRight" }); + act(() => { fireEvent.keyDown(radios[2], { key: "ArrowRight" }); }); expect(mockSetTheme).toHaveBeenCalledWith("light"); }); @@ -160,7 +164,7 @@ describe("ThemeToggle — keyboard navigation (WCAG 2.1.1 / ARIA radiogroup)", ( const radios = screen.getAllByRole("radio"); // light (index 0) is current; ArrowLeft should go to dark (index 2) act(() => { radios[0].focus(); }); - fireEvent.keyDown(radios[0], { key: "ArrowLeft" }); + act(() => { fireEvent.keyDown(radios[0], { key: "ArrowLeft" }); }); expect(mockSetTheme).toHaveBeenCalledWith("dark"); }); @@ -174,7 +178,7 @@ describe("ThemeToggle — keyboard navigation (WCAG 2.1.1 / ARIA radiogroup)", ( const radios = screen.getAllByRole("radio"); // light (index 0) is current; ArrowDown should go to system (index 1) act(() => { radios[0].focus(); }); - fireEvent.keyDown(radios[0], { key: "ArrowDown" }); + act(() => { fireEvent.keyDown(radios[0], { key: "ArrowDown" }); }); expect(mockSetTheme).toHaveBeenCalledWith("system"); }); @@ -187,7 +191,7 @@ describe("ThemeToggle — keyboard navigation (WCAG 2.1.1 / ARIA radiogroup)", ( render(); const radios = screen.getAllByRole("radio"); act(() => { radios[2].focus(); }); - fireEvent.keyDown(radios[2], { key: "Home" }); + act(() => { fireEvent.keyDown(radios[2], { key: "Home" }); }); expect(mockSetTheme).toHaveBeenCalledWith("light"); }); @@ -200,14 +204,14 @@ describe("ThemeToggle — keyboard navigation (WCAG 2.1.1 / ARIA radiogroup)", ( render(); const radios = screen.getAllByRole("radio"); act(() => { radios[0].focus(); }); - fireEvent.keyDown(radios[0], { key: "End" }); + act(() => { 
fireEvent.keyDown(radios[0], { key: "End" }); }); expect(mockSetTheme).toHaveBeenCalledWith("dark"); }); it("does nothing on unrelated keys", () => { render(); const radios = screen.getAllByRole("radio"); - fireEvent.keyDown(radios[0], { key: "Enter" }); + act(() => { fireEvent.keyDown(radios[0], { key: "Enter" }); }); expect(mockSetTheme).not.toHaveBeenCalled(); }); }); diff --git a/canvas/src/components/mobile/MobileSpawn.tsx b/canvas/src/components/mobile/MobileSpawn.tsx index 01c53c7c1..7ee62e89d 100644 --- a/canvas/src/components/mobile/MobileSpawn.tsx +++ b/canvas/src/components/mobile/MobileSpawn.tsx @@ -12,6 +12,7 @@ import { useEffect, useState } from "react"; import { api } from "@/lib/api"; import { type Template } from "@/lib/deploy-preflight"; +import { isSaaSTenant } from "@/lib/tenant"; import { tierCode } from "./palette"; import { MOBILE_FONT_MONO, MOBILE_FONT_SANS, type MobilePalette, usePalette } from "./palette"; @@ -26,6 +27,7 @@ const TIER_LABEL: Record<"T1" | "T2" | "T3" | "T4", string> = { export function MobileSpawn({ dark, onClose }: { dark: boolean; onClose: () => void }) { const p = usePalette(dark); + const isSaaS = isSaaSTenant(); const [templates, setTemplates] = useState([]); const [loadingTemplates, setLoadingTemplates] = useState(true); const [tplId, setTplId] = useState(null); @@ -43,7 +45,7 @@ export function MobileSpawn({ dark, onClose }: { dark: boolean; onClose: () => v setTemplates(list); if (list.length > 0) { setTplId(list[0].id); - setTier(tierCode(list[0].tier)); + setTier(isSaaS ? 
"T4" : tierCode(list[0].tier)); } }) .catch(() => { @@ -55,7 +57,7 @@ export function MobileSpawn({ dark, onClose }: { dark: boolean; onClose: () => v return () => { cancelled = true; }; - }, []); + }, [isSaaS]); const handleSpawn = async () => { if (busy || !tplId) return; @@ -67,7 +69,7 @@ export function MobileSpawn({ dark, onClose }: { dark: boolean; onClose: () => v await api.post<{ id: string }>("/workspaces", { name: (name.trim() || chosen.name), template: chosen.id, - tier: Number(tier.slice(1)), + tier: isSaaS ? 4 : Number(tier.slice(1)), canvas: { x: Math.random() * 400 + 100, y: Math.random() * 300 + 100, @@ -203,7 +205,7 @@ export function MobileSpawn({ dark, onClose }: { dark: boolean; onClose: () => v > {templates.map((t) => { const on = tplId === t.id; - const tCode = tierCode(t.tier); + const tCode = isSaaS ? "T4" : tierCode(t.tier); return (