Compare commits
23 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 16b203fff1 | |||
| c58ffd2828 | |||
| a9bc5e39d5 | |||
| 2ee97c097d | |||
| ee9dc5b9c5 | |||
| 5455ddefe2 | |||
| 80d517b8ab | |||
| dbbd351c70 | |||
| 55fa44571e | |||
| 676f9a033b | |||
| 90467540dd | |||
| 7932bc4c48 | |||
| dd3090c894 | |||
| 6f69c62d5b | |||
| bf3f044786 | |||
| 03cee314ba | |||
| 896afc5bd7 | |||
| a224b26c0f | |||
| f17375a901 | |||
| 6602361bf5 | |||
| 47d24be523 | |||
| 7704afcf90 | |||
| 02e305f6f5 |
+149
-12
@@ -64,11 +64,41 @@ import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import resource
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from typing import Any, Callable
|
||||
from typing import Any, Callable, Iterator
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Address-space guardrail (RFC#369 / task #369 follow-up to mc#1242-class OOM).
|
||||
#
|
||||
# `get_issue_comments` paginates the full comment history of a PR. On
|
||||
# bot-relay-heavy PRs (e.g. mc#291, mc#1242) this can balloon past the
|
||||
# runner's cgroup memory limit and 137 the job. Cap virtual-address-space
|
||||
# at 2 GiB so the script OOMs as a `MemoryError` (catchable / surfaceable)
|
||||
# rather than a SIGKILL we can't post a status for.
|
||||
#
|
||||
# 2 GiB is generous — a 5000-comment PR with 1 KiB minimal-dicts (see
|
||||
# get_issue_comments below) fits in ~10 MiB, leaving plenty of headroom
|
||||
# for the Python runtime + urllib + json buffers.
|
||||
#
|
||||
# Not applied when SOP_CHECKLIST_NO_RLIMIT is set to a non-empty value. Nothing
|
||||
# here detects pytest / dry-run automatically: export SOP_CHECKLIST_NO_RLIMIT=1 in those runs, where RLIMIT_AS would interfere with test runner memory needs.
|
||||
if not os.environ.get("SOP_CHECKLIST_NO_RLIMIT"):
|
||||
try:
|
||||
resource.setrlimit(resource.RLIMIT_AS, (2 * 1024**3, 2 * 1024**3))
|
||||
except (ValueError, OSError):
|
||||
# macOS sometimes refuses RLIMIT_AS; not fatal — the Linux runner
|
||||
# is the only place this matters for the OOM-prevention goal.
|
||||
pass
|
||||
|
||||
# Per-comment body cap (task #369). The directive parser walks the body
|
||||
# line-by-line looking for ^/sop-ack ^/sop-revoke ^/sop-n/a markers — only
|
||||
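# the first few KiB matter for that. Cap each retained comment body so a single
|
||||
# pasted-log comment can't stay resident in the collected list and push us past the cgroup limit. The cap is compared against len(body), i.e. characters of the decoded string rather than encoded bytes, and it is applied after the page has already been fetched and parsed.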
|
||||
_MAX_BODY_BYTES = int(os.environ.get("SOP_CHECKLIST_MAX_BODY_BYTES") or 8 * 1024)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -460,16 +490,35 @@ class GiteaClient:
|
||||
raise RuntimeError(f"GET pulls/{pr} → HTTP {code}: {data!r}")
|
||||
return data
|
||||
|
||||
def get_issue_comments(
|
||||
self, owner: str, repo: str, issue: int
|
||||
) -> list[dict[str, Any]]:
|
||||
# Paginate. Gitea default page size 50.
|
||||
out: list[dict[str, Any]] = []
|
||||
def iter_issue_comments(
|
||||
self, owner: str, repo: str, issue: int, page_size: int = 50
|
||||
) -> Iterator[dict[str, Any]]:
|
||||
"""Stream comments page-by-page, yielding ONE minimal-dict per comment.
|
||||
|
||||
Each yielded comment carries ONLY the fields the gate actually reads
|
||||
— `{"user": {"login": str}, "body": str}` — and DROPS the much
|
||||
larger Gitea-API extras (html_url, pull_request_url, issue_url,
|
||||
assets, created_at, updated_at, id, original_author_*).
|
||||
|
||||
Memory motivation (task #369 / mc#1242-class OOM): full Gitea
|
||||
comment dicts are ~2 KiB median + ~3 KiB p95. On PRs with several
|
||||
thousand bot-relay comments the eager `list[full_dict]` shape used
|
||||
previously pushed runner anon-rss past the cgroup limit. The
|
||||
minimal-dict shape is ~10-20x smaller (typically ~50-100B Python
|
||||
overhead + the body string).
|
||||
|
||||
The two downstream consumers (`compute_ack_state`,
|
||||
`compute_na_state`) each iterate the comment list exactly once and
|
||||
read only `body` + `user.login`, so dropping every other field is
|
||||
safe. They still receive `list[dict[str, Any]]`-shaped objects so
|
||||
the test fixtures (which already used the minimal shape) keep
|
||||
working with no fixture changes.
|
||||
"""
|
||||
page = 1
|
||||
while True:
|
||||
code, data = self._req(
|
||||
"GET",
|
||||
f"/repos/{owner}/{repo}/issues/{issue}/comments?limit=50&page={page}",
|
||||
f"/repos/{owner}/{repo}/issues/{issue}/comments?limit={page_size}&page={page}",
|
||||
)
|
||||
if code != 200:
|
||||
raise RuntimeError(
|
||||
@@ -477,10 +526,41 @@ class GiteaClient:
|
||||
)
|
||||
if not data:
|
||||
break
|
||||
out.extend(data)
|
||||
if len(data) < 50:
|
||||
for c in data:
|
||||
# Minimal projection — drop ALL fields the gate doesn't read.
|
||||
user_login = ((c.get("user") or {}).get("login") or "") if isinstance(c, dict) else ""
|
||||
body = (c.get("body") if isinstance(c, dict) else "") or ""
|
||||
# Body-size guardrail: huge comments (e.g. pasted CI logs) can
|
||||
# individually be MiBs. The directive parser only needs the
|
||||
# first ~8 KiB to find /sop-ack /sop-revoke /sop-n/a markers
|
||||
# — anything past that is filler. Truncate at 8 KiB so a
|
||||
# single oversized comment can't OOM the runner.
|
||||
if len(body) > _MAX_BODY_BYTES:
|
||||
body = body[:_MAX_BODY_BYTES]
|
||||
yield {"user": {"login": user_login}, "body": body}
|
||||
if len(data) < page_size:
|
||||
break
|
||||
page += 1
|
||||
|
||||
def get_issue_comments(
|
||||
self,
|
||||
owner: str,
|
||||
repo: str,
|
||||
issue: int,
|
||||
max_comments: int | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Paginate + collect minimal comment dicts. See `iter_issue_comments`
|
||||
for the per-comment shape and the OOM-prevention rationale.
|
||||
|
||||
`max_comments` (optional, default unbounded): hard cap. When the cap
|
||||
is hit we stop fetching further pages and the caller surfaces a
|
||||
soft 'skipping due to volume' status (see main()).
|
||||
"""
|
||||
out: list[dict[str, Any]] = []
|
||||
for c in self.iter_issue_comments(owner, repo, issue):
|
||||
out.append(c)
|
||||
if max_comments is not None and len(out) >= max_comments:
|
||||
break
|
||||
return out
|
||||
|
||||
def resolve_team_id(self, org: str, team_name: str) -> int | None:
|
||||
@@ -832,6 +912,17 @@ def main(argv: list[str] | None = None) -> int:
|
||||
"thing BP sees is the POSTed status. Useful for local debugging."
|
||||
),
|
||||
)
|
||||
p.add_argument(
|
||||
"--max-comments",
|
||||
type=int,
|
||||
default=int(os.environ.get("SOP_CHECKLIST_MAX_COMMENTS") or 5000),
|
||||
help=(
|
||||
"Hard cap on comments fetched from the PR. Above this we post "
|
||||
"a SOFT-pending status with a 'skipping due to volume' note "
|
||||
"instead of OOM'ing the runner (task #369). Override with the "
|
||||
"SOP_CHECKLIST_MAX_COMMENTS env var. Set 0 to disable the cap."
|
||||
),
|
||||
)
|
||||
args = p.parse_args(argv)
|
||||
|
||||
token = os.environ.get("GITEA_TOKEN", "")
|
||||
@@ -865,7 +956,18 @@ def main(argv: list[str] | None = None) -> int:
|
||||
print("::error::PR payload missing user.login or head.sha", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
comments = client.get_issue_comments(args.owner, args.repo, args.pr)
|
||||
max_comments_cap = args.max_comments if args.max_comments and args.max_comments > 0 else None
|
||||
comments = client.get_issue_comments(
|
||||
args.owner, args.repo, args.pr, max_comments=max_comments_cap
|
||||
)
|
||||
|
||||
# Volume short-circuit: PRs with thousands of bot-relay comments
|
||||
# (the mc#1242-class OOM source) get a soft 'volume-skipped' status
|
||||
# so the gate doesn't churn the runner; reviewers can re-trigger by
|
||||
# editing the PR or filing a fresh PR with the housekeeping comments
|
||||
# split off. Cap-hit (len(comments) >= cap, which also covers a PR with exactly cap comments) means we may not have seen the WHOLE history, so we
|
||||
# can't fairly post failure — pending is the safe default.
|
||||
volume_skipped = bool(max_comments_cap and len(comments) >= max_comments_cap)
|
||||
|
||||
# High-risk classification (RFC#450 Option C, governance fix for
|
||||
# internal#442). Computed ONCE per PR — used by both the probe
|
||||
@@ -879,8 +981,34 @@ def main(argv: list[str] | None = None) -> int:
|
||||
team_member_cache: dict[tuple[str, int], bool | None] = {}
|
||||
|
||||
def probe(slug: str, users: list[str]) -> list[str]:
|
||||
item = items_by_slug[slug]
|
||||
team_names: list[str] = resolve_required_teams(item, high_risk)
|
||||
# `slug` may be either an items-key (compute_ack_state caller) OR
|
||||
# an n/a-gate key (compute_na_state caller). Previously this hard
|
||||
# KeyError'd on the n/a-gate path when slug was e.g. "security-review"
|
||||
# — that's a config gate, not an item — so the gate would crash
|
||||
# instead of falling back to the gate's own required_teams. Fix
|
||||
# task #369 follow-up to issue #355.
|
||||
if slug in items_by_slug:
|
||||
item = items_by_slug[slug]
|
||||
team_names: list[str] = resolve_required_teams(item, high_risk)
|
||||
elif slug in na_gates:
|
||||
# n/a-gate configs carry `required_teams` directly (see
|
||||
# sop-checklist-config.yaml: n/a_gates.<gate>.required_teams).
|
||||
gate_cfg = na_gates[slug] or {}
|
||||
team_names = list(gate_cfg.get("required_teams") or [])
|
||||
if not team_names:
|
||||
print(
|
||||
f"::warning::n/a-gate '{slug}' has no required_teams; "
|
||||
"fail-closed (no users will be approved)",
|
||||
file=sys.stderr,
|
||||
)
|
||||
else:
|
||||
# Unknown slug — fail closed, log so we can find config drift.
|
||||
print(
|
||||
f"::warning::probe() called with slug '{slug}' which is "
|
||||
f"neither an items entry nor an n/a-gate; fail-closed",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return []
|
||||
# Resolve names → ids. NOTE: orgs/{org}/teams/search may not be
|
||||
# available — fall back to the list endpoint.
|
||||
team_ids: list[int] = []
|
||||
@@ -938,6 +1066,15 @@ def main(argv: list[str] | None = None) -> int:
|
||||
# were not required (vs a tier:medium+ PR that truly passed all acks).
|
||||
state = "success"
|
||||
description = f"[info tier:low] {description}"
|
||||
if volume_skipped:
|
||||
# Above the comment-cap — we may have a partial view. Soft-pend
|
||||
# so neither BP nor the author gets stuck; surface the cap so
|
||||
# reviewers know what's up. No-block at the gate level.
|
||||
state = "pending"
|
||||
description = (
|
||||
f"[volume-skipped] comment-cap={max_comments_cap} hit; please file "
|
||||
f"a fresh PR with bot-relay history split off (#369). {description}"
|
||||
)
|
||||
|
||||
# Diagnostics to job log.
|
||||
print(
|
||||
|
||||
@@ -815,3 +815,192 @@ class TestHighRiskClassUsesElevatedListInConfig(unittest.TestCase):
|
||||
sop.resolve_required_teams(items[slug], high_risk=True),
|
||||
f"item {slug} should not be affected by risk-class",
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# get_issue_comments — streaming + minimal-dict shape (task #369 / OOM fix)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class _FakeReq:
    """Callable test double for GiteaClient._req that replays canned pages.

    Each call is logged in `calls` as a `(method, path)` tuple and answered
    with status 200 plus the next queued page. Once the queue is empty,
    every further call gets an empty list.
    """

    def __init__(self, pages):
        # pages: list[list[dict]], served one per call in the given order.
        # A shallow copy is kept so the caller's list is not consumed.
        self.calls = []
        self._pages = [*pages]

    def __call__(self, method, path, body=None, ok_codes=(200, 201, 204)):
        # `body` and `ok_codes` are accepted only to match the call shape
        # used by the client; they do not influence the canned response.
        self.calls.append((method, path))
        payload = self._pages.pop(0) if self._pages else []
        return 200, payload
|
||||
|
||||
|
||||
class TestGetIssueCommentsStreaming(unittest.TestCase):
    """Guard the OOM-fix contract: minimal comment projection and paging stops."""

    def _client_with_pages(self, pages):
        # Replace the client's HTTP layer with canned pages so the tests
        # exercise only the paging / projection logic.
        stubbed = sop.GiteaClient("git.example.com", "tok")
        stubbed._req = _FakeReq(pages)  # type: ignore[method-assign]
        return stubbed

    def test_minimal_dict_shape_drops_large_fields(self):
        """Only user.login and body survive; every other API field is dropped."""
        raw_comment = {
            "id": 1234,
            "html_url": "https://example.com/some-huge-url",
            "pull_request_url": "https://example.com/some-other-huge-url",
            "issue_url": "https://example.com/yet-another-url",
            "user": {"login": "bob", "avatar_url": "x" * 4000, "id": 99},
            "original_author": "",
            "original_author_id": 0,
            "body": "/sop-ack comprehensive-testing\n\nlooks good",
            "assets": ["x" * 1000, "y" * 1000],
            "created_at": "2026-05-19T01:02:03Z",
            "updated_at": "2026-05-19T01:02:03Z",
        }
        client = self._client_with_pages([[raw_comment]])
        out = client.get_issue_comments("o", "r", 1)
        self.assertEqual(len(out), 1)
        projected = out[0]
        # Exactly the two whitelisted keys, with user reduced to its login.
        self.assertEqual(set(projected.keys()), {"user", "body"})
        self.assertEqual(set(projected["user"].keys()), {"login"})
        self.assertEqual(projected["user"]["login"], "bob")
        self.assertEqual(
            projected["body"], "/sop-ack comprehensive-testing\n\nlooks good"
        )
        # The bulky extras must not leak through.
        for dropped_key in ("html_url", "assets", "created_at"):
            self.assertNotIn(dropped_key, projected)

    def test_pagination_break_on_short_page(self):
        # A page shorter than the page size of 50 signals the last page.
        short_page = [{"user": {"login": "u"}, "body": "x"}] * 7
        client = self._client_with_pages([short_page])
        fetched = client.get_issue_comments("o", "r", 2)
        self.assertEqual(len(fetched), 7)
        # Exactly one request: no probe for a second page.
        self.assertEqual(len(client._req.calls), 1)

    def test_pagination_continues_until_empty(self):
        # Two full pages followed by one short page.
        pages = [
            [{"user": {"login": "u"}, "body": text}] * count
            for text, count in (("x", 50), ("y", 50), ("z", 3))
        ]
        client = self._client_with_pages(pages)
        fetched = client.get_issue_comments("o", "r", 3)
        self.assertEqual(len(fetched), 103)
        self.assertEqual(len(client._req.calls), 3)

    def test_max_comments_caps_collection(self):
        # Three full pages are available, but the cap is reached on page 2.
        pages = [
            [{"user": {"login": "u"}, "body": text}] * 50
            for text in ("x", "y", "z")
        ]
        client = self._client_with_pages(pages)
        fetched = client.get_issue_comments("o", "r", 4, max_comments=75)
        self.assertEqual(len(fetched), 75)
        # Collection stopped early, so page 3 must not have been requested.
        self.assertLessEqual(len(client._req.calls), 2)

    def test_oversized_body_truncated(self):
        # A multi-MiB body (e.g. a pasted CI log) must come back truncated,
        # with the leading directive still intact.
        directive = "/sop-ack comprehensive-testing"
        huge_body = "/sop-ack comprehensive-testing\n" + ("X" * (4 * 1024 * 1024))
        client = self._client_with_pages(
            [[{"user": {"login": "bob"}, "body": huge_body}]]
        )
        out = client.get_issue_comments("o", "r", 99)
        self.assertEqual(len(out), 1)
        kept_body = out[0]["body"]
        # The test assumes the default 8 KiB cap.
        self.assertLessEqual(len(kept_body), 8 * 1024)
        self.assertTrue(kept_body.startswith(directive))

    def test_iter_handles_missing_user_or_body(self):
        # Defensive cases: null user, null body, and a missing user key.
        page = [
            {"user": None, "body": "abandoned-author"},
            {"user": {"login": "alice"}, "body": None},
            {"body": "no-user-key"},
            {"user": {"login": "bob"}, "body": "ok"},
        ]
        client = self._client_with_pages([page])
        out = client.get_issue_comments("o", "r", 5)
        self.assertEqual(len(out), 4)
        self.assertEqual(
            [comment["user"]["login"] for comment in out],
            ["", "alice", "", "bob"],
        )
        self.assertEqual(
            [comment["body"] for comment in out[:2]],
            ["abandoned-author", ""],
        )

    def test_minimal_dicts_work_with_compute_ack_state(self):
        """Minimal dicts from the client are accepted by compute_ack_state."""

        def approve_all(slug, users):
            # Probe stand-in: every candidate user counts as approved.
            return list(users)

        client = self._client_with_pages(
            [[{"user": {"login": "bob"}, "body": "/sop-ack comprehensive-testing"}]]
        )
        comments = client.get_issue_comments("o", "r", 6)
        state = sop.compute_ack_state(
            comments, "alice", _items_by_slug(), _numeric_aliases(), approve_all
        )
        self.assertEqual(state["comprehensive-testing"]["ackers"], ["bob"])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# probe() na-gate fallback — fix for #355-class KeyError 'security-review'
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestComputeNaStateAcceptsGateNotInItems(unittest.TestCase):
    """compute_na_state must work for gates that are not items entries.

    compute_na_state hands the gate NAME to the probe callback. For gates
    such as `security-review` that have no matching items entry, the
    production probe is expected to use the gate's own required_teams
    rather than raising KeyError on items_by_slug[slug].

    The production `probe` is a closure built inside main(), so it cannot
    be called from here. These tests go through the public
    compute_na_state surface with a stand-in probe that accepts either an
    item slug or an n/a-gate key.
    """

    def test_na_gate_with_required_teams_resolves_without_keyerror(self):
        na_gates = {
            "security-review": {
                "required_teams": ["security", "managers", "ceo"],
                "description": "security N/A",
            },
        }
        comments = [
            {"user": {"login": "carol"}, "body": "/sop-n/a security-review docs-only"},
        ]
        seen_slugs = []

        def probe(slug, users):
            # Record what compute_na_state asks about, then approve everyone.
            seen_slugs.append(slug)
            return list(users)

        na_state = sop.compute_na_state(comments, "alice", na_gates, probe)
        gate_state = na_state["security-review"]
        self.assertTrue(gate_state["declared"])
        self.assertEqual(gate_state["decl_ackers"], ["carol"])
        # The probe must be queried with the GATE name, not an item slug.
        self.assertEqual(seen_slugs, ["security-review"])

    def test_na_gate_self_declaration_rejected(self):
        # Existing invariant: the PR author cannot self-declare N/A, even
        # when the probe would approve them.
        def always_alice(*_):
            return ["alice"]

        na_gates = {"security-review": {"required_teams": ["security"]}}
        comments = [
            {"user": {"login": "alice"}, "body": "/sop-n/a security-review"},
        ]
        na_state = sop.compute_na_state(comments, "alice", na_gates, always_alice)
        self.assertFalse(na_state["security-review"]["declared"])
|
||||
|
||||
@@ -319,7 +319,7 @@ jobs:
|
||||
with:
|
||||
node-version: '22'
|
||||
- if: always()
|
||||
run: rm -f package-lock.json && npm install
|
||||
run: npm ci --include=optional --prefer-offline
|
||||
- if: always()
|
||||
run: npm run build
|
||||
- if: always()
|
||||
|
||||
@@ -43,6 +43,18 @@ name: Continuous synthetic E2E (staging)
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Every 30 minutes, on :02 and :32. This keeps a recurring SaaS
|
||||
# behavior probe while cutting runner occupancy from this workflow by
|
||||
# roughly two thirds; fast liveness belongs in the lighter smoke/heartbeat
|
||||
# probes, not in a full tenant/workspace synth every 10 minutes.
|
||||
#
|
||||
# Previous cadence was every 10 minutes (:02 :12 :22 :32 :42 :52).
|
||||
# The current operator-host runner pool is the bottleneck, so full
|
||||
# synth E2E is deliberately lower-cadence until it moves to a dedicated
|
||||
# runner host or warm-runtime pool.
|
||||
#
|
||||
# Historical notes from the 10-minute shape:
|
||||
#
|
||||
# Every 10 minutes, on :02 :12 :22 :32 :42 :52. Three constraints:
|
||||
# 1. Stay off the top-of-hour. GitHub Actions scheduler drops
|
||||
# :00 firings under high load (own docs:
|
||||
@@ -66,7 +78,7 @@ on:
|
||||
# fires = ~30 min cadence; closer to the 20-min target than the
|
||||
# current shape and provides a real degradation alarm if drops
|
||||
# get worse.
|
||||
- cron: '2,12,22,32,42,52 * * * *'
|
||||
- cron: '2,32 * * * *'
|
||||
permissions:
|
||||
contents: read
|
||||
# No issue-write here — failures surface as red runs in the workflow
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
name: E2E Chat
|
||||
|
||||
# Comprehensive Playwright E2E for the unified chat stack (desktop
|
||||
# ChatTab + mobile MobileChat). Runs on every PR that touches canvas,
|
||||
# workspace-server, or this workflow file.
|
||||
# ChatTab + mobile MobileChat). Heavy browser execution is intentionally
|
||||
# outside the normal required PR path: PRs run it only after entering the
|
||||
# `merge-queue`, while push/main, nightly, and manual dispatch preserve
|
||||
# coverage without making every PR pay the full runtime/browser cost.
|
||||
#
|
||||
# Architecture:
|
||||
# 1. Ephemeral Postgres + Redis (docker, unique container names)
|
||||
@@ -22,6 +24,11 @@ on:
|
||||
branches: [main, staging]
|
||||
pull_request:
|
||||
branches: [main, staging]
|
||||
schedule:
|
||||
# Nightly at 09:00 UTC. Keeps coverage for the currently non-required
|
||||
# heavy browser lane without spending runner time on every PR.
|
||||
- cron: '0 9 * * *'
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: e2e-chat-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
@@ -50,7 +57,14 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- id: decide
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
QUEUE_LABEL: merge-queue
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" = "schedule" ] || [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
echo "chat=true" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
|
||||
if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
|
||||
BASE="${{ github.event.pull_request.base.sha }}"
|
||||
@@ -67,9 +81,26 @@ jobs:
|
||||
exit 0
|
||||
fi
|
||||
CHANGED=$(git diff --name-only "$BASE" HEAD)
|
||||
if echo "$CHANGED" | grep -qE '^(canvas/|workspace-server/|\.gitea/workflows/e2e-chat\.yml$)'; then
|
||||
if ! echo "$CHANGED" | grep -qE '^(canvas/|workspace-server/|\.gitea/workflows/e2e-chat\.yml$)'; then
|
||||
echo "chat=false" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
if [ "${{ github.event_name }}" != "pull_request" ]; then
|
||||
echo "chat=true" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
authfile=$(mktemp)
|
||||
chmod 600 "$authfile"
|
||||
printf 'header = "Authorization: token %s"\n' "$GITEA_TOKEN" > "$authfile"
|
||||
labels=$(curl -fsS -K "$authfile" \
|
||||
"${{ github.server_url }}/api/v1/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/labels" \
|
||||
| python3 -c 'import json,sys; print("\n".join(label.get("name","") for label in json.load(sys.stdin)))')
|
||||
rm -f "$authfile"
|
||||
if printf '%s\n' "$labels" | grep -qx "$QUEUE_LABEL"; then
|
||||
echo "chat=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "PR is not in merge-queue; skipping heavy E2E Chat for normal PR path."
|
||||
echo "chat=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
@@ -230,7 +261,14 @@ jobs:
|
||||
- name: Install Playwright browsers
|
||||
if: needs.detect-changes.outputs.chat == 'true'
|
||||
working-directory: canvas
|
||||
run: npx playwright install --with-deps chromium
|
||||
run: |
|
||||
PREBAKED_PLAYWRIGHT=/ms-playwright
|
||||
if [ -d "${PREBAKED_PLAYWRIGHT}" ] && find "${PREBAKED_PLAYWRIGHT}" -maxdepth 3 -type f -name 'chrome' | grep -q .; then
|
||||
echo "Using prebaked Playwright Chromium from ${PREBAKED_PLAYWRIGHT}"
|
||||
echo "PLAYWRIGHT_BROWSERS_PATH=${PREBAKED_PLAYWRIGHT}" >> "$GITHUB_ENV"
|
||||
exit 0
|
||||
fi
|
||||
npx playwright install --with-deps chromium
|
||||
|
||||
- name: Start canvas dev server (background)
|
||||
if: needs.detect-changes.outputs.chat == 'true'
|
||||
|
||||
@@ -16,9 +16,9 @@ name: E2E Staging Canvas (Playwright)
|
||||
# e2e-staging-saas.yml (which tests the API shape) by exercising the
|
||||
# actual browser + canvas bundle against live staging.
|
||||
#
|
||||
# Triggers: push to main/staging or PR touching canvas sources + this workflow,
|
||||
# manual dispatch, and weekly cron to catch browser/runtime drift even
|
||||
# when canvas is quiet.
|
||||
# Triggers: push to main, PR touching canvas sources + this workflow only
|
||||
# after the PR enters `merge-queue`, manual dispatch, and scheduled cron to
|
||||
# catch browser/runtime drift even when canvas is quiet.
|
||||
# Added staging to push/pull_request branches so the auto-promote gate
|
||||
# check (--event push --branch staging) can see a completed run for this
|
||||
# workflow — mirrors what PR #1891 does for e2e-api.yml.
|
||||
@@ -37,9 +37,10 @@ on:
|
||||
pull_request:
|
||||
branches: [main]
|
||||
schedule:
|
||||
# Weekly on Sunday 08:00 UTC — catches Chrome / Playwright / Next.js
|
||||
# Nightly at 08:00 UTC — catches Chrome / Playwright / Next.js
|
||||
# release-note-shaped regressions that don't ride in with a PR.
|
||||
- cron: '0 8 * * 0'
|
||||
- cron: '0 8 * * *'
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
# Per-SHA grouping (changed 2026-04-28 from a single global group). The
|
||||
@@ -79,10 +80,13 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- id: decide
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
QUEUE_LABEL: merge-queue
|
||||
# Inline replacement for dorny/paths-filter — see e2e-api.yml.
|
||||
# Cron triggers always run real work (no diff context).
|
||||
# Cron and manual triggers always run real work (no diff context).
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" = "schedule" ]; then
|
||||
if [ "${{ github.event_name }}" = "schedule" ] || [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
echo "canvas=true" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
@@ -102,9 +106,26 @@ jobs:
|
||||
exit 0
|
||||
fi
|
||||
CHANGED=$(git diff --name-only "$BASE" HEAD)
|
||||
if echo "$CHANGED" | grep -qE '^(canvas/|\.gitea/workflows/e2e-staging-canvas\.yml$)'; then
|
||||
if ! echo "$CHANGED" | grep -qE '^(canvas/|\.gitea/workflows/e2e-staging-canvas\.yml$)'; then
|
||||
echo "canvas=false" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
if [ "${{ github.event_name }}" != "pull_request" ]; then
|
||||
echo "canvas=true" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
authfile=$(mktemp)
|
||||
chmod 600 "$authfile"
|
||||
printf 'header = "Authorization: token %s"\n' "$GITEA_TOKEN" > "$authfile"
|
||||
labels=$(curl -fsS -K "$authfile" \
|
||||
"${{ github.server_url }}/api/v1/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/labels" \
|
||||
| python3 -c 'import json,sys; print("\n".join(label.get("name","") for label in json.load(sys.stdin)))')
|
||||
rm -f "$authfile"
|
||||
if printf '%s\n' "$labels" | grep -qx "$QUEUE_LABEL"; then
|
||||
echo "canvas=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "PR is not in merge-queue; skipping heavy E2E Staging Canvas for normal PR path."
|
||||
echo "canvas=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
@@ -169,7 +190,14 @@ jobs:
|
||||
- name: Install Playwright browsers
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
timeout-minutes: 10
|
||||
run: npx playwright install --with-deps chromium
|
||||
run: |
|
||||
PREBAKED_PLAYWRIGHT=/ms-playwright
|
||||
if [ -d "${PREBAKED_PLAYWRIGHT}" ] && find "${PREBAKED_PLAYWRIGHT}" -maxdepth 3 -type f -name 'chrome' | grep -q .; then
|
||||
echo "Using prebaked Playwright Chromium from ${PREBAKED_PLAYWRIGHT}"
|
||||
echo "PLAYWRIGHT_BROWSERS_PATH=${PREBAKED_PLAYWRIGHT}" >> "$GITHUB_ENV"
|
||||
exit 0
|
||||
fi
|
||||
npx playwright install --with-deps chromium
|
||||
|
||||
- name: Run staging canvas E2E
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
|
||||
@@ -13,8 +13,12 @@ name: gitea-merge-queue
|
||||
# - add `merge-queue-hold` to pause a queued PR without removing it
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '*/5 * * * *'
|
||||
# Schedule moved to operator-config:
|
||||
# /etc/cron.d/molecule-core-merge-queue ->
|
||||
# /usr/local/bin/molecule-core-cron-bot.sh merge-queue
|
||||
#
|
||||
# The queue bot still processes one PR per tick, but no longer occupies
|
||||
# one of the shared Actions runners just to poll.
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
|
||||
@@ -28,12 +28,10 @@ on:
|
||||
pull_request:
|
||||
paths:
|
||||
- '.gitea/workflows/**'
|
||||
- '.github/workflows/**'
|
||||
push:
|
||||
branches: [main, staging]
|
||||
paths:
|
||||
- '.gitea/workflows/**'
|
||||
- '.github/workflows/**'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -75,8 +73,11 @@ jobs:
|
||||
fails = []
|
||||
warnings = []
|
||||
|
||||
# Gitea is SSOT for molecule-core CI per task #347 / memory
|
||||
# reference_molecule_core_actions_gitea_only. The legacy
|
||||
# .github/workflows/ tree was deleted in SSOT-Instance-4 (#331).
|
||||
roots = []
|
||||
for root in ('.gitea/workflows', '.github/workflows'):
|
||||
for root in ('.gitea/workflows',):
|
||||
if os.path.isdir(root):
|
||||
roots.append(root)
|
||||
|
||||
|
||||
@@ -42,7 +42,13 @@ permissions:
|
||||
packages: write
|
||||
|
||||
env:
|
||||
IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas
|
||||
# SSOT-Instance-10 (#333): ECR registry triplet (account.dkr.ecr.region.amazonaws.com)
|
||||
# sourced from org/repo var `ECR_REGISTRY` with the current prod-account literal as
|
||||
# bootstrap fallback. When the org var is set, the fallback becomes dead code and
|
||||
# switching accounts/regions is a one-line change at the org level (instead of
|
||||
# touching every workflow). Pattern mirrors `vars.CP_URL || 'literal'` already in
|
||||
# use below in this repo's staging-verify.yml.
|
||||
IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/canvas
|
||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
||||
|
||||
jobs:
|
||||
|
||||
@@ -141,7 +141,59 @@ jobs:
|
||||
/tmp/smoke/bin/pip install --quiet dist/*.whl
|
||||
/tmp/smoke/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
|
||||
|
||||
- name: Publish to PyPI
|
||||
# ─────────────────────────────────────────────────────────────────────
|
||||
# RFC#596 (2026-05-19): Gitea PyPI registry as PRIMARY, PyPI as
|
||||
# best-effort fallback. Eliminates the SPOF that caused the
|
||||
# 2026-05-19 P0 (PyPI abuse-block #593 + Railway outage #595).
|
||||
#
|
||||
# Order is inverted intentionally:
|
||||
# 1. Gitea FIRST — must succeed (our internal SSOT).
|
||||
# 2. PyPI SECOND — best-effort, non-fatal on failure (courtesy
|
||||
# mirror; our consumers don't depend on it after Phase 4
|
||||
# template Dockerfile updates).
|
||||
#
|
||||
# Endpoint shape (verified live in RFC#596 Phase 5):
|
||||
# POST https://git.moleculesai.app/api/packages/molecule-ai/pypi/
|
||||
# HTTP Basic auth: username = gitea username, password = PAT with
|
||||
# `write:package` scope. Returns 201 Created on success.
|
||||
# ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
- name: Publish to Gitea PyPI registry (PRIMARY)
|
||||
id: gitea_publish
|
||||
working-directory: ${{ runner.temp }}/runtime-build
|
||||
env:
|
||||
# MOLECULE_PYPI_GITEA_PUBLISHER_USER: Gitea username for the publisher
|
||||
# persona (must own a token with `write:package` scope).
|
||||
# Provisioned in RFC#596 Phase 3 (operator-config PR).
|
||||
# NOTE: secret name MUST NOT start with `GITEA_` or `GITHUB_` —
|
||||
# Gitea 1.22.6 reserves those prefixes for built-in env vars and
|
||||
# rejects repo-secret PUT with HTTP 400 / "invalid secret name".
|
||||
# Empirically reproduced 2026-05-19 against
|
||||
# `/repos/molecule-ai/molecule-core/actions/secrets/GITEA_*`.
|
||||
MOLECULE_PYPI_GITEA_PUBLISHER_USER: ${{ secrets.MOLECULE_PYPI_GITEA_PUBLISHER_USER }}
|
||||
# MOLECULE_PYPI_GITEA_PUBLISHER_TOKEN: PAT for the publisher persona,
|
||||
# `write:package` scope on molecule-ai org.
|
||||
# Synced from Infisical /ci/gitea-pypi-publisher (RFC#596 Phase 3).
|
||||
MOLECULE_PYPI_GITEA_PUBLISHER_TOKEN: ${{ secrets.MOLECULE_PYPI_GITEA_PUBLISHER_TOKEN }}
|
||||
run: |
|
||||
set -eu
|
||||
if [ -z "${MOLECULE_PYPI_GITEA_PUBLISHER_TOKEN:-}" ] || [ -z "${MOLECULE_PYPI_GITEA_PUBLISHER_USER:-}" ]; then
|
||||
echo "::error::MOLECULE_PYPI_GITEA_PUBLISHER_USER / MOLECULE_PYPI_GITEA_PUBLISHER_TOKEN secrets are not set."
|
||||
echo "::error::Provision them via the RFC#596 Phase 3 operator-config sync script."
|
||||
echo "::error::Gitea is the PRIMARY index per RFC#596 — publish job aborts here, NOT after PyPI."
|
||||
exit 1
|
||||
fi
|
||||
python -m twine upload \
|
||||
--verbose \
|
||||
--repository-url "https://git.moleculesai.app/api/packages/molecule-ai/pypi/" \
|
||||
--username "$MOLECULE_PYPI_GITEA_PUBLISHER_USER" \
|
||||
--password "$MOLECULE_PYPI_GITEA_PUBLISHER_TOKEN" \
|
||||
dist/*
|
||||
echo "gitea_status=success" >> "$GITHUB_OUTPUT"
|
||||
echo "gitea_url=https://git.moleculesai.app/api/packages/molecule-ai/pypi/simple/molecule-ai-workspace-runtime" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Publish to PyPI (FALLBACK, best-effort)
|
||||
id: pypi_publish
|
||||
# working-directory matches the preceding Build/Verify steps. Without
|
||||
# this, twine runs from the default workspace checkout dir where
|
||||
# `dist/` doesn't exist and fails with:
|
||||
@@ -149,6 +201,11 @@ jobs:
|
||||
# Caught on the first-ever successful dispatch of this workflow
|
||||
# (run 5097, 2026-05-11 02:08Z) — every other step in the publish
|
||||
# job already had this working-directory; Publish was missing it.
|
||||
#
|
||||
# RFC#596: this step is `continue-on-error: true` because PyPI is
|
||||
# NO LONGER the primary index. PyPI 403/timeout/abuse-block does
|
||||
# NOT block the publish — Gitea already has the wheel.
|
||||
continue-on-error: true
|
||||
working-directory: ${{ runner.temp }}/runtime-build
|
||||
env:
|
||||
# PYPI_TOKEN: repository secret scoped to molecule-ai-workspace-runtime.
|
||||
@@ -157,16 +214,42 @@ jobs:
|
||||
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
|
||||
run: |
|
||||
if [ -z "$PYPI_TOKEN" ]; then
|
||||
echo "::error::PYPI_TOKEN secret is not set — set it at Settings → Actions → Variables and Secrets → New Secret."
|
||||
echo "::error::Required format: pypi-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
|
||||
exit 1
|
||||
echo "::warning::PYPI_TOKEN secret is not set — skipping PyPI mirror publish (non-fatal per RFC#596)."
|
||||
echo "pypi_status=skipped_no_token" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
python -m twine upload \
|
||||
if python -m twine upload \
|
||||
--verbose \
|
||||
--repository pypi \
|
||||
--username __token__ \
|
||||
--password "$PYPI_TOKEN" \
|
||||
dist/*
|
||||
dist/*; then
|
||||
echo "pypi_status=success" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
rc=$?
|
||||
echo "::warning::PyPI mirror publish failed (exit $rc). Non-fatal per RFC#596 — Gitea has the wheel."
|
||||
echo "pypi_status=failed_exit_$rc" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
echo "pypi_url=https://pypi.org/project/molecule-ai-workspace-runtime/${{ steps.version.outputs.version }}/" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Publish job summary (Gitea + PyPI status)
|
||||
if: always()
|
||||
run: |
|
||||
{
|
||||
echo "## publish-runtime $(date -u +%FT%TZ)"
|
||||
echo
|
||||
echo "**Version:** \`${{ steps.version.outputs.version }}\`"
|
||||
echo "**Wheel SHA256:** \`${{ steps.wheel_hash.outputs.wheel_sha256 }}\`"
|
||||
echo
|
||||
echo "### Indexes"
|
||||
echo
|
||||
echo "| Index | Status | URL |"
|
||||
echo "|---------|-------------------------------------------------|-----|"
|
||||
echo "| Gitea (PRIMARY) | ${{ steps.gitea_publish.outputs.gitea_status || 'failed' }} | ${{ steps.gitea_publish.outputs.gitea_url || '—' }} |"
|
||||
echo "| PyPI (fallback) | ${{ steps.pypi_publish.outputs.pypi_status || 'failed' }} | ${{ steps.pypi_publish.outputs.pypi_url || '—' }} |"
|
||||
echo
|
||||
echo "Per RFC#596: Gitea is the contract. PyPI is best-effort."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
cascade:
|
||||
needs: publish
|
||||
@@ -184,6 +267,12 @@ jobs:
|
||||
echo "::error::publish job did not expose wheel_sha256 — cannot verify wheel content. Refusing to fan out cascade."
|
||||
exit 1
|
||||
fi
|
||||
# NOTE (RFC#596 follow-up): this propagation probe still resolves
|
||||
# against PyPI's default index. After RFC#596 Phase 4 lands and
|
||||
# consumers pull from Gitea first, this probe should be rewritten
|
||||
# to verify the Gitea simple/ endpoint serves the new wheel
|
||||
# (PyPI may be best-effort-failed and the cascade should still
|
||||
# fan out, since templates will pull from Gitea). Tracked in #596.
|
||||
python -m venv /tmp/propagation-probe
|
||||
PROBE=/tmp/propagation-probe/bin
|
||||
$PROBE/pip install --upgrade --quiet pip
|
||||
@@ -267,7 +356,10 @@ jobs:
|
||||
fi
|
||||
|
||||
GITEA_URL="${GITEA_URL:-https://git.moleculesai.app}"
|
||||
TEMPLATES="claude-code hermes openclaw codex langgraph crewai autogen deepagents gemini-cli"
|
||||
# Keep in lockstep with manifest.json workspace_templates (suffix-stripped).
|
||||
# Guarded by scripts/check-cascade-list-vs-manifest.sh (cascade-list-drift-gate).
|
||||
# 2026-05-19: pruned crewai/deepagents/gemini-cli — not in manifest.
|
||||
TEMPLATES="claude-code hermes openclaw codex langgraph autogen"
|
||||
FAILED=""
|
||||
SKIPPED=""
|
||||
|
||||
|
||||
@@ -43,14 +43,28 @@ on:
|
||||
# `cancel-in-progress: false`; that is not acceptable for a workflow with a
|
||||
# production deploy job. Per-SHA image tags are immutable, and staging-latest is
|
||||
# best-effort last-writer-wins metadata.
|
||||
#
|
||||
# 2026-05-20 retrigger: run #86994 on mc#1589 merge sha 0f0f1ba2 failed at
|
||||
# setup-buildx-action with EACCES on PC2 WSL publish runner — the runner's
|
||||
# DOCKER_CONFIG=/home/hongming/.docker-ecr/ dir didn't have a buildx/certs
|
||||
# subdir writable by the container's UID 1001. Hot-patched the dir perms;
|
||||
# this chore push retriggers the workflow. Proper fix (per-runner
|
||||
# DOCKER_CONFIG owned by 1001, internal#597 --env HOME=/home/runner pattern)
|
||||
# is tracked as a CI-hygiene follow-up — not in scope here.
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
env:
|
||||
IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
|
||||
TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
|
||||
# SSOT-Instance-10 (#333): ECR registry triplet (account.dkr.ecr.region.amazonaws.com)
|
||||
# sourced from org/repo var `ECR_REGISTRY` with the current prod-account literal as
|
||||
# bootstrap fallback. When the org var is set, the fallback becomes dead code and
|
||||
# switching accounts/regions is a one-line change at the org level (instead of
|
||||
# touching every workflow). Pattern mirrors `vars.CP_URL || 'literal'` already in
|
||||
# use in this repo's staging-verify.yml.
|
||||
IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform
|
||||
TENANT_IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform-tenant
|
||||
|
||||
jobs:
|
||||
build-and-push:
|
||||
|
||||
@@ -75,8 +75,12 @@ permissions:
|
||||
env:
|
||||
# ECR registry (post-2026-05-06 SSOT for tenant images).
|
||||
# publish-workspace-server-image.yml pushes here.
|
||||
IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
|
||||
TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
|
||||
# SSOT-Instance-10 (#333): triplet sourced from org/repo var `ECR_REGISTRY` with
|
||||
# the current prod-account literal as bootstrap fallback. When the org var is set,
|
||||
# the fallback becomes dead code and switching accounts/regions is a one-line
|
||||
# change at the org level. Pattern mirrors `vars.CP_URL || 'literal'` below.
|
||||
IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform
|
||||
TENANT_IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform-tenant
|
||||
# CP endpoint for redeploy-fleet (used in promote step below).
|
||||
CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
|
||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
||||
|
||||
@@ -53,19 +53,12 @@ name: status-reaper
|
||||
# `inputs:` block here. Gitea 1.22.6 rejects the whole workflow as
|
||||
# "unknown on type" when `workflow_dispatch.inputs.X` is present.
|
||||
on:
|
||||
# SCHEDULE RE-ENABLED 2026-05-12 rev3 — interim disable (mc#645) reverted now that
|
||||
# rev3 widens DEFAULT_SWEEP_LIMIT 10 → 30 (covers retroactive-failure timing window).
|
||||
# Sibling watchdog re-enabled in the same PR with timeout-minutes raised 5 → 15.
|
||||
schedule:
|
||||
# Every 5 minutes. Off-zero alignment with sibling cron workflows:
|
||||
# ci-required-drift (`:17`), main-red-watchdog (`:05`),
|
||||
# railway-pin-audit (`:23`). 5-min cadence gives a tight enough
|
||||
# close on schedule-triggered false-reds that main-red-watchdog
|
||||
# (hourly :05) almost never files an issue on the false case.
|
||||
# rev3 keeps `*/5` unchanged per hongming-pc2 03:25Z review:
|
||||
# "trades window-width-cheap for cadence-loady" — N=30 widens
|
||||
# the lookback cheaply without doubling runner load via `*/2`.
|
||||
- cron: '*/5 * * * *'
|
||||
# Schedule moved to operator-config:
|
||||
# /etc/cron.d/molecule-core-status-reaper ->
|
||||
# /usr/local/bin/molecule-core-cron-bot.sh status-reaper
|
||||
#
|
||||
# This keeps the 5-minute compensation cadence but stops a maintenance
|
||||
# bot from consuming Gitea Actions runner slots during PR merge waves.
|
||||
workflow_dispatch:
|
||||
|
||||
# Compensating-status POST needs write on repo statuses; no other
|
||||
|
||||
@@ -1,154 +0,0 @@
|
||||
name: Block internal-flavored paths
|
||||
|
||||
# Hard CI gate. Internal content (positioning, competitive briefs, sales
|
||||
# playbooks, PMM/press drip, draft campaigns) lives in molecule-ai/internal —
|
||||
# this public monorepo must never re-acquire those paths. CEO directive
|
||||
# 2026-04-23 after a fleet-wide audit found 79 internal files leaked here.
|
||||
#
|
||||
# Failure mode without this gate: agents (PMM, Research, DevRel, Sales) drop
|
||||
# briefs into the easiest path their cwd resolves to (root /research,
|
||||
# /marketing, /docs/marketing) and gitignore alone won't catch a `git add -f`
|
||||
# or a stale gitignore line. This workflow is the mechanical backstop.
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
push:
|
||||
branches: [main, staging]
|
||||
# Required for GitHub merge queue: the queue's pre-merge CI run on
|
||||
# `gh-readonly-queue/...` refs needs this check to fire so the queue
|
||||
# gets a real result instead of stalling forever AWAITING_CHECKS.
|
||||
merge_group:
|
||||
types: [checks_requested]
|
||||
|
||||
jobs:
|
||||
check:
|
||||
name: Block forbidden paths
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 2 # need previous commit to diff against on push events
|
||||
|
||||
# For pull_request events the diff base is github.event.pull_request.base.sha,
|
||||
# which may be many commits behind HEAD and therefore absent from the
|
||||
# shallow clone above. Fetch it explicitly (depth=1 keeps it fast).
|
||||
- name: Fetch PR base SHA (pull_request events only)
|
||||
if: github.event_name == 'pull_request'
|
||||
run: git fetch --depth=1 origin ${{ github.event.pull_request.base.sha }}
|
||||
|
||||
# For merge_group events the queue's pre-merge ref is a commit on
|
||||
# `gh-readonly-queue/...` whose parent is the queue's base_sha.
|
||||
# That parent isn't part of the queue branch's shallow clone, so
|
||||
# we fetch it explicitly. Mirrors the equivalent step in
|
||||
# secret-scan.yml (#2120) — same shallow-clone bug class.
|
||||
- name: Fetch merge_group base SHA (merge_group events only)
|
||||
if: github.event_name == 'merge_group'
|
||||
run: git fetch --depth=1 origin ${{ github.event.merge_group.base_sha }}
|
||||
|
||||
- name: Refuse if forbidden paths appear
|
||||
env:
|
||||
# Plumb event-specific SHAs through env so the script doesn't
|
||||
# need conditional `${{ ... }}` interpolation per event type.
|
||||
# github.event.before/after only exist on push events;
|
||||
# merge_group has its own base_sha/head_sha; pull_request has
|
||||
# pull_request.base.sha / pull_request.head.sha.
|
||||
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
||||
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
|
||||
MG_BASE_SHA: ${{ github.event.merge_group.base_sha }}
|
||||
MG_HEAD_SHA: ${{ github.event.merge_group.head_sha }}
|
||||
PUSH_BEFORE: ${{ github.event.before }}
|
||||
PUSH_AFTER: ${{ github.event.after }}
|
||||
run: |
|
||||
# Paths that must NEVER live in the public monorepo. Add to this
|
||||
# list narrowly — broader patterns belong in .gitignore so day-to-day
|
||||
# docs work isn't accidentally blocked.
|
||||
FORBIDDEN_PATTERNS=(
|
||||
"^research/"
|
||||
"^marketing/"
|
||||
"^docs/marketing/"
|
||||
"^comment-[0-9]+\.json$"
|
||||
"^test-pmm.*\.(txt|md)$"
|
||||
"^tick-reflections.*\.(txt|md)$"
|
||||
".*-temp\.(md|txt)$"
|
||||
)
|
||||
|
||||
# Determine the diff base. Each event type stores its SHAs in
|
||||
# a different place — see the env block above.
|
||||
case "${{ github.event_name }}" in
|
||||
pull_request)
|
||||
BASE="$PR_BASE_SHA"
|
||||
HEAD="$PR_HEAD_SHA"
|
||||
;;
|
||||
merge_group)
|
||||
BASE="$MG_BASE_SHA"
|
||||
HEAD="$MG_HEAD_SHA"
|
||||
;;
|
||||
*)
|
||||
BASE="$PUSH_BEFORE"
|
||||
HEAD="$PUSH_AFTER"
|
||||
;;
|
||||
esac
|
||||
|
||||
# On push events with shallow clones, BASE may be present in
|
||||
# the event payload but absent from the local object DB
|
||||
# (fetch-depth=2 doesn't always reach the previous commit
|
||||
# across true merges). Try fetching it on demand. If the
|
||||
# fetch fails — e.g. the SHA was force-overwritten — we fall
|
||||
# through to the empty-BASE branch below, which scans the
|
||||
# entire tree as if every file were new. Correct, just slow.
|
||||
# Same recovery shape as secret-scan.yml (#2120 — incident
|
||||
# 2026-04-27 06:50Z block-internal-paths exit 128 with
|
||||
# "fatal: bad object <sha>" on staging push).
|
||||
if [ -n "$BASE" ] && ! echo "$BASE" | grep -qE '^0+$'; then
|
||||
if ! git cat-file -e "$BASE" 2>/dev/null; then
|
||||
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
|
||||
# Files added or modified in this change.
|
||||
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$' || ! git cat-file -e "$BASE" 2>/dev/null; then
|
||||
# New branch / no previous SHA / BASE unreachable — check
|
||||
# the entire tree as if every file were new. Slower but
|
||||
# correct on first push or post-fetch-failure recovery.
|
||||
CHANGED=$(git ls-tree -r --name-only HEAD)
|
||||
else
|
||||
CHANGED=$(git diff --name-only --diff-filter=AM "$BASE" "$HEAD")
|
||||
fi
|
||||
|
||||
if [ -z "$CHANGED" ]; then
|
||||
echo "No changed files to inspect."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
OFFENDING=""
|
||||
for path in $CHANGED; do
|
||||
for pattern in "${FORBIDDEN_PATTERNS[@]}"; do
|
||||
if echo "$path" | grep -qE "$pattern"; then
|
||||
OFFENDING="${OFFENDING}${path} (matched: ${pattern})\n"
|
||||
break
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
if [ -n "$OFFENDING" ]; then
|
||||
echo "::error::Forbidden internal-flavored paths detected:"
|
||||
printf "$OFFENDING"
|
||||
echo ""
|
||||
echo "These paths belong in molecule-ai/internal, not this public repo."
|
||||
echo "See docs/internal-content-policy.md for canonical locations."
|
||||
echo ""
|
||||
echo "If your file is genuinely public-facing (e.g. a blog post"
|
||||
echo "ready to ship), use one of these alternatives instead:"
|
||||
echo " • Public-bound blog posts: docs/blog/<slug>.md"
|
||||
echo " • Public-bound tutorials: docs/tutorials/<slug>.md"
|
||||
echo " • Public devrel content: docs/devrel/<slug>.md"
|
||||
echo ""
|
||||
echo "If you legitimately need to add a new top-level path that"
|
||||
echo "happens to match a forbidden pattern, edit"
|
||||
echo ".github/workflows/block-internal-paths.yml and update the"
|
||||
echo "FORBIDDEN_PATTERNS list with reviewer signoff."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✓ No forbidden paths in this change."
|
||||
@@ -1,320 +0,0 @@
|
||||
name: Canary — staging SaaS smoke (every 30 min)
|
||||
|
||||
# Minimum viable health check: provisions one Hermes workspace on a fresh
|
||||
# staging org, sends one A2A message, verifies PONG, tears down. ~8 min
|
||||
# wall clock. Pages on failure by opening a GitHub issue; auto-closes the
|
||||
# issue on the next green run.
|
||||
#
|
||||
# The full-SaaS workflow (e2e-staging-saas.yml) covers the broader surface
|
||||
# but runs only on provisioning-critical pushes + nightly — this one
|
||||
# catches drift in the 30-min window between those runs (AMI health, CF
|
||||
# cert rotation, WorkOS session stability, etc.).
|
||||
#
|
||||
# Lean mode: E2E_MODE=canary skips the child workspace + HMA memory +
|
||||
# peers/activity checks. One parent workspace + one A2A turn is enough
|
||||
# to signal "SaaS stack end-to-end is alive."
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Every 30 min. Cron on GitHub-hosted runners has a known drift of
|
||||
# a few minutes under load — that's fine for a canary.
|
||||
- cron: '*/30 * * * *'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
keep_on_failure:
|
||||
description: >-
|
||||
Skip teardown when the canary fails (debugging only). The
|
||||
tenant org + EC2 + CF tunnel + DNS stay alive so an operator
|
||||
can SSM into the workspace EC2 and capture docker logs of the
|
||||
failing claude-code container. REMEMBER to manually delete
|
||||
via DELETE /cp/admin/tenants/<slug> when done so the org
|
||||
doesn't accumulate cost. Only honored on workflow_dispatch;
|
||||
cron runs always tear down (we don't want unattended cron
|
||||
to leak resources).
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
# Serialise with the full-SaaS workflow so they don't contend for the
|
||||
# same org-create quota on staging. Different group key from
|
||||
# e2e-staging-saas since we don't mind queueing canaries behind one
|
||||
# full run, but two canaries SHOULD queue against each other.
|
||||
concurrency:
|
||||
group: canary-staging
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
# Needed to open / close the alerting issue.
|
||||
issues: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
canary:
|
||||
name: Canary smoke
|
||||
runs-on: ubuntu-latest
|
||||
# 25 min total: 10 min of headroom over the 15-min TLS-readiness deadline in
|
||||
# tests/e2e/test_staging_full_saas.sh (#2107). Without the buffer
|
||||
# the job is killed at the wall-clock 15:00 mark BEFORE the bash
|
||||
# `fail` + diagnostic burst can fire, leaving every cancellation
|
||||
# silent. Sibling staging E2E jobs run at 20-45 min — keeping
|
||||
# canary tighter than them so a true wedge still surfaces here
|
||||
# first.
|
||||
timeout-minutes: 25
|
||||
|
||||
env:
|
||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
# MiniMax is the canary's PRIMARY LLM auth path post-2026-05-04.
|
||||
# Switched from hermes+OpenAI after #2578 (the staging OpenAI key
|
||||
# account went over quota and stayed dead for 36+ hours, taking
|
||||
# the canary red the entire time). claude-code template's
|
||||
# `minimax` provider routes ANTHROPIC_BASE_URL to
|
||||
# api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot —
|
||||
# ~5-10x cheaper per token than gpt-4.1-mini AND on a separate
|
||||
# billing account, so OpenAI quota collapse no longer wedges the
|
||||
# canary. Mirrors the migration continuous-synth-e2e.yml made on
|
||||
# 2026-05-03 (#265) for the same reason. tests/e2e/test_staging_
|
||||
# full_saas.sh branches SECRETS_JSON on which key is present —
|
||||
# MiniMax wins when set.
|
||||
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
|
||||
# Direct-Anthropic alternative for operators who don't want to
|
||||
# set up a MiniMax account (priority below MiniMax — first
|
||||
# non-empty wins in test_staging_full_saas.sh's secrets-injection
|
||||
# block). See #2578 PR comment for the rationale.
|
||||
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
|
||||
# OpenAI fallback — kept wired so an operator-dispatched run with
|
||||
# E2E_RUNTIME=hermes overridden via workflow_dispatch can still
|
||||
# exercise the OpenAI path without re-editing the workflow.
|
||||
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }}
|
||||
E2E_MODE: canary
|
||||
E2E_RUNTIME: claude-code
|
||||
# Pin the canary to a specific MiniMax model rather than relying
|
||||
# on the per-runtime default (which could resolve to "sonnet" →
|
||||
# direct Anthropic and defeat the cost saving). M2.7-highspeed
|
||||
# is "Token Plan only" but cheap-per-token and fast.
|
||||
E2E_MODEL_SLUG: MiniMax-M2.7-highspeed
|
||||
E2E_RUN_ID: "canary-${{ github.run_id }}"
|
||||
# Debug-only: when an operator dispatches with keep_on_failure=true,
|
||||
# the canary script's E2E_KEEP_ORG=1 path skips teardown so the
|
||||
# tenant org + EC2 stay alive for SSM-based log capture. Cron runs
|
||||
# never set this (the input only exists on workflow_dispatch) so
|
||||
# unattended cron always tears down. See molecule-core#129
|
||||
# failure mode #1 — capturing the actual exception requires
|
||||
# docker logs from the live container.
|
||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_on_failure == 'true' && '1' || '0' }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify admin token present
|
||||
run: |
|
||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
||||
echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
- name: Verify LLM key present
|
||||
run: |
|
||||
# Per-runtime key check — claude-code uses MiniMax or direct Anthropic; hermes /
|
||||
# langgraph (operator-dispatched only) use OpenAI. Hard-fail
|
||||
# rather than soft-skip per the lesson from synth E2E #2578:
|
||||
# an empty key silently falls through to the wrong
|
||||
# SECRETS_JSON branch and the canary fails 5 min later with
|
||||
# a confusing auth error instead of the clean "secret
|
||||
# missing" message at the top.
|
||||
case "${E2E_RUNTIME}" in
|
||||
claude-code)
|
||||
# Either MiniMax OR direct-Anthropic works — first
|
||||
# non-empty wins in the test script's secrets-injection
|
||||
# priority chain. Operators only need to set ONE of these
|
||||
# secrets; we don't force a choice between them.
|
||||
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
||||
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
|
||||
required_secret_value="${E2E_MINIMAX_API_KEY}"
|
||||
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
|
||||
required_secret_name="MOLECULE_STAGING_ANTHROPIC_API_KEY"
|
||||
required_secret_value="${E2E_ANTHROPIC_API_KEY}"
|
||||
else
|
||||
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY or MOLECULE_STAGING_ANTHROPIC_API_KEY"
|
||||
required_secret_value=""
|
||||
fi
|
||||
;;
|
||||
langgraph|hermes)
|
||||
required_secret_name="MOLECULE_STAGING_OPENAI_KEY"
|
||||
required_secret_value="${E2E_OPENAI_API_KEY:-}"
|
||||
;;
|
||||
*)
|
||||
echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
|
||||
required_secret_name=""
|
||||
required_secret_value="present"
|
||||
;;
|
||||
esac
|
||||
if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
|
||||
echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — A2A will fail at request time with 'No LLM provider configured'"
|
||||
exit 2
|
||||
fi
|
||||
echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"
|
||||
|
||||
- name: Canary run
|
||||
id: canary
|
||||
run: bash tests/e2e/test_staging_full_saas.sh
|
||||
|
||||
# Alerting: open a sticky issue on the FIRST failure; comment on
|
||||
# subsequent failures; auto-close on next green. Comment-on-existing
|
||||
# de-duplicates so a single open issue accumulates the streak —
|
||||
# ops sees one issue with N comments rather than N issues.
|
||||
#
|
||||
# Why no consecutive-failures threshold (e.g., wait 3 runs before
|
||||
# filing): the prior threshold check used
|
||||
# `github.rest.actions.listWorkflowRuns()` which Gitea 1.22.6 does
|
||||
# not expose (returns 404). On Gitea Actions the threshold call
|
||||
# ALWAYS failed, breaking the entire alerting step and going days
|
||||
# silent on real regressions (38h+ chronic red on 2026-05-07/08
|
||||
# before this fix; tracked in molecule-core#129). Filing on first
|
||||
# failure is also better UX — we want to know about the first red,
|
||||
# not wait 90 min for it to "count." Real flakes get one issue +
|
||||
# a quick close-on-green; persistent reds accumulate comments.
|
||||
- name: Open issue on failure
|
||||
if: failure()
|
||||
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
|
||||
with:
|
||||
script: |
|
||||
const title = '🔴 Canary failing: staging SaaS smoke';
|
||||
const runURL = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
|
||||
|
||||
// Find an existing open canary issue (stable title match).
|
||||
// If one exists, this isn't a "first failure" — comment and exit.
|
||||
const { data: existing } = await github.rest.issues.listForRepo({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
state: 'open', labels: 'canary-staging',
|
||||
per_page: 10,
|
||||
});
|
||||
const match = existing.find(i => i.title === title);
|
||||
if (match) {
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
issue_number: match.number,
|
||||
body: `Canary still failing. ${runURL}`,
|
||||
});
|
||||
core.info(`Commented on existing issue #${match.number}`);
|
||||
return;
|
||||
}
|
||||
|
||||
// No open issue yet — file one on this first failure. The
|
||||
// comment-on-existing branch above means subsequent failures
|
||||
// accumulate as comments on this same issue, so we don't
|
||||
// spam new issues per run.
|
||||
const body =
|
||||
`Canary run failed at ${new Date().toISOString()}.\n\n` +
|
||||
`Run: ${runURL}\n\n` +
|
||||
`This issue auto-closes on the next green canary run. ` +
|
||||
`Consecutive failures add a comment here rather than a new issue.`;
|
||||
await github.rest.issues.create({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
title, body,
|
||||
labels: ['canary-staging', 'bug'],
|
||||
});
|
||||
core.info('Opened canary failure issue (first red)');
|
||||
|
||||
- name: Auto-close canary issue on success
|
||||
if: success()
|
||||
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
|
||||
with:
|
||||
script: |
|
||||
const title = '🔴 Canary failing: staging SaaS smoke';
|
||||
const { data: open } = await github.rest.issues.listForRepo({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
state: 'open', labels: 'canary-staging',
|
||||
per_page: 10,
|
||||
});
|
||||
const match = open.find(i => i.title === title);
|
||||
if (match) {
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
issue_number: match.number,
|
||||
body: `Canary recovered at ${new Date().toISOString()}. Closing.`,
|
||||
});
|
||||
await github.rest.issues.update({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
issue_number: match.number,
|
||||
state: 'closed',
|
||||
});
|
||||
core.info(`Closed recovered canary issue #${match.number}`);
|
||||
}
|
||||
|
||||
- name: Teardown safety net
|
||||
if: always()
|
||||
env:
|
||||
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
run: |
|
||||
set +e
|
||||
# Slug prefix matches what test_staging_full_saas.sh emits
|
||||
# in canary mode:
|
||||
# SLUG="e2e-canary-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
|
||||
# Earlier this was `e2e-{today}-canary-` — that was the
|
||||
# full-mode pattern (date FIRST, mode SECOND); canary slugs
|
||||
# have mode FIRST, date SECOND. The mismatch silently
|
||||
# never matched, leaving every cancelled-canary EC2 alive
|
||||
# until the once-an-hour sweep eventually caught it
|
||||
# (incident 2026-04-26 21:03Z: 1h25m EC2 leak before manual
|
||||
# cleanup; same gap on three earlier cancellations today).
|
||||
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
||||
| python3 -c "
|
||||
import json, sys, os, datetime
|
||||
run_id = os.environ.get('GITHUB_RUN_ID', '')
|
||||
d = json.load(sys.stdin)
|
||||
# Scope to slugs from THIS canary run when GITHUB_RUN_ID is
|
||||
# available; the canary workflow sets E2E_RUN_ID='canary-\${run_id}'
|
||||
# so the slug suffix is '-canary-\${run_id}-...'. Mirrors the
|
||||
# full-mode safety net's per-run scoping (e2e-staging-saas.yml)
|
||||
# added after the 2026-04-21 cross-run cleanup incident.
|
||||
# Sweep both today AND yesterday's UTC dates so a run that
|
||||
# crosses midnight still cleans up its own slug — see the
|
||||
# 2026-04-26→27 canvas-safety-net incident.
|
||||
today = datetime.date.today()
|
||||
yesterday = today - datetime.timedelta(days=1)
|
||||
dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
|
||||
if run_id:
|
||||
prefixes = tuple(f'e2e-canary-{d}-canary-{run_id}' for d in dates)
|
||||
else:
|
||||
prefixes = tuple(f'e2e-canary-{d}-' for d in dates)
|
||||
candidates = [o['slug'] for o in d.get('orgs', [])
|
||||
if any(o.get('slug','').startswith(p) for p in prefixes)
|
||||
and o.get('status') not in ('purged',)]
|
||||
print('\n'.join(candidates))
|
||||
" 2>/dev/null)
|
||||
# Per-slug DELETE with HTTP-code verification. The previous
|
||||
# `... >/dev/null || true` swallowed every failure, so a 5xx
|
||||
# or timeout from CP looked identical to "successfully cleaned
|
||||
# up" and the tenant kept eating ~2 vCPU until the hourly
|
||||
# stale sweep caught it (up to 2h later). Now we capture the
|
||||
# response code and surface non-2xx as a workflow warning, so
|
||||
# the run page shows which slug leaked. We still don't `exit 1`
|
||||
# on cleanup failure — a single-canary cleanup miss shouldn't
|
||||
# fail-flag the canary itself when the actual smoke check
|
||||
# passed. The sweep-stale-e2e-orgs cron (now every 15 min,
|
||||
# 30-min threshold) is the safety net for whatever slips past.
|
||||
# See molecule-controlplane#420.
|
||||
leaks=()
|
||||
for slug in $orgs; do
|
||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
||||
set +e
|
||||
curl -sS -o /tmp/canary-cleanup.out -w "%{http_code}" \
|
||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$slug\"}" >/tmp/canary-cleanup.code
|
||||
set -e
|
||||
code=$(cat /tmp/canary-cleanup.code 2>/dev/null || echo "000")
|
||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
||||
echo "[teardown] deleted $slug (HTTP $code)"
|
||||
else
|
||||
echo "::warning::canary teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/canary-cleanup.out 2>/dev/null)"
|
||||
leaks+=("$slug")
|
||||
fi
|
||||
done
|
||||
if [ ${#leaks[@]} -gt 0 ]; then
|
||||
echo "::warning::canary teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
||||
fi
|
||||
exit 0
|
||||
@@ -1,39 +0,0 @@
|
||||
name: cascade-list-drift-gate
|
||||
|
||||
# Structural gate: TEMPLATES list in publish-runtime.yml must match
|
||||
# manifest.json's workspace_templates exactly. Closes the recurrence
|
||||
# path of PR #2556 (the data fix) and is the first concrete deliverable
|
||||
# of RFC #388 PR-3.
|
||||
#
|
||||
# Why a gate, not just discipline: PR #2536 pruned the manifest, but the
|
||||
# cascade list wasn't updated for ~weeks before someone (PR #2556)
|
||||
# noticed during an unrelated audit. During that window, codex never
|
||||
# rebuilt on a runtime publish. A structural gate catches the drift
|
||||
# the same day either file changes.
|
||||
#
|
||||
# Triggers narrowly to keep CI quiet: only on PRs that actually change
|
||||
# one of the two files. The path-filtered split + always-emit-result
|
||||
# pattern (memory: "Required check names need a job that always runs")
|
||||
# is unnecessary here because the workflow IS the check name and PR
|
||||
# branch protection should require it directly. Future-proof: if this
|
||||
# becomes a required check, add a no-op aggregator with always() so the
|
||||
# name still emits when paths don't match.
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: [staging, main]
|
||||
paths:
|
||||
- manifest.json
|
||||
- .github/workflows/publish-runtime.yml
|
||||
- scripts/check-cascade-list-vs-manifest.sh
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
check:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
- name: Check cascade list matches manifest
|
||||
run: bash scripts/check-cascade-list-vs-manifest.sh
|
||||
@@ -1,58 +0,0 @@
|
||||
name: Check migration collisions
|
||||
|
||||
# Hard gate (#2341): fails a PR that adds a migration prefix already
|
||||
# claimed by the base branch or another open PR. Caught manually 2026-04-30
|
||||
# during PR #2276 rebase: 044_runtime_image_pins collided with
|
||||
# 044_platform_inbound_secret from RFC #2312. This workflow makes that
|
||||
# check automatic.
|
||||
#
|
||||
# Trigger model: pull_request only — there's no value running this on
|
||||
# pushes to staging or main (those are post-merge; the gate must fire
|
||||
# pre-merge to be useful). Path filter scopes to PRs that actually touch
|
||||
# migrations.
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths:
|
||||
- 'workspace-server/migrations/**'
|
||||
- 'scripts/ops/check_migration_collisions.py'
|
||||
- '.github/workflows/check-migration-collisions.yml'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
# gh pr list/diff need read access to other PRs
|
||||
pull-requests: read
|
||||
|
||||
jobs:
|
||||
check:
|
||||
name: Migration version collision check
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 5
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
# Need history to diff against base ref
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Detect collisions
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
BASE_REF: origin/${{ github.event.pull_request.base.ref }}
|
||||
HEAD_REF: ${{ github.event.pull_request.head.sha }}
|
||||
GITHUB_REPOSITORY: ${{ github.repository }}
|
||||
# gh CLI uses GH_TOKEN from env
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
# Ensure the named base ref exists locally. actions/checkout with
|
||||
# fetch-depth=0 pulls full history, but the explicit fetch is
|
||||
# cheap insurance against form-of-ref differences across runs.
|
||||
#
|
||||
# IMPORTANT: do NOT pass --depth=1 here. The script below uses
|
||||
# `git diff origin/<base>...<head>` (three-dot, merge-base form),
|
||||
# which fails with "fatal: no merge base" if the base ref is
|
||||
# shallow. The auto-promote staging→main PR (#2361) was blocked
|
||||
# by exactly this for ~5h on 2026-04-30 — the depth=1 fetch
|
||||
# overwrote checkout@v4's full-history clone with a shallow tip.
|
||||
git fetch origin "${{ github.event.pull_request.base.ref }}" || true
|
||||
python3 scripts/ops/check_migration_collisions.py
|
||||
@@ -1,442 +0,0 @@
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main, staging]
|
||||
pull_request:
|
||||
branches: [main, staging]
|
||||
# GitHub merge queue fires `merge_group` for the queue's pre-merge CI run.
|
||||
# Required so the queue gets a real check result instead of a false-green
|
||||
# from the absence of a triggered workflow. Safe to add unconditionally —
|
||||
# the event simply doesn't fire until the queue is enabled on the branch.
|
||||
merge_group:
|
||||
types: [checks_requested]
|
||||
|
||||
# Cancel in-progress CI runs when a new commit arrives on the same ref.
|
||||
# This prevents stale runs from queuing behind each other. The merge_group
|
||||
# refs (refs/heads/gh-readonly-queue/...) get their own concurrency group
|
||||
# automatically because github.ref differs from the PR ref.
|
||||
concurrency:
|
||||
group: ci-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
# Detect which paths changed so downstream jobs can skip when only
|
||||
# docs/markdown files were modified.
|
||||
changes:
|
||||
name: Detect changes
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
platform: ${{ steps.check.outputs.platform }}
|
||||
canvas: ${{ steps.check.outputs.canvas }}
|
||||
python: ${{ steps.check.outputs.python }}
|
||||
scripts: ${{ steps.check.outputs.scripts }}
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- id: check
|
||||
run: |
|
||||
# For PR events: diff against the base branch (not HEAD~1 of the branch,
|
||||
# which may be unrelated after force-pushes). When a push updates a PR,
|
||||
# both pull_request and push events fire — prefer the PR base so that
|
||||
# the diff is always computed against the actual merge base, not the
|
||||
# previous SHA on the branch which may be on a different history line.
|
||||
BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
|
||||
# GITHUB_BASE_REF is set by GitHub for PR events (the base branch name).
|
||||
# For pull_request events we use the stored base.sha; for push events
|
||||
# (or when base.sha is unavailable) fall back to github.event.before.
|
||||
if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
|
||||
BASE="${{ github.event.pull_request.base.sha }}"
|
||||
fi
|
||||
# Fallback: if BASE is empty or all zeros (new branch), run everything
|
||||
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
|
||||
echo "platform=true" >> "$GITHUB_OUTPUT"
|
||||
echo "canvas=true" >> "$GITHUB_OUTPUT"
|
||||
echo "python=true" >> "$GITHUB_OUTPUT"
|
||||
echo "scripts=true" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
DIFF=$(git diff --name-only "$BASE" HEAD 2>/dev/null || echo ".github/workflows/ci.yml")
|
||||
echo "platform=$(echo "$DIFF" | grep -qE '^workspace-server/|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
|
||||
echo "canvas=$(echo "$DIFF" | grep -qE '^canvas/|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
|
||||
echo "python=$(echo "$DIFF" | grep -qE '^workspace/|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
|
||||
echo "scripts=$(echo "$DIFF" | grep -qE '^tests/e2e/|^scripts/|^infra/scripts/|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# Platform (Go) is a required check on staging. Always-run + per-step
|
||||
# gating (see Canvas (Next.js) for the rationale and the failure mode
|
||||
# this avoids).
|
||||
platform-build:
|
||||
name: Platform (Go)
|
||||
needs: changes
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
working-directory: workspace-server
|
||||
steps:
|
||||
- if: needs.changes.outputs.platform != 'true'
|
||||
working-directory: .
|
||||
run: echo "No platform/** changes — skipping real build steps; this job always runs to satisfy the required-check name on branch protection."
|
||||
- if: needs.changes.outputs.platform == 'true'
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- if: needs.changes.outputs.platform == 'true'
|
||||
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
||||
with:
|
||||
go-version: 'stable'
|
||||
- if: needs.changes.outputs.platform == 'true'
|
||||
run: go mod download
|
||||
- if: needs.changes.outputs.platform == 'true'
|
||||
run: go build ./cmd/server
|
||||
# CLI (molecli) moved to standalone repo: github.com/molecule-ai/molecule-cli
|
||||
- if: needs.changes.outputs.platform == 'true'
|
||||
run: go vet ./... || true
|
||||
- if: needs.changes.outputs.platform == 'true'
|
||||
name: Run golangci-lint
|
||||
run: golangci-lint run --timeout 3m ./... || true
|
||||
- if: needs.changes.outputs.platform == 'true'
|
||||
name: Run tests with race detection and coverage
|
||||
run: go test -race -coverprofile=coverage.out ./...
|
||||
|
||||
- if: needs.changes.outputs.platform == 'true'
|
||||
name: Per-file coverage report
|
||||
# Advisory — lists every source file with its coverage so reviewers
|
||||
# can see at-a-glance where gaps are. Sorted ascending so the worst
|
||||
# offenders float to the top. Does NOT fail the build; the hard
|
||||
# gate is the threshold check below. (#1823)
|
||||
run: |
|
||||
echo "=== Per-file coverage (worst first) ==="
|
||||
go tool cover -func=coverage.out \
|
||||
| grep -v '^total:' \
|
||||
| awk '{file=$1; sub(/:[0-9][0-9.]*:.*/, "", file); pct=$NF; gsub(/%/,"",pct); s[file]+=pct; c[file]++}
|
||||
END {for (f in s) printf "%6.1f%% %s\n", s[f]/c[f], f}' \
|
||||
| sort -n
|
||||
|
||||
- if: needs.changes.outputs.platform == 'true'
|
||||
name: Check coverage thresholds
|
||||
# Enforces two gates from #1823 Layer 1:
|
||||
# 1. Total floor (25% — ratchet plan in COVERAGE_FLOOR.md).
|
||||
# 2. Per-file floor — non-test .go files in security-critical
|
||||
# paths with coverage <10% fail the build, UNLESS the file
|
||||
# path is listed in .coverage-allowlist.txt (acknowledged
|
||||
# historical debt with a tracking issue + expiry).
|
||||
run: |
|
||||
set -e
|
||||
TOTAL_FLOOR=25
|
||||
# Security-critical paths where a 0%-coverage file is a real risk.
|
||||
CRITICAL_PATHS=(
|
||||
"internal/handlers/tokens"
|
||||
"internal/handlers/workspace_provision"
|
||||
"internal/handlers/a2a_proxy"
|
||||
"internal/handlers/registry"
|
||||
"internal/handlers/secrets"
|
||||
"internal/middleware/wsauth"
|
||||
"internal/crypto"
|
||||
)
|
||||
|
||||
TOTAL=$(go tool cover -func=coverage.out | grep '^total:' | awk '{print $3}' | sed 's/%//')
|
||||
echo "Total coverage: ${TOTAL}%"
|
||||
if awk "BEGIN{exit !($TOTAL < $TOTAL_FLOOR)}"; then
|
||||
echo "::error::Total coverage ${TOTAL}% is below the ${TOTAL_FLOOR}% floor. See COVERAGE_FLOOR.md for ratchet plan."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Aggregate per-file coverage → /tmp/perfile.txt: "<fullpath> <pct>"
|
||||
go tool cover -func=coverage.out \
|
||||
| grep -v '^total:' \
|
||||
| awk '{file=$1; sub(/:[0-9][0-9.]*:.*/, "", file); pct=$NF; gsub(/%/,"",pct); s[file]+=pct; c[file]++}
|
||||
END {for (f in s) printf "%s %.1f\n", f, s[f]/c[f]}' \
|
||||
> /tmp/perfile.txt
|
||||
|
||||
# Build allowlist — paths relative to workspace-server, one per line.
|
||||
# Lines starting with # are comments.
|
||||
ALLOWLIST=""
|
||||
if [ -f ../.coverage-allowlist.txt ]; then
|
||||
ALLOWLIST=$(grep -vE '^(#|[[:space:]]*$)' ../.coverage-allowlist.txt || true)
|
||||
fi
|
||||
|
||||
FAILED=0
|
||||
WARNED=0
|
||||
for path in "${CRITICAL_PATHS[@]}"; do
|
||||
while read -r file pct; do
|
||||
[[ "$file" == *_test.go ]] && continue
|
||||
[[ "$file" == *"$path"* ]] || continue
|
||||
awk "BEGIN{exit !($pct < 10)}" || continue
|
||||
|
||||
# Strip the package-import prefix so we can match .coverage-allowlist.txt
|
||||
# entries written as paths relative to workspace-server/.
|
||||
# Handle both module paths: platform/workspace-server/... and platform/...
|
||||
rel=$(echo "$file" | sed 's|^github.com/molecule-ai/molecule-monorepo/platform/workspace-server/||; s|^github.com/molecule-ai/molecule-monorepo/platform/||')
|
||||
|
||||
if echo "$ALLOWLIST" | grep -qxF "$rel"; then
|
||||
echo "::warning file=workspace-server/$rel::Critical file at ${pct}% coverage (allowlisted, #1823) — fix before expiry."
|
||||
WARNED=$((WARNED+1))
|
||||
else
|
||||
echo "::error file=workspace-server/$rel::Critical file at ${pct}% coverage — must be >=10% (target 80%). See #1823. To acknowledge as known debt, add this path to .coverage-allowlist.txt."
|
||||
FAILED=$((FAILED+1))
|
||||
fi
|
||||
done < /tmp/perfile.txt
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "Critical-path check: $FAILED new failures, $WARNED allowlisted warnings."
|
||||
|
||||
if [ "$FAILED" -gt 0 ]; then
|
||||
echo ""
|
||||
echo "$FAILED security-critical file(s) have <10% test coverage and are"
|
||||
echo "NOT in the allowlist. These paths handle auth, tokens, secrets, or"
|
||||
echo "workspace provisioning — a 0% file here is the exact gap that let"
|
||||
echo "CWE-22, CWE-78, KI-005 slip through in past incidents. Either:"
|
||||
echo " (a) add tests to raise coverage above 10%, or"
|
||||
echo " (b) add the path to .coverage-allowlist.txt with an expiry date"
|
||||
echo " and a tracking issue reference."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Canvas (Next.js) — required check, always runs. See platform-build
|
||||
# comment above for the rationale.
|
||||
#
|
||||
# Supersedes the canvas-build-noop pattern attempted in PR #2321: two
|
||||
# jobs sharing `name:` doesn't actually satisfy branch protection
|
||||
# because the SKIPPED check run sibling is treated as not-passed
|
||||
# regardless of how many SUCCESS siblings it has. Verified empirically
|
||||
# on PR #2314 — mergeStateStatus stayed BLOCKED until I collapsed to
|
||||
# a single-job-with-conditional-steps shape.
|
||||
canvas-build:
|
||||
name: Canvas (Next.js)
|
||||
needs: changes
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
working-directory: canvas
|
||||
steps:
|
||||
- if: needs.changes.outputs.canvas != 'true'
|
||||
working-directory: .
|
||||
run: echo "No canvas/** changes — skipping real build steps; this job always runs to satisfy the required-check name on branch protection."
|
||||
- if: needs.changes.outputs.canvas == 'true'
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- if: needs.changes.outputs.canvas == 'true'
|
||||
uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
|
||||
with:
|
||||
node-version: '22'
|
||||
- if: needs.changes.outputs.canvas == 'true'
|
||||
run: rm -f package-lock.json && npm install
|
||||
- if: needs.changes.outputs.canvas == 'true'
|
||||
run: npm run build
|
||||
- if: needs.changes.outputs.canvas == 'true'
|
||||
name: Run tests with coverage
|
||||
# Coverage instrumentation is configured in canvas/vitest.config.ts
|
||||
# (provider: v8, reporters: text + html + json-summary). Step 2 of
|
||||
# #1815 — wires coverage into CI so we get a baseline visible on
|
||||
# every PR. No threshold gate yet; thresholds dial in (Step 3, also
|
||||
# tracked in #1815) after the team sees what current coverage is.
|
||||
# Per the inline comment in vitest.config.ts: "first land
|
||||
# observability so we can see the baseline, then dial in
|
||||
# thresholds + a hard gate" — this PR ships the observability half.
|
||||
run: npx vitest run --coverage
|
||||
- name: Upload coverage summary as artifact
|
||||
if: needs.changes.outputs.canvas == 'true' && always()
|
||||
# Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
|
||||
# the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
|
||||
# implement, surfacing as `GHESNotSupportedError: @actions/artifact
|
||||
# v2.0.0+, upload-artifact@v4+ and download-artifact@v4+ are not
|
||||
# currently supported on GHES`. Drop this pin when Gitea ships
|
||||
# the v4 protocol (tracked: post-Gitea-1.23 followup).
|
||||
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
|
||||
with:
|
||||
name: canvas-coverage-${{ github.run_id }}
|
||||
path: canvas/coverage/
|
||||
retention-days: 7
|
||||
if-no-files-found: warn
|
||||
|
||||
# MCP Server + SDK removed from CI — now in standalone repos:
|
||||
# - github.com/molecule-ai/molecule-mcp-server (npm CI)
|
||||
# - github.com/molecule-ai/molecule-sdk-python (PyPI CI)
|
||||
|
||||
# e2e-api job moved to .github/workflows/e2e-api.yml (issue #458).
|
||||
# It now has workflow-level concurrency (cancel-in-progress: false) so
|
||||
# new pushes queue the E2E run rather than cancelling it at the run level.
|
||||
|
||||
# Shellcheck (E2E scripts) — required check, always runs. See
|
||||
# platform-build for the rationale.
|
||||
shellcheck:
|
||||
name: Shellcheck (E2E scripts)
|
||||
needs: changes
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- if: needs.changes.outputs.scripts != 'true'
|
||||
run: echo "No tests/e2e/ or infra/scripts/ changes — skipping real shellcheck; this job always runs to satisfy the required-check name on branch protection."
|
||||
- if: needs.changes.outputs.scripts == 'true'
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- if: needs.changes.outputs.scripts == 'true'
|
||||
name: Run shellcheck on tests/e2e/*.sh and infra/scripts/*.sh
|
||||
# shellcheck is pre-installed on ubuntu-latest runners (via apt).
|
||||
# infra/scripts/ is included because setup.sh + nuke.sh gate the
|
||||
# README quickstart — a shellcheck regression there silently breaks
|
||||
# new-user onboarding. scripts/ is intentionally excluded until its
|
||||
# pre-existing SC3040/SC3043 warnings are cleaned up.
|
||||
run: |
|
||||
find tests/e2e infra/scripts -type f -name '*.sh' -print0 \
|
||||
| xargs -0 shellcheck --severity=warning
|
||||
|
||||
- if: needs.changes.outputs.scripts == 'true'
|
||||
name: Lint cleanup-trap hygiene (RFC #2873)
|
||||
# Asserts every shell E2E test that calls `mktemp` also installs
|
||||
# an EXIT trap. Catches the /tmp-leak class — a missing trap
|
||||
# silently leaks scratch into CI runners (~10-100KB per run).
|
||||
# See tests/e2e/lint_cleanup_traps.sh for the rule + fix pattern.
|
||||
run: bash tests/e2e/lint_cleanup_traps.sh
|
||||
|
||||
- if: needs.changes.outputs.scripts == 'true'
|
||||
name: Run E2E bash unit tests (no live infra)
|
||||
# Pure-bash unit tests for E2E helper libs (lib/*.sh). These pin
|
||||
# behavior of dispatch logic that — when broken — silently masks as
|
||||
# "Could not resolve authentication method" only after a successful
|
||||
# tenant + workspace provision (PR #2571 incident, 2026-05-03). Add
|
||||
# new self-contained unit tests here as the lib/ directory grows;
|
||||
# tests requiring live CP/tenant credentials belong in the dedicated
|
||||
# e2e-staging-* workflows, not this job.
|
||||
run: |
|
||||
bash tests/e2e/test_model_slug.sh
|
||||
|
||||
canvas-deploy-reminder:
|
||||
name: Canvas Deploy Reminder
|
||||
runs-on: docker-host
|
||||
needs: [changes, canvas-build]
|
||||
# Only fires on direct pushes to main (i.e. after staging→main promotion).
|
||||
if: needs.changes.outputs.canvas == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main'
|
||||
steps:
|
||||
- name: Write deploy reminder to step summary
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
run: |
|
||||
# Write body to a temp file — avoids backtick escaping in shell.
|
||||
cat > /tmp/deploy-reminder.md << 'BODY'
|
||||
## Canvas build passed ✅ — deploy required
|
||||
|
||||
The `publish-canvas-image` workflow is now building a fresh Docker image
|
||||
(`ghcr.io/molecule-ai/canvas:latest`) in the background.
|
||||
|
||||
Once it completes (~3–5 min), apply on the host machine with:
|
||||
```bash
|
||||
cd <runner-workspace>
|
||||
git pull origin main
|
||||
docker compose pull canvas && docker compose up -d canvas
|
||||
```
|
||||
|
||||
If you need to rebuild from local source instead (e.g. testing unreleased
|
||||
changes or a new `NEXT_PUBLIC_*` URL), use:
|
||||
```bash
|
||||
docker compose build canvas && docker compose up -d canvas
|
||||
```
|
||||
BODY
|
||||
printf '\n> Posted automatically by CI · commit `%s` · [build log](%s)\n' \
|
||||
"$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-reminder.md
|
||||
|
||||
# Gitea has no commit-comments API (no equivalent of
|
||||
# POST /repos/{owner}/{repo}/commits/{commit_sha}/comments).
|
||||
# Write to GITHUB_STEP_SUMMARY instead — both GitHub Actions and
|
||||
# Gitea Actions render this as the workflow run's summary page,
|
||||
# which is where operators look for post-deploy action items.
|
||||
# (#75 / PR-D)
|
||||
cat /tmp/deploy-reminder.md >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
# Python Lint & Test — required check, always runs. See platform-build
|
||||
# for the rationale.
|
||||
python-lint:
|
||||
name: Python Lint & Test
|
||||
needs: changes
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
WORKSPACE_ID: test
|
||||
defaults:
|
||||
run:
|
||||
working-directory: workspace
|
||||
steps:
|
||||
- if: needs.changes.outputs.python != 'true'
|
||||
working-directory: .
|
||||
run: echo "No workspace/** changes — skipping real lint+test; this job always runs to satisfy the required-check name on branch protection."
|
||||
- if: needs.changes.outputs.python == 'true'
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- if: needs.changes.outputs.python == 'true'
|
||||
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
||||
with:
|
||||
python-version: '3.11'
|
||||
cache: pip
|
||||
cache-dependency-path: workspace/requirements.txt
|
||||
- if: needs.changes.outputs.python == 'true'
|
||||
# Quote the version specifier: unquoted, bash parses `>=2.0.0` as a stdout
# redirection into a file named `=2.0.0`, which drops the constraint and
# hides pip's output from the job log.
run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov 'sqlalchemy>=2.0.0'
|
||||
# Coverage flags + fail-under floor moved into workspace/pytest.ini
|
||||
# (issue #1817) so local `pytest` and CI use identical config.
|
||||
- if: needs.changes.outputs.python == 'true'
|
||||
run: python -m pytest --tb=short
|
||||
|
||||
- if: needs.changes.outputs.python == 'true'
|
||||
name: Per-file critical-path coverage (MCP / inbox / auth)
|
||||
# MCP-critical Python files have a per-file floor on top of the
|
||||
# 86% total floor in pytest.ini. Rationale (issue #2790, after
|
||||
# the PR #2766 → PR #2771 cycle): the total floor averages ~6000
|
||||
# lines, so a single MCP file could regress to ~50% with no
|
||||
# complaint as long as other modules compensate. These six
|
||||
# files handle multi-tenant routing + auth + inbox dispatch —
|
||||
# a coverage drop here is the same risk shape as a Go-side
|
||||
# workspace-server token/secrets file dropping below 10%.
|
||||
#
|
||||
# Floor 75% sits below current actuals (80-96%) so this gate is
|
||||
# strictly additive — no existing PR fails. Ratchet plan in
|
||||
# COVERAGE_FLOOR.md.
|
||||
run: |
|
||||
set -e
|
||||
PER_FILE_FLOOR=75
|
||||
CRITICAL_FILES=(
|
||||
"a2a_mcp_server.py"
|
||||
"mcp_cli.py"
|
||||
"a2a_tools.py"
|
||||
"a2a_tools_inbox.py"
|
||||
"inbox.py"
|
||||
"platform_auth.py"
|
||||
)
|
||||
|
||||
# pytest already wrote .coverage; emit a JSON view scoped to
|
||||
# the critical files so jq/python can read the per-file pct
|
||||
# without parsing tabular text. --include uses fnmatch, and
|
||||
# the leading "*" allows the file to live anywhere under the
|
||||
# workspace root (today they sit at workspace/<name>.py).
|
||||
INCLUDES=$(printf '*%s,' "${CRITICAL_FILES[@]}")
|
||||
INCLUDES="${INCLUDES%,}"
|
||||
python -m coverage json -o /tmp/critical-cov.json --include="$INCLUDES"
|
||||
|
||||
FAILED=0
|
||||
for f in "${CRITICAL_FILES[@]}"; do
|
||||
# Match by top-level path key (e.g. "a2a_tools.py", not
|
||||
# "builtin_tools/a2a_tools.py" — different file at 100%).
|
||||
# The keys in coverage.json are paths relative to the run
|
||||
# cwd (workspace/), so the critical-path entry sits at the
|
||||
# bare basename.
|
||||
pct=$(jq -r --arg f "$f" '.files | to_entries | map(select(.key == $f)) | .[0].value.summary.percent_covered // "MISSING"' /tmp/critical-cov.json)
|
||||
if [ "$pct" = "MISSING" ]; then
|
||||
echo "::error file=workspace/$f::No coverage data — file may have moved or test exclusion mis-set."
|
||||
FAILED=$((FAILED+1))
|
||||
continue
|
||||
fi
|
||||
echo "$f: ${pct}%"
|
||||
if awk "BEGIN{exit !($pct < $PER_FILE_FLOOR)}"; then
|
||||
echo "::error file=workspace/$f::${pct}% < ${PER_FILE_FLOOR}% per-file floor (MCP critical path). See COVERAGE_FLOOR.md."
|
||||
FAILED=$((FAILED+1))
|
||||
fi
|
||||
done
|
||||
|
||||
if [ "$FAILED" -gt 0 ]; then
|
||||
echo ""
|
||||
echo "$FAILED MCP critical-path file(s) below the ${PER_FILE_FLOOR}% per-file floor."
|
||||
echo "These paths handle multi-tenant routing, auth tokens, and inbox dispatch."
|
||||
echo "A coverage drop here is the same risk shape as Go-side tokens/secrets files"
|
||||
echo "dropping below 10% (see COVERAGE_FLOOR.md). Either:"
|
||||
echo " (a) add tests to raise coverage back above ${PER_FILE_FLOOR}%, or"
|
||||
echo " (b) if this is unavoidable historical debt, file an issue and propose"
|
||||
echo " adjusting the floor with rationale in COVERAGE_FLOOR.md."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# SDK + plugin validation moved to standalone repo:
|
||||
# github.com/molecule-ai/molecule-sdk-python
|
||||
@@ -1,257 +0,0 @@
|
||||
name: Continuous synthetic E2E (staging)
|
||||
|
||||
# Hard gate (#2342): cron-driven full-lifecycle E2E that catches
|
||||
# regressions visible only at runtime — schema drift, deployment-pipeline
|
||||
# gaps, vendor outages, env-var rotations, DNS / CF / Railway side-effects.
|
||||
#
|
||||
# Why this gate exists:
|
||||
# PR-time CI catches code-level regressions but not deployment-time or
|
||||
# integration-time ones. Today's empirical data:
|
||||
# • #2345 (A2A v0.2 silent drop) — passed all unit tests, broke at
|
||||
# JSON-RPC parse layer between sender and receiver. Visible only
|
||||
# to a sender exercising the full path.
|
||||
# • RFC #2312 chat upload — landed on staging-branch but never
|
||||
# reached staging tenants because publish-workspace-server-image
|
||||
# was main-only. Caught by manual dogfooding hours after deploy.
|
||||
# Both would have surfaced within 15-20 min of regression if a
|
||||
# continuous synth-E2E was running.
|
||||
#
|
||||
# Cadence: scheduled every 10 min (6x/hour; ~20-min effective target). The script is conservatively
|
||||
# bounded at 10 min wall-clock; even on degraded staging it should
|
||||
# finish before the next firing. cron-overlap is guarded by the
|
||||
# concurrency group below.
|
||||
#
|
||||
# Cost: ~3 runs/hour × 5-10 min × $0.008/min GHA = ~$0.50-$1/day.
|
||||
# Plus a fresh tenant provisioned + torn down each run (Railway +
|
||||
# AWS pennies). Negligible.
|
||||
#
|
||||
# Failure handling: when the run fails, the workflow exits non-zero
|
||||
# and GitHub's standard email/notification path fires. Operators
|
||||
# can subscribe to this workflow's failure channel for paging-grade
|
||||
# alerting.
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Every 10 minutes, on :02 :12 :22 :32 :42 :52. Three constraints:
|
||||
# 1. Stay off the top-of-hour. GitHub Actions scheduler drops
|
||||
# :00 firings under high load (own docs:
|
||||
# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule).
|
||||
# Prior history: cron was '0,20,40' (2026-05-02) — only :00
|
||||
# ever survived. Bumped to '10,30,50' (2026-05-03) on the
|
||||
# theory that further-from-:00 wins. Empirically 2026-05-04
|
||||
# that ALSO dropped to ~60 min effective cadence (only ~1
|
||||
# schedule fire per hour — see molecule-core#2726). Detection
|
||||
# latency was claimed 20 min, actual 60 min.
|
||||
# 2. Avoid colliding with the existing :15 sweep-cf-orphans
|
||||
# and :45 sweep-cf-tunnels — both hit the CF API and we
|
||||
# don't want to fight for rate-limit tokens.
|
||||
# 3. Avoid the :30 heavy slot (canary-staging /30, sweep-aws-
|
||||
# secrets, sweep-stale-e2e-orgs every :15) — multiple
|
||||
# overlapping cron registrations on the same minute is part
|
||||
# of what GH drops under load.
|
||||
# Solution: bump fires-per-hour 3 → 6 AND keep all slots in clean
|
||||
# lanes (1-3 min away from any other cron). Even with empirically-
|
||||
# observed ~67% GH drop ratio, 6 attempts/hour yields ~2 effective
|
||||
# fires = ~30 min cadence; closer to the 20-min target than the
|
||||
# current shape and provides a real degradation alarm if drops
|
||||
# get worse.
|
||||
- cron: '2,12,22,32,42,52 * * * *'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
runtime:
|
||||
description: "Runtime to provision (claude-code = default + cheapest via MiniMax; langgraph = OpenAI-only; hermes = SDK-native path, slower)"
|
||||
required: false
|
||||
default: "claude-code"
|
||||
type: string
|
||||
model_slug:
|
||||
description: "Model id to provision the workspace with (default MiniMax-M2.7-highspeed; e.g. 'sonnet' to test direct Anthropic, 'openai/gpt-4o' for hermes)"
|
||||
required: false
|
||||
default: "MiniMax-M2.7-highspeed"
|
||||
type: string
|
||||
keep_org:
|
||||
description: "Skip teardown for post-mortem debugging (only manual dispatch — never set this for cron runs)"
|
||||
required: false
|
||||
default: false
|
||||
type: boolean
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
# No issue-write here — failures surface as red runs in the workflow
|
||||
# history. If you want auto-issue-on-fail, add a follow-up step that
|
||||
# uses gh issue create gated on `if: failure()`. Keeping the surface
|
||||
# minimal until that's actually wanted.
|
||||
|
||||
# Serialize so two firings can never overlap. Cron is scheduled every 10 min
|
||||
# and scripts are conservatively bounded at 10 min — overlap shouldn't
|
||||
# happen in steady state, but if a run hangs we don't want N more
|
||||
# stacking up.
|
||||
concurrency:
|
||||
group: continuous-synth-e2e
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
synth:
|
||||
name: Synthetic E2E against staging
|
||||
runs-on: ubuntu-latest
|
||||
# Bumped from 12 → 20 (2026-05-04). Tenant user-data install phase
|
||||
# (apt-get update + install docker.io/jq/awscli/caddy + snap install
|
||||
# ssm-agent) runs from raw Ubuntu on every boot — none of it is
|
||||
# pre-baked into the tenant AMI. Empirical fetch_secrets/ok timing
|
||||
# across today's canaries: 51s → 82s → 143s → 625s. apt-mirror tail
|
||||
# latency drives the boot-to-fetch_secrets phase from ~1min to >10min.
|
||||
# A 12min budget leaves only ~2min for the workspace (which needs
|
||||
# ~3.5min for claude-code cold boot) on slow-apt days, blowing the
|
||||
# budget. 20min absorbs the worst tenant tail so the workspace probe
|
||||
# gets the full ~7min it needs even on a slow apt day. Real fix:
|
||||
# pre-bake caddy + ssm-agent into the tenant AMI (controlplane#TBD).
|
||||
timeout-minutes: 20
|
||||
env:
|
||||
# claude-code default: cold-start ~5 min (comparable to langgraph),
|
||||
# but uses MiniMax-M2.7-highspeed via the template's third-party-
|
||||
# Anthropic-compat path (workspace-configs-templates/claude-code-
|
||||
# default/config.yaml:64-69). MiniMax is ~5-10x cheaper than
|
||||
# gpt-4.1-mini per token AND avoids the recurring OpenAI quota-
|
||||
# exhaustion class that took the canary down 2026-05-03 (#265).
|
||||
# Operators can pick langgraph / hermes via workflow_dispatch
|
||||
# when they specifically need to exercise the OpenAI or SDK-
|
||||
# native paths.
|
||||
E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }}
|
||||
# Pin the canary to a specific MiniMax model rather than relying
|
||||
# on the per-runtime default ("sonnet" → routes to direct
|
||||
# Anthropic, defeats the cost saving). Operators can override
|
||||
# via workflow_dispatch by setting a different `model_slug`
|
||||
# input if they need to exercise a specific model. M2.7-highspeed
|
||||
# is "Token Plan only" but cheap-per-token and fast.
|
||||
E2E_MODEL_SLUG: ${{ github.event.inputs.model_slug || 'MiniMax-M2.7-highspeed' }}
|
||||
# Bound to 10 min so a stuck provision fails the run instead of
|
||||
# holding up the next cron firing. 15-min default in the script
|
||||
# is for the on-PR full lifecycle where we have more headroom.
|
||||
E2E_PROVISION_TIMEOUT_SECS: '600'
|
||||
# Slug suffix — namespaced "synth-" so these runs are
|
||||
# distinguishable from PR-driven runs in CP admin.
|
||||
E2E_RUN_ID: synth-${{ github.run_id }}
|
||||
# Forced false for cron; respected for manual dispatch
|
||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '' }}
|
||||
MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
||||
# MiniMax key is the canary's PRIMARY auth path. claude-code
|
||||
# template's `minimax` provider routes ANTHROPIC_BASE_URL to
|
||||
# api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot.
|
||||
# tests/e2e/test_staging_full_saas.sh branches SECRETS_JSON on
|
||||
# which key is present — MiniMax wins when set.
|
||||
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
|
||||
# Direct-Anthropic alternative for operators who don't want to
|
||||
# set up a MiniMax account (priority below MiniMax — first
|
||||
# non-empty wins in test_staging_full_saas.sh's secrets-injection
|
||||
# block). See #2578 PR comment for the rationale.
|
||||
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
|
||||
# OpenAI fallback — kept wired so operators can dispatch with
|
||||
# E2E_RUNTIME=langgraph or =hermes and still have a working
|
||||
# canary path. The script picks the right blob shape based on
|
||||
# which key is non-empty.
|
||||
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }}
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify required secrets present
|
||||
run: |
|
||||
# Hard-fail on missing secret REGARDLESS of trigger. Previously
|
||||
# this step soft-skipped on workflow_dispatch via `exit 0`, but
|
||||
# `exit 0` only ends the STEP — subsequent steps still ran with
|
||||
# the empty secret, the synth script fell through to the wrong
|
||||
# SECRETS_JSON branch, and the canary failed 5 min later with a
|
||||
# confusing "Agent error (Exception)" instead of the clean
|
||||
# "secret missing" message at the top. Caught 2026-05-04 by
|
||||
# dispatched run 25296530706: claude-code + missing MINIMAX
|
||||
# silently used OpenAI keys but kept model=MiniMax-M2.7, then
|
||||
# the workspace 401'd against MiniMax once it tried to call.
|
||||
# Fix: exit 1 in both cron and dispatch paths. Operators who
|
||||
# want to verify a YAML change without setting up the secret
|
||||
# can read the verify-secrets step's stderr — the failure is
|
||||
# itself the verification signal.
|
||||
if [ -z "${MOLECULE_ADMIN_TOKEN:-}" ]; then
|
||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret missing — synth E2E cannot run"
|
||||
echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# LLM-key requirement is per-runtime: claude-code accepts
|
||||
# EITHER MiniMax OR direct-Anthropic (whichever is set first),
|
||||
# langgraph + hermes use OpenAI (MOLECULE_STAGING_OPENAI_KEY).
|
||||
case "${E2E_RUNTIME}" in
|
||||
claude-code)
|
||||
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
||||
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
|
||||
required_secret_value="${E2E_MINIMAX_API_KEY}"
|
||||
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
|
||||
required_secret_name="MOLECULE_STAGING_ANTHROPIC_API_KEY"
|
||||
required_secret_value="${E2E_ANTHROPIC_API_KEY}"
|
||||
else
|
||||
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY or MOLECULE_STAGING_ANTHROPIC_API_KEY"
|
||||
required_secret_value=""
|
||||
fi
|
||||
;;
|
||||
langgraph|hermes)
|
||||
required_secret_name="MOLECULE_STAGING_OPENAI_KEY"
|
||||
required_secret_value="${E2E_OPENAI_API_KEY:-}"
|
||||
;;
|
||||
*)
|
||||
echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
|
||||
required_secret_name=""
|
||||
required_secret_value="present"
|
||||
;;
|
||||
esac
|
||||
if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
|
||||
echo "::error::${required_secret_name} secret missing — runtime=${E2E_RUNTIME} cannot authenticate against its LLM provider"
|
||||
echo "::error::Set it at Settings → Secrets and Variables → Actions, OR dispatch with a different runtime"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Install required tools
|
||||
run: |
|
||||
# The script depends on jq + curl (already on ubuntu-latest)
|
||||
# and python3 (likewise). Verify they're all present so we
|
||||
# fail fast on a runner image regression rather than mid-script.
|
||||
for cmd in jq curl python3; do
|
||||
command -v "$cmd" >/dev/null 2>&1 || {
|
||||
echo "::error::required tool '$cmd' not on PATH — runner image regression?"
|
||||
exit 1
|
||||
}
|
||||
done
|
||||
|
||||
- name: Run synthetic E2E
|
||||
# The script handles its own teardown via EXIT trap; even on
|
||||
# failure (timeout, assertion), the org is deprovisioned and
|
||||
# leaks are reported. Exit code propagates from the script.
|
||||
run: |
|
||||
bash tests/e2e/test_staging_full_saas.sh
|
||||
|
||||
- name: Failure summary
|
||||
# Runs only on failure. Adds a job summary so the workflow run
|
||||
# page shows a quick "what happened" instead of forcing readers
|
||||
# to scroll through script output.
|
||||
if: failure()
|
||||
run: |
|
||||
{
|
||||
echo "## Continuous synth E2E failed"
|
||||
echo ""
|
||||
echo "**Run ID:** ${{ github.run_id }}"
|
||||
echo "**Trigger:** ${{ github.event_name }}"
|
||||
echo "**Runtime:** ${E2E_RUNTIME}"
|
||||
echo "**Slug:** synth-${{ github.run_id }}"
|
||||
echo ""
|
||||
echo "### What this means"
|
||||
echo ""
|
||||
echo "Staging just regressed on a path that previously worked. Likely classes:"
|
||||
echo "- Schema mismatch between sender and receiver (#2345 class)"
|
||||
echo "- Deployment-pipeline gap (RFC #2312 / staging-tenant-image-stale class)"
|
||||
echo "- Vendor outage (Cloudflare, Railway, AWS, GHCR)"
|
||||
echo "- Staging-CP env var rotation"
|
||||
echo ""
|
||||
echo "### Next steps"
|
||||
echo ""
|
||||
echo "1. Check the script output above for the assertion that failed"
|
||||
echo "2. If it's a vendor outage, no action needed — next firing in ~20 min"
|
||||
echo "3. If it's a code regression, find the causing PR via \`git log\` against last green run and revert/fix"
|
||||
echo "4. Keep an eye on the next 1-2 firings — flake vs persistent fail differs in priority"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
@@ -1,307 +0,0 @@
|
||||
name: E2E API Smoke Test
|
||||
# Extracted from ci.yml so workflow-level concurrency can protect this job
|
||||
# from run-level cancellation (issue #458).
|
||||
#
|
||||
# Trigger model (revised 2026-04-29):
|
||||
#
|
||||
# Always FIRES on push/pull_request to staging+main. Real work is gated
|
||||
# per-step on `needs.detect-changes.outputs.api` — when paths under
|
||||
# `workspace-server/`, `tests/e2e/`, or this workflow file haven't
|
||||
# changed, the no-op step alone runs and emits SUCCESS for the
|
||||
# `E2E API Smoke Test` check, satisfying branch protection without
|
||||
# spending CI cycles. See the in-job comment on the `e2e-api` job for
|
||||
# why this is one job (not two-jobs-sharing-name) and the 2026-04-29
|
||||
# PR #2264 incident that drove the consolidation.
|
||||
#
|
||||
# Parallel-safety (Class B Hongming-owned CICD red sweep, 2026-05-08)
|
||||
# -------------------------------------------------------------------
|
||||
# Same substrate hazard as PR #98 (handlers-postgres-integration). Our
|
||||
# Gitea act_runner runs with `container.network: host` (operator host
|
||||
# `/opt/molecule/runners/config.yaml`), which means:
|
||||
#
|
||||
# * Two concurrent runs both try to bind their `-p 15432:5432` /
|
||||
# `-p 16379:6379` host ports — the second postgres/redis FATALs
|
||||
# with `Address in use` and `docker run` returns exit 125 with
|
||||
# `Conflict. The container name "/molecule-ci-postgres" is already
|
||||
# in use by container ...`. Verified in run a7/2727 on 2026-05-07.
|
||||
# * The fixed container names `molecule-ci-postgres` / `-redis` (the
|
||||
# pre-fix shape) collide on name AS WELL AS port. The cleanup-with-
|
||||
# `docker rm -f` at the start of the second job KILLS the first
|
||||
# job's still-running postgres/redis.
|
||||
#
|
||||
# Fix shape (mirrors PR #98's bridge-net pattern, adapted because
|
||||
# platform-server is a Go binary on the host, not a containerised
|
||||
# step):
|
||||
#
|
||||
# 1. Unique container names per run:
|
||||
# pg-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
|
||||
# redis-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
|
||||
# `${RUN_ID}-${RUN_ATTEMPT}` is unique even across reruns of the
|
||||
# same run_id.
|
||||
# 2. Ephemeral host port per run (`-p 0:5432`), then read the actual
|
||||
# bound port via `docker port` and export DATABASE_URL/REDIS_URL
|
||||
# pointing at it. No fixed host-port → no port collision.
|
||||
# 3. `127.0.0.1` (NOT `localhost`) in URLs — IPv6 first-resolve was
|
||||
# the original flake fixed in #92 and the script's still IPv6-
|
||||
# enabled.
|
||||
# 4. `if: always()` cleanup so containers don't leak when test steps
|
||||
# fail.
|
||||
#
|
||||
# Issue #94 items #2 + #3 (also fixed here):
|
||||
# * Pre-pull `alpine:latest` so the platform-server's provisioner
|
||||
# (`internal/handlers/container_files.go`) can stand up its
|
||||
# ephemeral token-write helper without a daemon.io round-trip.
|
||||
# * Create `molecule-core-net` bridge network if missing so the
|
||||
# provisioner's container.HostConfig {NetworkMode: ...} attach
|
||||
# succeeds.
|
||||
# Item #1 (timeouts) — evidence on recent runs (77/3191, ae/4270, 0e/
|
||||
# 2318) shows Postgres ready in 3s, Redis in 1s, Platform in 1s when
|
||||
# they DO come up. Timeouts are not the bottleneck; not bumped.
|
||||
#
|
||||
# Item explicitly NOT fixed here: failing test `Status back online`
|
||||
# fails because the platform's langgraph workspace template image
|
||||
# (ghcr.io/molecule-ai/workspace-template-langgraph:latest) returns
|
||||
# 403 Forbidden post-2026-05-06 GitHub org suspension. That is a
|
||||
# template-registry resolution issue (ADR-002 / local-build mode) and
|
||||
# belongs in a separate change that touches workspace-server, not
|
||||
# this workflow file.
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main, staging]
|
||||
pull_request:
|
||||
branches: [main, staging]
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
# Per-SHA grouping (changed 2026-04-28 from per-ref). Per-ref had the
|
||||
# same auto-promote-staging brittleness as e2e-staging-canvas — back-
|
||||
# to-back staging pushes share refs/heads/staging, so the older push's
|
||||
# queued run gets cancelled when a newer push lands. Auto-promote-
|
||||
# staging then sees `completed/cancelled` for the older SHA and stays
|
||||
# put; the newer SHA's gates may eventually save the day, but if the
|
||||
# newer push gets cancelled too, we deadlock.
|
||||
#
|
||||
# See e2e-staging-canvas.yml's identical concurrency block for the full
|
||||
# rationale and the 2026-04-28 incident reference.
|
||||
group: e2e-api-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
detect-changes:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
api: ${{ steps.decide.outputs.api }}
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
|
||||
id: filter
|
||||
with:
|
||||
filters: |
|
||||
api:
|
||||
- 'workspace-server/**'
|
||||
- 'tests/e2e/**'
|
||||
- '.github/workflows/e2e-api.yml'
|
||||
- id: decide
|
||||
# Always run real work for manual dispatch — no diff context to
|
||||
# filter against and ops dispatching this expects the suite to
|
||||
# actually exercise the platform.
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
echo "api=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "api=${{ steps.filter.outputs.api }}" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
# ONE job (no job-level `if:`) that always runs and reports under the
|
||||
# required-check name `E2E API Smoke Test`. Real work is gated per-step
|
||||
# on `needs.detect-changes.outputs.api`. Reason: GitHub registers a
|
||||
# check run for every job that matches `name:`, and a job-level
|
||||
# `if: false` produces a SKIPPED check run. Branch protection treats
|
||||
# all check runs with a matching context name on the latest commit as a
|
||||
# SET — any SKIPPED in the set fails the required-check eval, even with
|
||||
# SUCCESS siblings. Verified 2026-04-29 on PR #2264 (staging→main):
|
||||
# 4 check runs (2 SKIPPED + 2 SUCCESS) at the head SHA blocked
|
||||
# promotion despite all real work succeeding. Collapsing to a single
|
||||
# always-running job with conditional steps emits exactly one SUCCESS
|
||||
# check run regardless of paths filter — branch-protection-clean.
|
||||
e2e-api:
|
||||
needs: detect-changes
|
||||
name: E2E API Smoke Test
|
||||
runs-on: docker-host
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
# Unique per-run container names so concurrent runs on the host-
|
||||
# network act_runner don't collide on name OR port.
|
||||
# `${RUN_ID}-${RUN_ATTEMPT}` stays unique across reruns of the
|
||||
# same run_id. PG_PORT / REDIS_PORT are set later (after docker port lookup) since
|
||||
# we let Docker assign an ephemeral host port.
|
||||
PG_CONTAINER: pg-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
REDIS_CONTAINER: redis-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
PORT: "8080"
|
||||
steps:
|
||||
- name: No-op pass (paths filter excluded this commit)
|
||||
if: needs.detect-changes.outputs.api != 'true'
|
||||
run: |
|
||||
echo "No workspace-server / tests/e2e / workflow changes — E2E API gate satisfied without running tests."
|
||||
echo "::notice::E2E API Smoke Test no-op pass (paths filter excluded this commit)."
|
||||
- if: needs.detect-changes.outputs.api == 'true'
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- if: needs.detect-changes.outputs.api == 'true'
|
||||
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
||||
with:
|
||||
go-version: 'stable'
|
||||
cache: true
|
||||
cache-dependency-path: workspace-server/go.sum
|
||||
- name: Pre-pull alpine + ensure provisioner network (Issue #94 items #2 + #3)
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
# Provisioner uses alpine:latest for ephemeral token-write
|
||||
# containers (workspace-server/internal/handlers/container_files.go).
|
||||
# Pre-pull so the first provision in test_api.sh doesn't race
|
||||
# the daemon's pull cache. Idempotent — `docker pull` is a no-op
|
||||
# when the image is already present.
|
||||
docker pull alpine:latest >/dev/null
|
||||
# Provisioner attaches workspace containers to
|
||||
# molecule-core-net (workspace-server/internal/provisioner/
|
||||
# provisioner.go::DefaultNetwork). The bridge already exists on
|
||||
# the operator host's docker daemon — `network create` is
|
||||
# idempotent via `|| true`.
|
||||
docker network create molecule-core-net >/dev/null 2>&1 || true
|
||||
echo "alpine:latest pre-pulled; molecule-core-net ensured."
|
||||
- name: Start Postgres (docker)
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
# Defensive cleanup — only matches THIS run's container name,
|
||||
# so it cannot kill a sibling run's postgres. (Pre-fix the
|
||||
# name was static and this rm hit other runs' containers.)
|
||||
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
||||
# `-p 0:5432` requests an ephemeral host port; we read it back
|
||||
# below and export DATABASE_URL.
|
||||
docker run -d --name "$PG_CONTAINER" \
|
||||
-e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule \
|
||||
-p 0:5432 postgres:16 >/dev/null
|
||||
# Resolve the host-side port assignment. `docker port` prints
|
||||
# `0.0.0.0:NNNN` (and on host-net runners may also print an
|
||||
# IPv6 line — take the first IPv4 line).
|
||||
PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
|
||||
if [ -z "$PG_PORT" ]; then
|
||||
# Fallback: any first line. Some Docker versions print only
|
||||
# one line.
|
||||
PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | head -1 | awk -F: '{print $NF}')
|
||||
fi
|
||||
if [ -z "$PG_PORT" ]; then
|
||||
echo "::error::Could not resolve host port for $PG_CONTAINER"
|
||||
docker port "$PG_CONTAINER" 5432/tcp || true
|
||||
docker logs "$PG_CONTAINER" || true
|
||||
exit 1
|
||||
fi
|
||||
# 127.0.0.1 (NOT localhost) — IPv6 first-resolve flake (#92).
|
||||
echo "PG_PORT=${PG_PORT}" >> "$GITHUB_ENV"
|
||||
echo "DATABASE_URL=postgres://dev:dev@127.0.0.1:${PG_PORT}/molecule?sslmode=disable" >> "$GITHUB_ENV"
|
||||
echo "Postgres host port: ${PG_PORT}"
|
||||
for i in $(seq 1 30); do
|
||||
if docker exec "$PG_CONTAINER" pg_isready -U dev >/dev/null 2>&1; then
|
||||
echo "Postgres ready after ${i}s"
|
||||
exit 0
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "::error::Postgres did not become ready in 30s"
|
||||
docker logs "$PG_CONTAINER" || true
|
||||
exit 1
|
||||
- name: Start Redis (docker)
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
|
||||
docker run -d --name "$REDIS_CONTAINER" -p 0:6379 redis:7 >/dev/null
|
||||
REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
|
||||
if [ -z "$REDIS_PORT" ]; then
|
||||
REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | head -1 | awk -F: '{print $NF}')
|
||||
fi
|
||||
if [ -z "$REDIS_PORT" ]; then
|
||||
echo "::error::Could not resolve host port for $REDIS_CONTAINER"
|
||||
docker port "$REDIS_CONTAINER" 6379/tcp || true
|
||||
docker logs "$REDIS_CONTAINER" || true
|
||||
exit 1
|
||||
fi
|
||||
echo "REDIS_PORT=${REDIS_PORT}" >> "$GITHUB_ENV"
|
||||
echo "REDIS_URL=redis://127.0.0.1:${REDIS_PORT}" >> "$GITHUB_ENV"
|
||||
echo "Redis host port: ${REDIS_PORT}"
|
||||
for i in $(seq 1 15); do
|
||||
if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then
|
||||
echo "Redis ready after ${i}s"
|
||||
exit 0
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "::error::Redis did not become ready in 15s"
|
||||
docker logs "$REDIS_CONTAINER" || true
|
||||
exit 1
|
||||
- name: Build platform
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
working-directory: workspace-server
|
||||
run: go build -o platform-server ./cmd/server
|
||||
- name: Start platform (background)
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
working-directory: workspace-server
|
||||
run: |
|
||||
# DATABASE_URL + REDIS_URL exported by the start-postgres /
|
||||
# start-redis steps point at this run's per-run host ports.
|
||||
./platform-server > platform.log 2>&1 &
|
||||
echo $! > platform.pid
|
||||
- name: Wait for /health
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
for i in $(seq 1 30); do
|
||||
if curl -sf http://127.0.0.1:8080/health > /dev/null; then
|
||||
echo "Platform up after ${i}s"
|
||||
exit 0
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "::error::Platform did not become healthy in 30s"
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
- name: Assert migrations applied
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc "SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'")
|
||||
if [ "$tables" != "1" ]; then
|
||||
echo "::error::Migrations did not apply"
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
fi
|
||||
echo "Migrations OK"
|
||||
- name: Run E2E API tests
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_api.sh
|
||||
- name: Run notify-with-attachments E2E
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_notify_attachments_e2e.sh
|
||||
- name: Run priority-runtimes E2E (claude-code + hermes — skips when keys absent)
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_priority_runtimes_e2e.sh
|
||||
- name: Run poll-mode + since_id cursor E2E (#2339)
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_poll_mode_e2e.sh
|
||||
- name: Run poll-mode chat upload E2E (RFC #2891)
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_poll_mode_chat_upload_e2e.sh
|
||||
- name: Dump platform log on failure
|
||||
if: failure() && needs.detect-changes.outputs.api == 'true'
|
||||
run: cat workspace-server/platform.log || true
|
||||
- name: Stop platform
|
||||
if: always() && needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
if [ -f workspace-server/platform.pid ]; then
|
||||
kill "$(cat workspace-server/platform.pid)" 2>/dev/null || true
|
||||
fi
|
||||
- name: Stop service containers
|
||||
# always() so containers don't leak when test steps fail. The
|
||||
# cleanup is best-effort: if the container is already gone
|
||||
# (e.g. concurrent rerun race), don't fail the job.
|
||||
if: always() && needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
||||
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
|
||||
@@ -1,216 +0,0 @@
|
||||
name: E2E Staging Canvas (Playwright)
|
||||
|
||||
# Playwright test suite that provisions a fresh staging org per run and
|
||||
# verifies every workspace-panel tab renders without crashing. Complements
|
||||
# e2e-staging-saas.yml (which tests the API shape) by exercising the
|
||||
# actual browser + canvas bundle against live staging.
|
||||
#
|
||||
# Triggers: push to main/staging or PR touching canvas sources + this workflow,
|
||||
# manual dispatch, and weekly cron to catch browser/runtime drift even
|
||||
# when canvas is quiet.
|
||||
# Added staging to push/pull_request branches so the auto-promote gate
|
||||
# check (--event push --branch staging) can see a completed run for this
|
||||
# workflow — mirrors what PR #1891 does for e2e-api.yml.
|
||||
|
||||
on:
|
||||
# Trigger model (revised 2026-04-29):
|
||||
#
|
||||
# Always fires on push/pull_request; real work is gated per-step on
|
||||
# `needs.detect-changes.outputs.canvas`. When canvas/ paths haven't
|
||||
# changed, the no-op step alone runs and emits SUCCESS for the
|
||||
# `Canvas tabs E2E` check, satisfying branch protection without
|
||||
# spending CI cycles. See e2e-api.yml for the rationale on why this
|
||||
# is a single job rather than two-jobs-sharing-name.
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Weekly on Sunday 08:00 UTC — catches Chrome / Playwright / Next.js
|
||||
# release-note-shaped regressions that don't ride in with a PR.
|
||||
- cron: '0 8 * * 0'
|
||||
|
||||
concurrency:
|
||||
# Per-SHA grouping (changed 2026-04-28 from a single global group). The
|
||||
# global group made auto-promote-staging brittle: when a staging push
|
||||
# queued behind an in-flight run and a third entrant (a PR run, a
|
||||
# follow-on push) entered the group, the staging push got cancelled —
|
||||
# leaving auto-promote-staging looking at `completed/cancelled` for a
|
||||
# required gate and refusing to advance main. Observed 2026-04-28
|
||||
# 23:51-23:53 on staging tip 3f99fede.
|
||||
#
|
||||
# The original intent of the global group was to throttle parallel
|
||||
# E2E provisions (each spins a fresh EC2). At our scale that throttle
|
||||
# isn't worth the correctness cost — fresh-org-per-run isolates the
|
||||
# state, and the cost of two parallel runs (~$0.001/min × 10min × 2)
|
||||
# is rounding error vs. the cost of a stuck pipeline.
|
||||
#
|
||||
# Per-SHA still dedupes accidental double-triggers for the SAME SHA.
|
||||
# It does NOT cancel obsolete-PR-version runs on force-push; that
|
||||
# wasted CI is acceptable given the alternative is losing staging-tip
|
||||
# data that auto-promote-staging needs.
|
||||
group: e2e-staging-canvas-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
detect-changes:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
canvas: ${{ steps.decide.outputs.canvas }}
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
|
||||
id: filter
|
||||
with:
|
||||
filters: |
|
||||
canvas:
|
||||
- 'canvas/**'
|
||||
- '.github/workflows/e2e-staging-canvas.yml'
|
||||
- id: decide
|
||||
# Always run real tests for manual dispatch and the weekly cron —
|
||||
# both exist precisely to exercise the suite, regardless of diff.
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "${{ github.event_name }}" = "schedule" ]; then
|
||||
echo "canvas=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "canvas=${{ steps.filter.outputs.canvas }}" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
# ONE job (no job-level `if:`) that always runs and reports under the
|
||||
# required-check name `Canvas tabs E2E`. Real work is gated per-step on
|
||||
# `needs.detect-changes.outputs.canvas`. See e2e-api.yml for the full
|
||||
# rationale — same path-filter check-name parity issue blocked PR #2264
|
||||
# (staging→main) on 2026-04-29 because branch protection treats matching-
|
||||
# name check runs as a SET, and any SKIPPED member fails the eval.
|
||||
playwright:
|
||||
needs: detect-changes
|
||||
name: Canvas tabs E2E
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 40
|
||||
|
||||
env:
|
||||
CANVAS_E2E_STAGING: '1'
|
||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
working-directory: canvas
|
||||
|
||||
steps:
|
||||
- name: No-op pass (paths filter excluded this commit)
|
||||
if: needs.detect-changes.outputs.canvas != 'true'
|
||||
working-directory: .
|
||||
run: |
|
||||
echo "No canvas / workflow changes — E2E Staging Canvas gate satisfied without running tests."
|
||||
echo "::notice::E2E Staging Canvas no-op pass (paths filter excluded this commit)."
|
||||
|
||||
- if: needs.detect-changes.outputs.canvas == 'true'
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify admin token present
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
run: |
|
||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
||||
echo "::error::Missing MOLECULE_STAGING_ADMIN_TOKEN"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
- name: Set up Node
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
|
||||
with:
|
||||
node-version: '20'
|
||||
cache: 'npm'
|
||||
cache-dependency-path: canvas/package-lock.json
|
||||
|
||||
- name: Install canvas deps
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
run: npm ci
|
||||
|
||||
- name: Install Playwright browsers
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
timeout-minutes: 10
|
||||
run: npx playwright install --with-deps chromium
|
||||
|
||||
- name: Run staging canvas E2E
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
run: npx playwright test --config=playwright.staging.config.ts
|
||||
|
||||
- name: Upload Playwright report on failure
|
||||
if: failure() && needs.detect-changes.outputs.canvas == 'true'
|
||||
# Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
|
||||
# the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
|
||||
# implement (see ci.yml upload step for the canonical error
|
||||
# cite). Drop this pin when Gitea ships the v4 protocol.
|
||||
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
|
||||
with:
|
||||
name: playwright-report-staging
|
||||
path: canvas/playwright-report-staging/
|
||||
retention-days: 14
|
||||
|
||||
- name: Upload screenshots on failure
|
||||
if: failure() && needs.detect-changes.outputs.canvas == 'true'
|
||||
# Pinned to v3 for Gitea act_runner v0.6 compatibility (see above).
|
||||
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
|
||||
with:
|
||||
name: playwright-screenshots
|
||||
path: canvas/test-results/
|
||||
retention-days: 14
|
||||
|
||||
# Safety-net teardown — fires only when Playwright's globalTeardown
|
||||
# didn't (worker crash, runner cancel). Reads the slug from
|
||||
# canvas/.playwright-staging-state.json (written by staging-setup
|
||||
# as its first action, before any CP call) and deletes only that
|
||||
# slug.
|
||||
#
|
||||
# Earlier versions of this step pattern-swept `e2e-canvas-<today>-*`
|
||||
# orgs to compensate for setup-crash-before-state-file-write. That
|
||||
# over-aggressive cleanup raced concurrent canvas-E2E runs and
|
||||
# poisoned each other's tenants — observed 2026-04-30 when three
|
||||
# real-test runs killed each other mid-test, surfacing as
|
||||
# `getaddrinfo ENOTFOUND` once CP had cleaned up the just-deleted
|
||||
# DNS record. Pattern-sweep removed; setup now writes the state
|
||||
# file before any CP work, so the slug is always recoverable.
|
||||
- name: Teardown safety net
|
||||
if: always() && needs.detect-changes.outputs.canvas == 'true'
|
||||
env:
|
||||
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
run: |
|
||||
set +e
|
||||
STATE_FILE=".playwright-staging-state.json"
|
||||
if [ ! -f "$STATE_FILE" ]; then
|
||||
echo "::notice::No state file at canvas/$STATE_FILE — Playwright globalTeardown handled it (or setup never ran)."
|
||||
exit 0
|
||||
fi
|
||||
slug=$(python3 -c "import json; print(json.load(open('$STATE_FILE')).get('slug',''))")
|
||||
if [ -z "$slug" ]; then
|
||||
echo "::warning::State file present but slug missing; nothing to clean up."
|
||||
exit 0
|
||||
fi
|
||||
echo "Deleting orphan tenant: $slug"
|
||||
# Verify HTTP 2xx instead of `>/dev/null || true` swallowing
|
||||
# failures. A 5xx or timeout previously looked identical to
|
||||
# success, leaving the tenant alive for up to ~45 min until
|
||||
# sweep-stale-e2e-orgs caught it. Surface failures as
|
||||
# workflow warnings naming the slug. Don't `exit 1` — a single
|
||||
# cleanup miss shouldn't fail-flag the canvas test when the
|
||||
# actual smoke check passed; the sweeper is the safety net.
|
||||
# See molecule-controlplane#420.
|
||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
||||
set +e
|
||||
curl -sS -o /tmp/canvas-cleanup.out -w "%{http_code}" \
|
||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$slug\"}" >/tmp/canvas-cleanup.code
|
||||
set -e
|
||||
code=$(cat /tmp/canvas-cleanup.code 2>/dev/null || echo "000")
|
||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
||||
echo "[teardown] deleted $slug (HTTP $code)"
|
||||
else
|
||||
echo "::warning::canvas teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/canvas-cleanup.out 2>/dev/null)"
|
||||
fi
|
||||
exit 0
|
||||
@@ -1,184 +0,0 @@
|
||||
name: E2E Staging External Runtime
|
||||
|
||||
# Regression for the four/five workspaces.status=awaiting_agent transitions
|
||||
# that silently failed in production for five days before migration 046
|
||||
# extended the workspace_status enum (see
|
||||
# workspace-server/migrations/046_workspace_status_awaiting_agent.up.sql).
|
||||
#
|
||||
# Why this is its own workflow (not folded into e2e-staging-saas.yml):
|
||||
# - The full-saas harness defaults to runtime=hermes, never exercises
|
||||
# external-runtime. Adding an `external` parameter to that script
|
||||
# would force every push to staging through both lifecycles in
|
||||
# series, doubling the EC2 cold-start budget.
|
||||
# - The external lifecycle has unique timing (REMOTE_LIVENESS_STALE_AFTER
|
||||
# window, 90s default + sweep interval), which we wait through
|
||||
# deliberately. Folding it into hermes would make the long path
|
||||
# even longer.
|
||||
# - It can run in parallel with the hermes E2E since both create
|
||||
# fresh tenant orgs with distinct slug prefixes (`e2e-ext-...` vs
|
||||
# `e2e-...`).
|
||||
#
|
||||
# Triggers:
|
||||
# - Push to main when any source affecting external runtime,
|
||||
# hibernation, or the migration set changes.
|
||||
# - PR review for the same set.
|
||||
# - Manual workflow_dispatch.
|
||||
# - Daily cron at 07:30 UTC (catches drift on quiet days; staggered
|
||||
# 30 min after e2e-staging-saas.yml's 07:00 UTC cron).
|
||||
#
|
||||
# Concurrency: serialized so two staging pushes don't fight for the
|
||||
# same EC2 quota window. cancel-in-progress=false so a half-rolled
|
||||
# tenant always finishes its teardown.
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'workspace-server/internal/handlers/workspace.go'
|
||||
- 'workspace-server/internal/handlers/registry.go'
|
||||
- 'workspace-server/internal/handlers/workspace_restart.go'
|
||||
- 'workspace-server/internal/registry/healthsweep.go'
|
||||
- 'workspace-server/internal/registry/liveness.go'
|
||||
- 'workspace-server/migrations/**'
|
||||
- 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
|
||||
- 'tests/e2e/test_staging_external_runtime.sh'
|
||||
- '.github/workflows/e2e-staging-external.yml'
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'workspace-server/internal/handlers/workspace.go'
|
||||
- 'workspace-server/internal/handlers/registry.go'
|
||||
- 'workspace-server/internal/handlers/workspace_restart.go'
|
||||
- 'workspace-server/internal/registry/healthsweep.go'
|
||||
- 'workspace-server/internal/registry/liveness.go'
|
||||
- 'workspace-server/migrations/**'
|
||||
- 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
|
||||
- 'tests/e2e/test_staging_external_runtime.sh'
|
||||
- '.github/workflows/e2e-staging-external.yml'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
keep_org:
|
||||
description: "Skip teardown for debugging (only via manual dispatch)"
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
stale_wait_secs:
|
||||
description: "Seconds to wait for the heartbeat-staleness sweep (default 180 = 90s window + 90s buffer)"
|
||||
required: false
|
||||
default: "180"
|
||||
schedule:
|
||||
- cron: '30 7 * * *'
|
||||
|
||||
concurrency:
|
||||
group: e2e-staging-external
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
e2e-staging-external:
|
||||
name: E2E Staging External Runtime
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 25
|
||||
|
||||
env:
|
||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
# github.event.inputs values arrive as strings, and the string 'false' is
# truthy in an expression, so compare explicitly; otherwise a manual
# dispatch with keep_org=false would still set '1' and skip teardown.
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '0' }}
|
||||
E2E_STALE_WAIT_SECS: ${{ github.event.inputs.stale_wait_secs || '180' }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify admin token present
|
||||
run: |
|
||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
||||
# Schedule + push triggers must hard-fail when the token is
|
||||
# missing — silent skip would mask infra rot. Manual dispatch
|
||||
# gets the same hard-fail; an operator running this on a fork
|
||||
# without secrets configured needs to know up-front.
|
||||
echo "::error::MOLECULE_STAGING_ADMIN_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
|
||||
exit 2
|
||||
fi
|
||||
echo "Admin token present ✓"
|
||||
|
||||
- name: CP staging health preflight
|
||||
run: |
|
||||
# `|| true` keeps a curl transport failure (timeout, DNS, refused) from
# aborting the step under an errexit shell before the ::error:: annotation
# below is printed. curl still writes 000 via -w in that case, so $code is
# always populated and the check below reports it.
code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health" || true)
|
||||
if [ "$code" != "200" ]; then
|
||||
echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
|
||||
exit 1
|
||||
fi
|
||||
echo "Staging CP healthy ✓"
|
||||
|
||||
- name: Run external-runtime E2E
|
||||
id: e2e
|
||||
run: bash tests/e2e/test_staging_external_runtime.sh
|
||||
|
||||
# Mirror the e2e-staging-saas.yml safety net: if the runner is
|
||||
# cancelled (e.g. concurrent staging push), the test script's
|
||||
# EXIT trap may not fire, so we sweep e2e-ext-* slugs scoped to
|
||||
# *this* run id.
|
||||
- name: Teardown safety net (runs on cancel/failure)
|
||||
if: always()
|
||||
env:
|
||||
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
run: |
|
||||
set +e
|
||||
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
||||
| python3 -c "
|
||||
import json, sys, os, datetime
|
||||
run_id = os.environ.get('GITHUB_RUN_ID', '')
|
||||
d = json.load(sys.stdin)
|
||||
# Scope STRICTLY to this run id (e2e-ext-YYYYMMDD-<runid>-...)
|
||||
# so concurrent runs and unrelated dev probes are not touched.
|
||||
# Sweep today AND yesterday so a midnight-crossing run still
|
||||
# cleans up its own slug.
|
||||
today = datetime.date.today()
|
||||
yesterday = today - datetime.timedelta(days=1)
|
||||
dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
|
||||
if not run_id:
|
||||
# Without a run id we cannot scope safely; bail rather
|
||||
# than risk deleting unrelated tenants.
|
||||
sys.exit(0)
|
||||
prefixes = tuple(f'e2e-ext-{d}-{run_id}-' for d in dates)
|
||||
for o in d.get('orgs', []):
|
||||
s = o.get('slug', '')
|
||||
if s.startswith(prefixes) and o.get('status') != 'purged':
|
||||
print(s)
|
||||
" 2>/dev/null)
|
||||
if [ -n "$orgs" ]; then
|
||||
echo "Safety-net sweep: deleting leftover orgs:"
|
||||
echo "$orgs"
|
||||
# Per-slug verified DELETE — see molecule-controlplane#420.
|
||||
# `>/dev/null 2>&1` previously hid every failure; surface
|
||||
# non-2xx as workflow warnings so the run page names what
|
||||
# leaked. Sweeper catches the rest within ~45 min.
|
||||
leaks=()
|
||||
for slug in $orgs; do
|
||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
||||
set +e
|
||||
curl -sS -o /tmp/external-cleanup.out -w "%{http_code}" \
|
||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$slug\"}" >/tmp/external-cleanup.code
|
||||
set -e
|
||||
code=$(cat /tmp/external-cleanup.code 2>/dev/null || echo "000")
|
||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
||||
echo "[teardown] deleted $slug (HTTP $code)"
|
||||
else
|
||||
echo "::warning::external teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/external-cleanup.out 2>/dev/null)"
|
||||
leaks+=("$slug")
|
||||
fi
|
||||
done
|
||||
if [ ${#leaks[@]} -gt 0 ]; then
|
||||
echo "::warning::external teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
||||
fi
|
||||
else
|
||||
echo "Safety-net sweep: no leftover orgs to clean."
|
||||
fi
|
||||
@@ -1,246 +0,0 @@
|
||||
name: E2E Staging SaaS (full lifecycle)
|
||||
|
||||
# Dedicated workflow that provisions a fresh staging org per run, exercises
|
||||
# the full workspace lifecycle (register → heartbeat → A2A → delegation →
|
||||
# HMA memory → activity → peers), then tears down and asserts leak-free.
|
||||
#
|
||||
# Why a separate workflow (not folded into ci.yml):
|
||||
# - The run takes ~25-35 min (EC2 boot + cloudflared DNS + provision sweeps +
|
||||
# agent bootstrap), way too slow for every PR.
|
||||
# - Needs its own concurrency group so two pushes don't fight over the
|
||||
# same staging org slug prefix.
|
||||
# - Has its own required secrets (session cookie, admin token) that most
|
||||
# PRs don't need to read.
|
||||
#
|
||||
# Triggers:
|
||||
# - Push to main (regression guard)
|
||||
# - workflow_dispatch (manual re-run from UI)
|
||||
# - Nightly cron (catches drift even when no pushes land)
|
||||
# - Changes to any provisioning-critical file under PR review (opt-in
|
||||
# via the same paths watcher that e2e-api.yml uses)
|
||||
|
||||
on:
|
||||
# Trunk-based (Phase 3 of internal#81): main is the only branch.
|
||||
# Previously this fired on staging push too because staging was a
|
||||
# superset of main and ran the gate ahead of auto-promote; with no
|
||||
# staging branch, main is where E2E gates the deploy.
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'workspace-server/internal/handlers/registry.go'
|
||||
- 'workspace-server/internal/handlers/workspace_provision.go'
|
||||
- 'workspace-server/internal/handlers/a2a_proxy.go'
|
||||
- 'workspace-server/internal/middleware/**'
|
||||
- 'workspace-server/internal/provisioner/**'
|
||||
- 'tests/e2e/test_staging_full_saas.sh'
|
||||
- '.github/workflows/e2e-staging-saas.yml'
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'workspace-server/internal/handlers/registry.go'
|
||||
- 'workspace-server/internal/handlers/workspace_provision.go'
|
||||
- 'workspace-server/internal/handlers/a2a_proxy.go'
|
||||
- 'workspace-server/internal/middleware/**'
|
||||
- 'workspace-server/internal/provisioner/**'
|
||||
- 'tests/e2e/test_staging_full_saas.sh'
|
||||
- '.github/workflows/e2e-staging-saas.yml'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
runtime:
|
||||
description: "Runtime to test (claude-code [default, MiniMax] | hermes [OpenAI] | langgraph [OpenAI])"
|
||||
required: false
|
||||
default: "claude-code"
|
||||
keep_org:
|
||||
description: "Skip teardown for debugging (only use via manual dispatch!)"
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
schedule:
|
||||
# 07:00 UTC every day — catches AMI drift, WorkOS cert rotation,
|
||||
# Cloudflare API regressions, etc. even on quiet days.
|
||||
- cron: '0 7 * * *'
|
||||
|
||||
# Serialize: staging has a finite per-hour org creation quota. Two pushes
|
||||
# landing in quick succession should queue, not race. `cancel-in-progress:
|
||||
# false` mirrors e2e-api.yml — GitHub would otherwise cancel the running
|
||||
# teardown step and leave orphan EC2s.
|
||||
concurrency:
|
||||
group: e2e-staging-saas
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
e2e-staging-saas:
|
||||
name: E2E Staging SaaS
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 45
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
env:
|
||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
||||
# Single admin-bearer secret drives provision + tenant-token
|
||||
# retrieval + teardown. Configure in
|
||||
# Settings → Secrets and variables → Actions → Repository secrets.
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
# MiniMax is the PRIMARY LLM auth path post-2026-05-04. Switched
|
||||
# from hermes+OpenAI default after #2578 (the staging OpenAI key
|
||||
# account went over quota and stayed dead for 36+ hours, taking
|
||||
# the full-lifecycle E2E red on every provisioning-critical push).
|
||||
# claude-code template's `minimax` provider routes
|
||||
# ANTHROPIC_BASE_URL to api.minimax.io/anthropic and reads
|
||||
# MINIMAX_API_KEY at boot — separate billing account so an
|
||||
# OpenAI quota collapse no longer wedges the gate. Mirrors the
|
||||
# canary-staging.yml + continuous-synth-e2e.yml migrations.
|
||||
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
|
||||
# Direct-Anthropic alternative for operators who don't want to
|
||||
# set up a MiniMax account (priority below MiniMax — first
|
||||
# non-empty wins in test_staging_full_saas.sh's secrets-injection
|
||||
# block). See #2578 PR comment for the rationale.
|
||||
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
|
||||
# OpenAI fallback — kept wired so an operator-dispatched run with
|
||||
# E2E_RUNTIME=hermes or =langgraph via workflow_dispatch can still
|
||||
# exercise the OpenAI path.
|
||||
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }}
|
||||
E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }}
|
||||
# Pin the model when running on the default claude-code path —
|
||||
# the per-runtime default ("sonnet") routes to direct Anthropic
|
||||
# and defeats the cost saving. Operators can override via the
|
||||
# workflow_dispatch flow (no input wired here yet — runtime
|
||||
# override is enough for ad-hoc).
|
||||
E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'langgraph' && 'openai:gpt-4o' || 'MiniMax-M2.7-highspeed' }}
|
||||
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
# github.event.inputs values arrive as strings, and the string 'false' is
# truthy in an expression, so compare explicitly; otherwise a manual
# dispatch with keep_org=false would still set '1' and skip teardown.
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '0' }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify admin token present
|
||||
run: |
|
||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
||||
echo "::error::MOLECULE_STAGING_ADMIN_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
|
||||
exit 2
|
||||
fi
|
||||
echo "Admin token present ✓"
|
||||
|
||||
- name: Verify LLM key present
|
||||
run: |
|
||||
# Per-runtime key check — claude-code uses MiniMax; hermes /
|
||||
# langgraph (operator-dispatched only) use OpenAI. Hard-fail
|
||||
# rather than soft-skip per #2578's lesson — empty key
|
||||
# silently falls through to the wrong SECRETS_JSON branch and
|
||||
# produces a confusing auth error 5 min later instead of the
|
||||
# clean "secret missing" message at the top.
|
||||
case "${E2E_RUNTIME}" in
|
||||
claude-code)
|
||||
# Either MiniMax OR direct-Anthropic works — first
|
||||
# non-empty wins in the test script's secrets-injection
|
||||
# priority chain.
|
||||
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
||||
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
|
||||
required_secret_value="${E2E_MINIMAX_API_KEY}"
|
||||
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
|
||||
required_secret_name="MOLECULE_STAGING_ANTHROPIC_API_KEY"
|
||||
required_secret_value="${E2E_ANTHROPIC_API_KEY}"
|
||||
else
|
||||
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY or MOLECULE_STAGING_ANTHROPIC_API_KEY"
|
||||
required_secret_value=""
|
||||
fi
|
||||
;;
|
||||
langgraph|hermes)
|
||||
required_secret_name="MOLECULE_STAGING_OPENAI_KEY"
|
||||
required_secret_value="${E2E_OPENAI_API_KEY:-}"
|
||||
;;
|
||||
*)
|
||||
echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
|
||||
required_secret_name=""
|
||||
required_secret_value="present"
|
||||
;;
|
||||
esac
|
||||
if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
|
||||
echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — workspaces will fail at boot with 'No provider API key found'"
|
||||
exit 2
|
||||
fi
|
||||
echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"
|
||||
|
||||
- name: CP staging health preflight
|
||||
run: |
|
||||
# `|| true` keeps a curl transport failure (timeout, DNS, refused) from
# aborting the step under an errexit shell before the ::error:: annotation
# below is printed. curl still writes 000 via -w in that case, so $code is
# always populated and the check below reports it.
code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health" || true)
|
||||
if [ "$code" != "200" ]; then
|
||||
echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
|
||||
exit 1
|
||||
fi
|
||||
echo "Staging CP healthy ✓"
|
||||
|
||||
- name: Run full-lifecycle E2E
|
||||
id: e2e
|
||||
run: bash tests/e2e/test_staging_full_saas.sh
|
||||
|
||||
# Belt-and-braces teardown: the test script itself installs a trap
|
||||
# for EXIT/INT/TERM, but if the GH runner itself is cancelled (e.g.
|
||||
# someone pushes a new commit and workflow concurrency is set to
|
||||
# cancel), the trap may not fire. This `always()` step runs even on
|
||||
# cancellation and attempts the delete a second time. The admin
|
||||
# DELETE endpoint is idempotent so double-invoking is safe.
|
||||
- name: Teardown safety net (runs on cancel/failure)
|
||||
if: always()
|
||||
env:
|
||||
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
run: |
|
||||
# Best-effort: find any e2e-YYYYMMDD-* orgs matching this run and
|
||||
# nuke them. Catches the case where the script died before
|
||||
# exporting its slug.
|
||||
set +e
|
||||
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
||||
| python3 -c "
|
||||
import json, sys, os, datetime
|
||||
run_id = os.environ.get('GITHUB_RUN_ID', '')
|
||||
d = json.load(sys.stdin)
|
||||
# ONLY sweep slugs from *this* CI run. Previously the filter was
|
||||
# f'e2e-{today}-' which stomped on parallel CI runs AND any manual
|
||||
# E2E probes a dev was running against staging (incident 2026-04-21
|
||||
# 15:02Z: this workflow's safety net deleted an unrelated manual
|
||||
# run's tenant 1s after it hit 'running').
|
||||
# Sweep both today AND yesterday's UTC dates so a run that crosses
|
||||
# midnight still matches its own slug — see the 2026-04-26→27
|
||||
# canvas-safety-net incident for the same bug class.
|
||||
today = datetime.date.today()
|
||||
yesterday = today - datetime.timedelta(days=1)
|
||||
dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
|
||||
if run_id:
|
||||
prefixes = tuple(f'e2e-{d}-{run_id}-' for d in dates)
|
||||
else:
|
||||
# Without a run id the sweep cannot be scoped to this CI run. Match nothing
# rather than every e2e-<date>-* tenant, which would delete parallel runs
# and manual probes (the unscoped filter this step moved away from).
prefixes = ()
|
||||
candidates = [o['slug'] for o in d.get('orgs', [])
|
||||
if any(o.get('slug','').startswith(p) for p in prefixes)
|
||||
and o.get('instance_status') not in ('purged',)]
|
||||
print('\n'.join(candidates))
|
||||
" 2>/dev/null)
|
||||
# Per-slug verified DELETE (was `>/dev/null || true` — see
|
||||
# molecule-controlplane#420). Surface non-2xx as a workflow
|
||||
# warning naming the leaked slug; don't exit 1 (sweeper is
|
||||
# the safety net within ~45 min).
|
||||
leaks=()
|
||||
for slug in $orgs; do
|
||||
echo "Safety-net teardown: $slug"
|
||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
||||
set +e
|
||||
curl -sS -o /tmp/saas-cleanup.out -w "%{http_code}" \
|
||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$slug\"}" >/tmp/saas-cleanup.code
|
||||
set -e
|
||||
code=$(cat /tmp/saas-cleanup.code 2>/dev/null || echo "000")
|
||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
||||
echo "[teardown] deleted $slug (HTTP $code)"
|
||||
else
|
||||
echo "::warning::saas teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/saas-cleanup.out 2>/dev/null)"
|
||||
leaks+=("$slug")
|
||||
fi
|
||||
done
|
||||
if [ ${#leaks[@]} -gt 0 ]; then
|
||||
echo "::warning::saas teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
||||
fi
|
||||
exit 0
|
||||
@@ -1,171 +0,0 @@
|
||||
name: E2E Staging Sanity (leak-detection self-check)
|
||||
|
||||
# Periodic assertion that the teardown safety nets in e2e-staging-saas
|
||||
# and canary-staging actually work. Runs the E2E harness with
|
||||
# E2E_INTENTIONAL_FAILURE=1, which poisons the tenant admin token after
|
||||
# the org is provisioned. The workspace-provision step then fails, the
|
||||
# script exits non-zero, and the EXIT trap + workflow always()-step
|
||||
# must still tear down cleanly.
|
||||
#
|
||||
# A green run means:
|
||||
# - The script exited non-zero (intentional failure caught)
|
||||
# - The trap fired teardown
|
||||
# - The leak-detection poll found zero orphan orgs
|
||||
#
|
||||
# A red run means the teardown path itself is broken — act on this the
|
||||
# same way you'd act on a canary failure (the whole E2E safety net is
|
||||
# compromised until it's fixed).
|
||||
#
|
||||
# Cadence: once a week, Monday 06:00 UTC. Drift-slow, not per-PR — the
|
||||
# teardown path rarely changes, and a weekly heartbeat is enough to
|
||||
# catch silent regressions in cleanup code paths.
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 6 * * 1'
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
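# NOTE(review): this group name is specific to this workflow (the full-lifecycle workflow uses e2e-staging-saas), so it serializes sanity runs only with each other and does not prevent collisions with other workflows on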
|
||||
# staging org-create quota.
|
||||
group: e2e-staging-sanity
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
issues: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
sanity:
|
||||
name: Intentional-failure teardown sanity
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 20
|
||||
|
||||
env:
|
||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
E2E_MODE: canary # lean lifecycle; we only need the org to exist
|
||||
E2E_RUNTIME: hermes
|
||||
E2E_RUN_ID: "sanity-${{ github.run_id }}"
|
||||
E2E_INTENTIONAL_FAILURE: "1"
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify admin token present
|
||||
run: |
|
||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
||||
echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Inverted assertion: the run MUST fail. If it passes, the
|
||||
# E2E_INTENTIONAL_FAILURE path is broken (token not being
|
||||
# poisoned correctly, or the harness silently recovered).
|
||||
- name: Run harness — expecting exit !=0
|
||||
id: harness
|
||||
run: |
|
||||
set +e
|
||||
bash tests/e2e/test_staging_full_saas.sh
|
||||
rc=$?
|
||||
echo "harness_rc=$rc" >> "$GITHUB_OUTPUT"
|
||||
# The only acceptable outcomes:
|
||||
# 1 — harness failed mid-run, teardown ran, leak-check passed
|
||||
# (exit 4 means teardown left a leak — that's the real bug
|
||||
# this sanity check exists to catch)
|
||||
if [ "$rc" = "1" ]; then
|
||||
echo "✓ Harness failed as expected (rc=1); teardown trap ran, leak-check passed"
|
||||
exit 0
|
||||
elif [ "$rc" = "0" ]; then
|
||||
echo "::error::Harness succeeded under E2E_INTENTIONAL_FAILURE=1 — the poisoning path is broken"
|
||||
exit 1
|
||||
elif [ "$rc" = "4" ]; then
|
||||
echo "::error::LEAK DETECTED (rc=4) — teardown failed to clean up the org. Safety net broken."
|
||||
exit 4
|
||||
else
|
||||
echo "::error::Unexpected rc=$rc — neither clean-failure nor leak. Investigate harness."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Open issue if safety net is broken
|
||||
if: failure()
|
||||
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
|
||||
with:
|
||||
script: |
|
||||
const title = "🚨 E2E teardown safety net broken";
|
||||
const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
|
||||
const body =
|
||||
`The weekly sanity run (E2E_INTENTIONAL_FAILURE=1) did not exit ` +
|
||||
`as expected. This means one of:\n` +
|
||||
` - poisoning didn't actually cause failure (test harness regression), OR\n` +
|
||||
` - teardown left an orphan org (leak detection caught a real bug)\n\n` +
|
||||
`Run: ${runURL}\n\n` +
|
||||
`This is higher priority than a canary failure — the whole ` +
|
||||
`E2E safety net can't be trusted until this is resolved.`;
|
||||
|
||||
const { data: existing } = await github.rest.issues.listForRepo({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
state: 'open', labels: 'e2e-safety-net',
|
||||
});
|
||||
const match = existing.find(i => i.title === title);
|
||||
if (match) {
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
issue_number: match.number,
|
||||
body: `Still broken. ${runURL}`,
|
||||
});
|
||||
} else {
|
||||
await github.rest.issues.create({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
title, body,
|
||||
labels: ['e2e-safety-net', 'bug', 'priority-high'],
|
||||
});
|
||||
}
|
||||
|
||||
# Belt-and-braces: if teardown left anything behind, nuke it here
|
||||
# so we don't bleed staging quota. Different label from the
|
||||
# always()-steps in the other workflows so sanity-only orgs get
|
||||
# cleaned up by sanity runs.
|
||||
- name: Teardown safety net
|
||||
if: always()
|
||||
env:
|
||||
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
run: |
|
||||
set +e
|
||||
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
||||
| python3 -c "
|
||||
import json, sys
|
||||
d = json.load(sys.stdin)
|
||||
today = __import__('datetime').date.today().strftime('%Y%m%d')
|
||||
candidates = [o['slug'] for o in d.get('orgs', [])
|
||||
if o.get('slug','').startswith(f'e2e-canary-{today}-sanity-')
|
||||
and o.get('status') not in ('purged',)]
|
||||
print('\n'.join(candidates))
|
||||
" 2>/dev/null)
|
||||
# Per-slug verified DELETE — see molecule-controlplane#420.
|
||||
# Failures surface as workflow warnings; the sweeper is the
|
||||
# safety net within ~45 min.
|
||||
leaks=()
|
||||
for slug in $orgs; do
|
||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
||||
set +e
|
||||
curl -sS -o /tmp/sanity-cleanup.out -w "%{http_code}" \
|
||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$slug\"}" >/tmp/sanity-cleanup.code
|
||||
set -e
|
||||
code=$(cat /tmp/sanity-cleanup.code 2>/dev/null || echo "000")
|
||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
||||
echo "[teardown] deleted $slug (HTTP $code)"
|
||||
else
|
||||
echo "::warning::sanity teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/sanity-cleanup.out 2>/dev/null)"
|
||||
leaks+=("$slug")
|
||||
fi
|
||||
done
|
||||
if [ ${#leaks[@]} -gt 0 ]; then
|
||||
echo "::warning::sanity teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
||||
fi
|
||||
exit 0
|
||||
@@ -1,251 +0,0 @@
|
||||
name: Handlers Postgres Integration
|
||||
|
||||
# Real-Postgres integration tests for workspace-server/internal/handlers/.
|
||||
# Triggered on every PR/push that touches the handlers package.
|
||||
#
|
||||
# Why this workflow exists
|
||||
# ------------------------
|
||||
# Strict-sqlmock unit tests pin which SQL statements fire — they're fast
|
||||
# and let us iterate without a DB. But sqlmock CANNOT detect bugs that
|
||||
# depend on the row state AFTER the SQL runs. The result_preview-lost
|
||||
# bug shipped to staging in PR #2854 because every unit test was
|
||||
# satisfied with "an UPDATE statement fired" — none verified the row's
|
||||
# preview field actually landed. The local-postgres E2E that retrofit
|
||||
# self-review caught it took 2 minutes to set up and would have caught
|
||||
# the bug at PR-time.
|
||||
#
|
||||
# Why this workflow does NOT use `services: postgres:` (Class B fix)
|
||||
# ------------------------------------------------------------------
|
||||
# Our act_runner config has `container.network: host` (operator host
|
||||
# /opt/molecule/runners/config.yaml), which act_runner applies to BOTH
|
||||
# the job container AND every service container. With host-net, two
|
||||
# concurrent runs of this workflow both try to bind 0.0.0.0:5432 — the
|
||||
# second postgres FATALs with `could not create any TCP/IP sockets:
|
||||
# Address in use`, and Docker auto-removes it (act_runner sets
|
||||
# AutoRemove:true on service containers). By the time the migrations
|
||||
# step runs `psql`, the postgres container is gone, hence
|
||||
# `Connection refused` then `failed to remove container: No such
|
||||
# container` at cleanup time.
|
||||
#
|
||||
# Per-job `container.network` override is silently ignored by
|
||||
# act_runner — `--network and --net in the options will be ignored.`
|
||||
# appears in the runner log. Documented constraint.
|
||||
#
|
||||
# So we sidestep `services:` entirely. The job container still uses
|
||||
# host-net (inherited from runner config; required for cache server
|
||||
# discovery on the bridge IP 172.18.0.17:42631). We launch a sibling
|
||||
# postgres on the existing `molecule-core-net` bridge with a
|
||||
# UNIQUE name per run — `pg-handlers-${RUN_ID}-${RUN_ATTEMPT}` — and
|
||||
# read its bridge IP via `docker inspect`. A host-net job container
|
||||
# can reach a bridge-net container directly via the bridge IP (verified
|
||||
# manually on operator host 2026-05-08).
|
||||
#
|
||||
# Trade-offs vs. the original `services:` shape:
|
||||
# + No host-port collision; N parallel runs share the bridge cleanly
|
||||
# + `if: always()` cleanup runs even on test-step failure
|
||||
# - One more step in the workflow (+~3 lines)
|
||||
# - Requires `molecule-core-net` to exist on the operator host
|
||||
# (it does; declared in docker-compose.yml + docker-compose.infra.yml)
|
||||
#
|
||||
# Class B Hongming-owned CICD red sweep, 2026-05-08.
|
||||
#
|
||||
# Cost: ~30s job (postgres pull from cache + go build + 4 tests).
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main, staging]
|
||||
pull_request:
|
||||
branches: [main, staging]
|
||||
merge_group:
|
||||
types: [checks_requested]
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: handlers-pg-integ-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
detect-changes:
|
||||
name: detect-changes
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
handlers: ${{ steps.filter.outputs.handlers }}
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
|
||||
id: filter
|
||||
with:
|
||||
filters: |
|
||||
handlers:
|
||||
- 'workspace-server/internal/handlers/**'
|
||||
- 'workspace-server/internal/wsauth/**'
|
||||
- 'workspace-server/migrations/**'
|
||||
- '.github/workflows/handlers-postgres-integration.yml'
|
||||
|
||||
# Single-job-with-per-step-if pattern: always runs to satisfy the
|
||||
# required-check name on branch protection; real work gates on the
|
||||
# paths filter. See ci.yml's Platform (Go) for the same shape.
|
||||
integration:
|
||||
name: Handlers Postgres Integration
|
||||
needs: detect-changes
|
||||
runs-on: docker-host
|
||||
env:
|
||||
# Unique name per run so concurrent jobs don't collide on the
|
||||
# bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
|
||||
# workflow_dispatch reruns of the same run_id.
|
||||
PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
# Bridge network already exists on the operator host (declared
|
||||
# in docker-compose.yml + docker-compose.infra.yml).
|
||||
PG_NETWORK: molecule-core-net
|
||||
defaults:
|
||||
run:
|
||||
working-directory: workspace-server
|
||||
steps:
|
||||
- if: needs.detect-changes.outputs.handlers != 'true'
|
||||
working-directory: .
|
||||
run: echo "No handlers/migrations changes — skipping; this job always runs to satisfy the required-check name."
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
||||
with:
|
||||
go-version: 'stable'
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Start sibling Postgres on bridge network
|
||||
working-directory: .
|
||||
run: |
|
||||
# Sanity: the bridge network must exist on the operator host.
|
||||
# Hard-fail loud if it doesn't — easier to spot than a silent
|
||||
# auto-create that diverges from the rest of the stack.
|
||||
if ! docker network inspect "${PG_NETWORK}" >/dev/null 2>&1; then
|
||||
echo "::error::Bridge network '${PG_NETWORK}' missing on operator host. Re-run docker-compose.infra.yml or check ops handbook."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# If a stale container with the same name exists (rerun on
|
||||
# the same run_id), wipe it first.
|
||||
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
|
||||
|
||||
docker run -d \
|
||||
--name "${PG_NAME}" \
|
||||
--network "${PG_NETWORK}" \
|
||||
--health-cmd "pg_isready -U postgres" \
|
||||
--health-interval 5s \
|
||||
--health-timeout 5s \
|
||||
--health-retries 10 \
|
||||
-e POSTGRES_PASSWORD=test \
|
||||
-e POSTGRES_DB=molecule \
|
||||
postgres:15-alpine >/dev/null
|
||||
|
||||
# Read back the bridge IP. Always present immediately after
|
||||
# `docker run -d` for bridge networks.
|
||||
PG_HOST=$(docker inspect "${PG_NAME}" \
|
||||
--format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}")
|
||||
if [ -z "${PG_HOST}" ]; then
|
||||
echo "::error::Could not resolve PG_HOST for ${PG_NAME} on ${PG_NETWORK}"
|
||||
docker logs "${PG_NAME}" || true
|
||||
exit 1
|
||||
fi
|
||||
echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV"
|
||||
echo "INTEGRATION_DB_URL=postgres://postgres:test@${PG_HOST}:5432/molecule?sslmode=disable" >> "$GITHUB_ENV"
|
||||
echo "Started ${PG_NAME} at ${PG_HOST}:5432"
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Apply migrations to Postgres service
|
||||
env:
|
||||
PGPASSWORD: test
|
||||
run: |
|
||||
# Wait for postgres to actually accept connections. Docker's
|
||||
# health-cmd handles container-side readiness, but the wire
|
||||
# to the bridge IP is best-tested with pg_isready directly.
|
||||
for i in {1..15}; do
|
||||
if pg_isready -h "${PG_HOST}" -p 5432 -U postgres -q; then break; fi
|
||||
echo "waiting for postgres at ${PG_HOST}:5432..."; sleep 2
|
||||
done
|
||||
|
||||
# Apply every migration (*.sql / *.up.sql) in lexicographic order with
|
||||
# ON_ERROR_STOP=1 — failing migrations are SKIPPED rather than
|
||||
# blocking the suite. This handles the current schema state
|
||||
# where a few historical migrations (e.g. 017_memories_fts_*)
|
||||
# depend on tables that were later renamed/dropped and so
|
||||
# cannot replay from scratch. The migrations that DO succeed
|
||||
# land their tables, which is sufficient for the integration
|
||||
# tests in handlers/.
|
||||
#
|
||||
# Why not maintain a curated allowlist: every new migration
|
||||
# touching a handlers/-tested table would have to update this
|
||||
# workflow. With apply-all-or-skip, a future migration that
|
||||
# adds a column to delegations runs automatically (its base
|
||||
# table 049_delegations.up.sql already succeeded above it in
|
||||
# the order). Operators only need to revisit this if the
|
||||
# migration chain becomes legitimately replayable end-to-end.
|
||||
#
|
||||
# Per-migration result is logged so a failed migration that
|
||||
# SHOULD have been replayable surfaces in the CI log instead
|
||||
# of silently failing.
|
||||
# Apply both *.sql (legacy, lives next to its module) and
|
||||
# *.up.sql (newer up/down convention) in a single
|
||||
# lexicographically-sorted pass. Excluding *.down.sql so the
|
||||
# newest-naming-convention pairs don't undo themselves mid-run.
|
||||
# Pre-#149-followup this loop only globbed *.up.sql, which
|
||||
# silently skipped 001_workspaces.sql + 009_activity_logs.sql
|
||||
# — fine while no integration test depended on those tables,
|
||||
# not fine once a cross-table atomicity test came in.
|
||||
set +e
|
||||
for migration in $(ls migrations/*.sql 2>/dev/null | grep -v '\.down\.sql$' | sort); do
|
||||
if psql -h "${PG_HOST}" -U postgres -d molecule -v ON_ERROR_STOP=1 \
|
||||
-f "$migration" >/dev/null 2>&1; then
|
||||
echo "✓ $(basename "$migration")"
|
||||
else
|
||||
echo "⊘ $(basename "$migration") (skipped — see comment in workflow)"
|
||||
fi
|
||||
done
|
||||
set -e
|
||||
|
||||
# Sanity: the delegations, workspaces, activity_logs + pending_uploads tables
|
||||
# MUST exist for the integration tests to be meaningful. Hard-
|
||||
# fail if any didn't land — that would be a real regression we
|
||||
# want loud.
|
||||
for tbl in delegations workspaces activity_logs pending_uploads; do
|
||||
if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
|
||||
-c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
|
||||
| grep -q 1; then
|
||||
echo "::error::$tbl table missing after migration replay — handler integration tests would be meaningless"
|
||||
exit 1
|
||||
fi
|
||||
echo "✓ $tbl table present"
|
||||
done
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Run integration tests
|
||||
run: |
|
||||
# INTEGRATION_DB_URL is exported by the start-postgres step;
|
||||
# points at the per-run bridge IP, not 127.0.0.1, so concurrent
|
||||
# workflow runs don't fight over a host-net 5432 port.
|
||||
go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_"
|
||||
|
||||
- if: failure() && needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Diagnostic dump on failure
|
||||
env:
|
||||
PGPASSWORD: test
|
||||
run: |
|
||||
echo "::group::postgres container status"
|
||||
docker ps -a --filter "name=${PG_NAME}" --format '{{.Status}} {{.Names}}' || true
|
||||
docker logs "${PG_NAME}" 2>&1 | tail -50 || true
|
||||
echo "::endgroup::"
|
||||
echo "::group::delegations table state"
|
||||
psql -h "${PG_HOST}" -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
|
||||
echo "::endgroup::"
|
||||
|
||||
- if: always() && needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Stop sibling Postgres
|
||||
working-directory: .
|
||||
run: |
|
||||
# always() so containers don't leak when migrations or tests
|
||||
# fail. The cleanup is best-effort: if the container is
|
||||
# already gone (e.g. concurrent rerun race), don't fail the job.
|
||||
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
|
||||
echo "Cleaned up ${PG_NAME}"
|
||||
@@ -1,248 +0,0 @@
|
||||
name: Harness Replays
|
||||
|
||||
# Boots tests/harness (production-shape compose topology with TenantGuard,
|
||||
# /cp/* proxy, canvas proxy, real production Dockerfile.tenant) and runs
|
||||
# every replay under tests/harness/replays/. Fails the PR if any replay
|
||||
# fails.
|
||||
#
|
||||
# Why this exists: 2026-04-30 we shipped #2398 which added /buildinfo as
|
||||
# a public route in router.go but forgot to add it to TenantGuard's
|
||||
# allowlist. The handler-level test in buildinfo_test.go constructed a
|
||||
# minimal gin engine without TenantGuard — green. The harness's
|
||||
# buildinfo-stale-image.sh replay would have caught it (cf-proxy doesn't
|
||||
# inject X-Molecule-Org-Id, so the curl path is identical to production's
|
||||
# redeploy verifier), but no one ran the harness pre-merge. The bug
|
||||
# shipped; the redeploy verifier silently soft-warned every tenant as
|
||||
# "unreachable" for ~1 day before being noticed.
|
||||
#
|
||||
# This gate makes "did you actually run the harness?" a CI invariant
|
||||
# instead of a memory-discipline thing.
|
||||
#
|
||||
# Trigger model — match e2e-api.yml: always FIRES on push/pull_request
|
||||
# to staging+main, real work is gated per-step on detect-changes output.
|
||||
# One job → one check run → branch-protection-clean (the SKIPPED-in-set
|
||||
# trap from PR #2264 is documented in e2e-api.yml's e2e-api job comment).
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main, staging]
|
||||
paths:
|
||||
- 'workspace-server/**'
|
||||
- 'canvas/**'
|
||||
- 'tests/harness/**'
|
||||
- '.github/workflows/harness-replays.yml'
|
||||
pull_request:
|
||||
branches: [main, staging]
|
||||
paths:
|
||||
- 'workspace-server/**'
|
||||
- 'canvas/**'
|
||||
- 'tests/harness/**'
|
||||
- '.github/workflows/harness-replays.yml'
|
||||
workflow_dispatch:
|
||||
merge_group:
|
||||
types: [checks_requested]
|
||||
|
||||
concurrency:
|
||||
# Per-SHA grouping. Per-ref kept hitting the auto-promote-staging
|
||||
# cancellation deadlock — see e2e-api.yml's concurrency block for
|
||||
# the 2026-04-28 incident that codified this pattern.
|
||||
group: harness-replays-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
detect-changes:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
run: ${{ steps.decide.outputs.run }}
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- id: decide
|
||||
run: |
|
||||
# workflow_dispatch: always run (manual trigger)
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
echo "run=true" >> "$GITHUB_OUTPUT"
|
||||
echo "debug=manual-trigger" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Determine the base commit to diff against.
|
||||
# For pull_request: use base.sha (the base-branch commit recorded on the PR event, not a computed merge-base).
|
||||
# For push: use github.event.before (the previous tip of the branch).
|
||||
# Fallback for new branches (all-zeros SHA): run everything.
|
||||
if [ "${{ github.event_name }}" = "pull_request" ] && \
|
||||
[ -n "${{ github.event.pull_request.base.sha }}" ]; then
|
||||
BASE="${{ github.event.pull_request.base.sha }}"
|
||||
elif [ -n "${{ github.event.before }}" ] && \
|
||||
! echo "${{ github.event.before }}" | grep -qE '^0+$'; then
|
||||
BASE="${{ github.event.before }}"
|
||||
else
|
||||
# New branch or github.event.before unavailable — run everything.
|
||||
echo "run=true" >> "$GITHUB_OUTPUT"
|
||||
echo "debug=new-branch-fallback" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# GitHub Actions and Gitea Actions both expose github.sha for HEAD.
|
||||
DIFF=$(git diff --name-only "$BASE" "${{ github.sha }}" 2>/dev/null)
|
||||
echo "debug=diff-base=$BASE diff-files=$DIFF" >> "$GITHUB_OUTPUT"
|
||||
|
||||
if echo "$DIFF" | grep -qE '^workspace-server/|^canvas/|^tests/harness/|^.github/workflows/harness-replays\.yml$'; then
|
||||
echo "run=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "run=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
# ONE job that always runs. Real work is gated per-step on
|
||||
# detect-changes.outputs.run so an unrelated PR (e.g. doc-only
|
||||
# change to molecule-controlplane wired here later) emits the
|
||||
# required check without spending CI cycles. Single-job pattern
|
||||
# matches e2e-api.yml — see that workflow's comment for why a
|
||||
# job-level `if: false` would block branch protection via the
|
||||
# SKIPPED-in-set bug.
|
||||
harness-replays:
|
||||
needs: detect-changes
|
||||
name: Harness Replays
|
||||
runs-on: docker-host
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
- name: No-op pass (paths filter excluded this commit)
|
||||
if: needs.detect-changes.outputs.run != 'true'
|
||||
run: |
|
||||
echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
|
||||
echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."
|
||||
echo "::notice::Debug: ${{ needs.detect-changes.outputs.debug }}"
|
||||
|
||||
- if: needs.detect-changes.outputs.run == 'true'
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
# Log what files were detected so future failures include the diff.
|
||||
- name: Log detected changes
|
||||
if: needs.detect-changes.outputs.run == 'true'
|
||||
run: |
|
||||
echo "::notice::detect-changes debug: ${{ needs.detect-changes.outputs.debug }}"
|
||||
|
||||
# github-app-auth sibling-checkout removed 2026-05-07 (#157):
|
||||
# the plugin was dropped + Dockerfile.tenant no longer COPYs it.
|
||||
|
||||
# Pre-clone manifest deps before docker compose builds the tenant
|
||||
# image (Task #173 followup — same pattern as
|
||||
# publish-workspace-server-image.yml's "Pre-clone manifest deps"
|
||||
# step).
|
||||
#
|
||||
# Why pre-clone here too: tests/harness/compose.yml builds tenant-alpha
|
||||
# and tenant-beta from workspace-server/Dockerfile.tenant with
|
||||
# context=../.. (repo root). That Dockerfile expects
|
||||
# .tenant-bundle-deps/{workspace-configs-templates,org-templates,plugins}
|
||||
# to be present at build context root (post-#173 it COPYs from there
|
||||
# instead of running an in-image clone — the in-image clone failed
|
||||
# with "could not read Username for https://git.moleculesai.app"
|
||||
# because there's no auth path inside the build sandbox).
|
||||
#
|
||||
# Without this step harness-replays fails before any replay runs,
|
||||
# with `failed to calculate checksum of ref ...
|
||||
# "/.tenant-bundle-deps/plugins": not found`. Caught by run #892
|
||||
# (main, 2026-05-07T20:28:53Z) and run #964 (staging — same
|
||||
# symptom, different root cause: staging still has the in-image
|
||||
# clone path, hits the auth error directly).
|
||||
#
|
||||
# 2026-05-08 sub-finding (#192): the clone step ALSO fails when
|
||||
# any referenced workspace-template repo is private and the
|
||||
# AUTO_SYNC_TOKEN bearer (devops-engineer persona) lacks read
|
||||
# access. Root cause: 5 of 9 workspace-template repos
|
||||
# (openclaw, codex, crewai, deepagents, gemini-cli) had been
|
||||
# marked private with no team grant. Resolution: flipped them
|
||||
# to public per `feedback_oss_first_repo_visibility_default`
|
||||
# (the OSS surface should be public). Layer-3 (customer-private +
|
||||
# marketplace third-party repos) tracked separately in
|
||||
# internal#102.
|
||||
#
|
||||
# Token shape matches publish-workspace-server-image.yml: AUTO_SYNC_TOKEN
|
||||
# is the devops-engineer persona PAT, NOT the founder PAT (per
|
||||
# `feedback_per_agent_gitea_identity_default`). clone-manifest.sh
|
||||
# embeds it as basic-auth for the duration of the clones and strips
|
||||
# .git directories — the token never enters the resulting image.
|
||||
- name: Pre-clone manifest deps
|
||||
if: needs.detect-changes.outputs.run == 'true'
|
||||
env:
|
||||
MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
|
||||
echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
|
||||
exit 1
|
||||
fi
|
||||
mkdir -p .tenant-bundle-deps
|
||||
bash scripts/clone-manifest.sh \
|
||||
manifest.json \
|
||||
.tenant-bundle-deps/workspace-configs-templates \
|
||||
.tenant-bundle-deps/org-templates \
|
||||
.tenant-bundle-deps/plugins
|
||||
# Sanity-check counts so a silent partial clone fails fast
|
||||
# instead of producing a half-empty image.
|
||||
ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
|
||||
org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
|
||||
plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
|
||||
echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
|
||||
|
||||
- name: Install Python deps for replays
|
||||
# peer-discovery-404 (and future replays) eval Python against the
|
||||
# running tenant — importing workspace/a2a_client.py pulls in
|
||||
# httpx. tests/harness/requirements.txt holds just the HTTP-client
|
||||
# surface to keep CI install fast (~3s) vs the full
|
||||
# workspace/requirements.txt (~30s).
|
||||
if: needs.detect-changes.outputs.run == 'true'
|
||||
run: pip install -r tests/harness/requirements.txt
|
||||
|
||||
- name: Run all replays against the harness
|
||||
# run-all-replays.sh: boot via up.sh → seed via seed.sh → run
|
||||
# every replays/*.sh → tear down via down.sh on EXIT (trap).
|
||||
# Non-zero exit on any replay failure.
|
||||
#
|
||||
# KEEP_UP=1: without this, the script's trap-on-EXIT tears
|
||||
# down containers immediately on failure, leaving the dump
|
||||
# step below with nothing to dump (verified on PR #2410's
|
||||
# first run — tenant became unhealthy, trap fired, dump
|
||||
# step saw empty containers). Keeping them up lets the
|
||||
# failure path collect tenant/cp-stub/cf-proxy logs. The
|
||||
# always-run "Force teardown" step does the actual cleanup.
|
||||
if: needs.detect-changes.outputs.run == 'true'
|
||||
working-directory: tests/harness
|
||||
env:
|
||||
KEEP_UP: "1"
|
||||
run: ./run-all-replays.sh
|
||||
|
||||
- name: Dump compose logs on failure
|
||||
# SECRETS_ENCRYPTION_KEY: docker compose validates the entire compose
|
||||
# file even for read-only `logs` calls. up.sh generates a per-run key
|
||||
# and exports it to its OWN shell — this step runs in a fresh shell
|
||||
# that wouldn't see it, so without a placeholder the validate step
|
||||
# errors before logs print (verified against PR #2492's first run:
|
||||
# "required variable SECRETS_ENCRYPTION_KEY is missing a value").
|
||||
# A placeholder is fine — we're only reading log streams, not booting.
|
||||
if: failure() && needs.detect-changes.outputs.run == 'true'
|
||||
working-directory: tests/harness
|
||||
env:
|
||||
SECRETS_ENCRYPTION_KEY: dump-logs-placeholder
|
||||
run: |
|
||||
echo "=== docker compose ps ==="
|
||||
docker compose -f compose.yml ps || true
|
||||
echo "=== tenant-alpha logs ==="
|
||||
docker compose -f compose.yml logs tenant-alpha || true
|
||||
echo "=== tenant-beta logs ==="
|
||||
docker compose -f compose.yml logs tenant-beta || true
|
||||
echo "=== cp-stub logs ==="
|
||||
docker compose -f compose.yml logs cp-stub || true
|
||||
echo "=== cf-proxy logs ==="
|
||||
docker compose -f compose.yml logs cf-proxy || true
|
||||
echo "=== postgres-alpha logs (last 100) ==="
|
||||
docker compose -f compose.yml logs --tail 100 postgres-alpha || true
|
||||
echo "=== postgres-beta logs (last 100) ==="
|
||||
docker compose -f compose.yml logs --tail 100 postgres-beta || true
|
||||
|
||||
- name: Force teardown
|
||||
# We pass KEEP_UP=1 to run-all-replays.sh so the dump step
|
||||
# above sees real containers — that means we own teardown
|
||||
# explicitly here. Always run.
|
||||
if: always() && needs.detect-changes.outputs.run == 'true'
|
||||
working-directory: tests/harness
|
||||
run: ./down.sh || true
|
||||
@@ -1,94 +0,0 @@
|
||||
name: Lint curl status-code capture
|
||||
|
||||
# Pins the workflow-bash anti-pattern that produced "HTTP 000000" on the
|
||||
# 2026-05-04 redeploy-tenants-on-main run for sha 2b862f6:
|
||||
#
|
||||
# HTTP_CODE=$(curl ... -w '%{http_code}' ... || echo "000")
|
||||
#
|
||||
# When curl exits non-zero (connection reset → 56, --fail-with-body 4xx/5xx
|
||||
# → 22), the `-w '%{http_code}'` already wrote a status to stdout — usually
|
||||
# "000" for connection failures or the actual code for HTTP errors. The
|
||||
# `|| echo "000"` then fires AND appends ANOTHER "000" to the captured
|
||||
# stdout, producing values like "000000" or "409000" that fail string
|
||||
# comparisons against "200" while looking superficially right.
|
||||
#
|
||||
# Same class of bug the synth-E2E §7c gate hit twice (PRs #2779/#2783 +
|
||||
# #2797). Memory: feedback_curl_status_capture_pollution.md.
|
||||
#
|
||||
# Fix shape (route -w into a tempfile so curl's exit code can't pollute):
|
||||
#
|
||||
# set +e
|
||||
# curl ... -w '%{http_code}' >code.txt 2>/dev/null
|
||||
# set -e
|
||||
# HTTP_CODE=$(cat code.txt 2>/dev/null)
|
||||
# [ -z "$HTTP_CODE" ] && HTTP_CODE="000"
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths: ['.github/workflows/**']
|
||||
push:
|
||||
branches: [main, staging]
|
||||
paths: ['.github/workflows/**']
|
||||
merge_group:
|
||||
types: [checks_requested]
|
||||
|
||||
jobs:
|
||||
scan:
|
||||
name: Scan workflows for curl status-capture pollution
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- name: Find curl ... -w '%{http_code}' ... || echo "000" subshells
|
||||
run: |
|
||||
set -uo pipefail
|
||||
# Multi-line aware: look for `$(curl ... -w '%{http_code}' ... || echo "000")`
|
||||
# subshell where the entire command-substitution wraps a curl that
|
||||
# ends with `|| echo "000"`. Must distinguish from the SAFE shape
|
||||
# `$(cat tempfile 2>/dev/null || echo "000")` — `cat` with a missing
|
||||
# tempfile produces empty stdout, no pollution.
|
||||
python3 <<'PY'
|
||||
import os, re, sys, glob
|
||||
|
||||
BAD_FILES = []
|
||||
|
||||
# Match the buggy substitution across newlines: $(curl ... -w '%{http_code}' ... || echo "000")
|
||||
# The `\\n` is the bash line-continuation that lets curl flags span lines.
|
||||
# We collapse continuation lines first, then look for the single-line bad pattern.
|
||||
PATTERN = re.compile(
|
||||
r'\$\(\s*curl\b[^)]*-w\s*[\'"]%\{http_code\}[\'"][^)]*\|\|\s*echo\s+"000"\s*\)',
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
# Self-skip: this lint workflow contains the literal anti-pattern in
|
||||
# its own docstring — that's intentional, not a bug.
|
||||
SELF = ".github/workflows/lint-curl-status-capture.yml"
|
||||
|
||||
for f in sorted(glob.glob(".github/workflows/*.yml")):
|
||||
if f == SELF:
|
||||
continue
|
||||
with open(f) as fh:
|
||||
content = fh.read()
|
||||
# Collapse bash line-continuations (\\\n + leading whitespace)
|
||||
# into a single logical line so the regex can see the full
|
||||
# curl invocation as one chunk.
|
||||
flat = re.sub(r'\\\s*\n\s*', ' ', content)
|
||||
for m in PATTERN.finditer(flat):
|
||||
BAD_FILES.append((f, m.group(0)[:120]))
|
||||
|
||||
if not BAD_FILES:
|
||||
print("✓ No curl-status-capture pollution patterns detected")
|
||||
sys.exit(0)
|
||||
|
||||
print(f"::error::Found {len(BAD_FILES)} curl-status-capture pollution site(s):")
|
||||
for f, snippet in BAD_FILES:
|
||||
print(f"::error file={f}::Curl status-capture pollution: '|| echo \"000\"' inside a $(curl ... -w '%{{http_code}}' ...) subshell. On non-2xx or connection failure, curl's -w writes a status, then exits non-zero, then the || echo appends another '000' — producing 'HTTP 000000' or '409000' that fails comparisons silently. Fix: route -w into a tempfile so the exit code can't pollute stdout. See memory feedback_curl_status_capture_pollution.md.")
|
||||
print(f" matched: {snippet}…")
|
||||
print()
|
||||
print("Fix template:")
|
||||
print(' set +e')
|
||||
print(' curl ... -w \'%{http_code}\' >code.txt 2>/dev/null')
|
||||
print(' set -e')
|
||||
print(' HTTP_CODE=$(cat code.txt 2>/dev/null)')
|
||||
print(' [ -z "$HTTP_CODE" ] && HTTP_CODE="000"')
|
||||
sys.exit(1)
|
||||
PY
|
||||
@@ -1,121 +0,0 @@
|
||||
name: publish-canvas-image
|
||||
|
||||
# Builds and pushes the canvas Docker image to GHCR whenever a commit lands
|
||||
# on main that touches canvas code. Previously canvas changes were visible in
|
||||
# CI (npm run build passed) but the live container was never updated —
|
||||
# operators had to manually run `docker compose build canvas` each time.
|
||||
#
|
||||
# Mirror of publish-platform-image.yml, adapted for the Next.js canvas layer.
|
||||
# See that workflow for inline notes on macOS Keychain isolation and QEMU.
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
# Only rebuild when canvas source changes — saves GHA minutes on
|
||||
# platform-only / docs-only / MCP-only merges.
|
||||
- 'canvas/**'
|
||||
- '.github/workflows/publish-canvas-image.yml'
|
||||
# Manual trigger: use after a non-canvas merge that still needs a fresh
|
||||
# image (e.g. a Dockerfile change lives outside the canvas/ tree).
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
platform_url:
|
||||
description: 'NEXT_PUBLIC_PLATFORM_URL baked into the bundle (default: http://localhost:8080)'
|
||||
required: false
|
||||
default: ''
|
||||
ws_url:
|
||||
description: 'NEXT_PUBLIC_WS_URL baked into the bundle (default: ws://localhost:8080/ws)'
|
||||
required: false
|
||||
default: ''
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write # required to push to ghcr.io/${{ github.repository_owner }}/*
|
||||
|
||||
env:
|
||||
IMAGE_NAME: ghcr.io/molecule-ai/canvas
|
||||
|
||||
jobs:
|
||||
build-and-push:
|
||||
name: Build & push canvas image
|
||||
runs-on: publish
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Log in to GHCR
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
|
||||
|
||||
# Health check: verify Docker daemon is accessible before attempting any
|
||||
# build steps. This fails loudly up front when the runner's docker.sock
|
||||
# is inaccessible rather than silently continuing to the build step
|
||||
# where docker build fails deep in ECR auth with a cryptic error.
|
||||
- name: Verify Docker daemon access
|
||||
run: |
|
||||
set -euo pipefail
echo "::group::Docker daemon health check"
# Capture the output rather than piping `docker info` straight into `head`:
# with `pipefail` set, `head` closing the pipe after five lines can end
# `docker info` with SIGPIPE, which would make a healthy daemon look
# inaccessible. `if` also keeps `set -e` from aborting before we can report.
if docker_info="$(docker info 2>&1)"; then
  # Healthy daemon: the first few lines are enough for the log.
  head -n 5 <<< "$docker_info"
else
  # Print the complete output on failure so the daemon's own error message
  # is not truncated, and close the log group before annotating.
  printf '%s\n' "$docker_info"
  echo "::endgroup::"
  echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
  echo "::error::Check: (1) daemon running, (2) runner user in docker group, (3) sock perms 660+"
  exit 1
fi
echo "Docker daemon OK"
echo "::endgroup::"
|
||||
|
||||
- name: Compute tags
|
||||
id: tags
|
||||
shell: bash
|
||||
run: |
|
||||
echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Resolve build args
|
||||
id: build_args
|
||||
# Priority: workflow_dispatch input > repo secret > hardcoded default.
|
||||
# NEXT_PUBLIC_* env vars are baked into the JS bundle at build time by
|
||||
# Next.js — they cannot be changed at runtime without a full rebuild.
|
||||
# For local docker-compose deployments the defaults (localhost:8080)
|
||||
# work as-is; production deployments should set CANVAS_PLATFORM_URL
|
||||
# and CANVAS_WS_URL as repository secrets.
|
||||
#
|
||||
# Inputs are passed via env vars (not direct ${{ }} interpolation) to
|
||||
# prevent shell injection from workflow_dispatch string inputs.
|
||||
shell: bash
|
||||
env:
|
||||
INPUT_PLATFORM_URL: ${{ github.event.inputs.platform_url }}
|
||||
SECRET_PLATFORM_URL: ${{ secrets.CANVAS_PLATFORM_URL }}
|
||||
INPUT_WS_URL: ${{ github.event.inputs.ws_url }}
|
||||
SECRET_WS_URL: ${{ secrets.CANVAS_WS_URL }}
|
||||
run: |
|
||||
PLATFORM_URL="${INPUT_PLATFORM_URL:-${SECRET_PLATFORM_URL:-http://localhost:8080}}"
|
||||
WS_URL="${INPUT_WS_URL:-${SECRET_WS_URL:-ws://localhost:8080/ws}}"
|
||||
|
||||
echo "platform_url=${PLATFORM_URL}" >> "$GITHUB_OUTPUT"
|
||||
echo "ws_url=${WS_URL}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Build & push canvas image to GHCR
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
|
||||
with:
|
||||
context: ./canvas
|
||||
file: ./canvas/Dockerfile
|
||||
platforms: linux/amd64
|
||||
push: true
|
||||
build-args: |
|
||||
NEXT_PUBLIC_PLATFORM_URL=${{ steps.build_args.outputs.platform_url }}
|
||||
NEXT_PUBLIC_WS_URL=${{ steps.build_args.outputs.ws_url }}
|
||||
tags: |
|
||||
${{ env.IMAGE_NAME }}:latest
|
||||
${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
labels: |
|
||||
org.opencontainers.image.source=https://github.com/${{ github.repository }}
|
||||
org.opencontainers.image.revision=${{ github.sha }}
|
||||
org.opencontainers.image.description=Molecule AI canvas (Next.js 15 + React Flow)
|
||||
@@ -1,207 +0,0 @@
|
||||
name: Railway pin audit (drift detection)
|
||||
|
||||
# Daily audit of Railway env vars for drift-prone image-tag pins —
|
||||
# automation-cadence layer over the detection script + regression test
|
||||
# shipped in PR #2168 (#2001 closure).
|
||||
#
|
||||
# Background: on 2026-04-24 a stale `:staging-a14cf86` SHA pin in CP's
|
||||
# TENANT_IMAGE caused 3+ hours of E2E failure with the appearance that
|
||||
# "every fix didn't propagate" — really the tenant image was so old it
|
||||
# didn't read the env vars those fixes produced. The audit script
|
||||
# (scripts/ops/audit-railway-sha-pins.sh) flags drift; this workflow
|
||||
# runs the same check unattended on a daily cron.
|
||||
#
|
||||
# Cadence: once a day, 13:00 UTC (06:00 PT). Daily is the right
|
||||
# cadence for variables-tier config — Railway env var changes are
|
||||
# deliberate operator actions, low-frequency. Hourly would risk
|
||||
# Railway API rate-limit surprises and is overkill for the change rate.
|
||||
#
|
||||
# Issue-on-failure: drift triggers a priority-high issue, mirroring
|
||||
# .github/workflows/e2e-staging-sanity.yml's pattern. Drift is
|
||||
# a "config slipped, fix at next ops window" problem for the ops queue, not
|
||||
# active-outage paging.
|
||||
#
|
||||
# Secret hardening: per feedback_schedule_vs_dispatch_secrets_hardening,
|
||||
# the schedule trigger HARD-FAILS on missing RAILWAY_AUDIT_TOKEN
|
||||
# (silent-success on schedule was the failure-mode class that bit the
|
||||
# team before; cron firing without checking anything is worse than no
|
||||
# cron). The workflow_dispatch trigger SOFT-SKIPS on missing secret so
|
||||
# an operator can dry-run the workflow shape during initial provisioning
|
||||
# without tripping a fake red.
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 13 * * *'
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: railway-pin-audit
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
issues: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
audit:
|
||||
name: Audit Railway env vars for drift-prone pins
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 10
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify RAILWAY_AUDIT_TOKEN present
|
||||
# Schedule trigger: hard-fail when the secret is missing —
|
||||
# otherwise the cron silently runs against the wrong scope (or
|
||||
# exits 2 from the script and we issue-spam) without anyone
|
||||
# noticing the token rot.
|
||||
# Dispatch trigger: soft-skip — operator may be dry-running the
|
||||
# workflow shape before provisioning the secret. Logged as a
|
||||
# workflow notice, not a failure.
|
||||
env:
|
||||
RAILWAY_AUDIT_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
id: secret_check
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [ -n "${RAILWAY_AUDIT_TOKEN:-}" ]; then
|
||||
echo "have_secret=true" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
echo "have_secret=false" >> "$GITHUB_OUTPUT"
|
||||
if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
|
||||
echo "::notice::RAILWAY_AUDIT_TOKEN not configured — soft-skipping (manual dispatch)"
|
||||
exit 0
|
||||
fi
|
||||
echo "::error::RAILWAY_AUDIT_TOKEN secret missing — schedule trigger requires it. Provision the token (read-only \`variables\` scope on the molecule-platform Railway project) and store as repo secret RAILWAY_AUDIT_TOKEN."
|
||||
exit 1
|
||||
|
||||
- name: Install Railway CLI
|
||||
if: steps.secret_check.outputs.have_secret == 'true'
|
||||
# NOTE: install.sh is fetched unpinned (no hash or version check); revisit in
|
||||
# tandem with the audit-script's documented Railway CLI version.
|
||||
run: |
|
||||
set -euo pipefail
|
||||
curl -fsSL https://railway.com/install.sh | sh
|
||||
# The installer drops the binary in ~/.railway/bin
|
||||
echo "$HOME/.railway/bin" >> "$GITHUB_PATH"
|
||||
|
||||
- name: Verify Railway CLI authenticated
|
||||
if: steps.secret_check.outputs.have_secret == 'true'
|
||||
env:
|
||||
RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# `railway whoami` exits non-zero when the token is
|
||||
# unauthenticated or doesn't have any project access.
|
||||
if ! railway whoami >/dev/null 2>&1; then
|
||||
echo "::error::Railway CLI failed to authenticate with RAILWAY_AUDIT_TOKEN — token may be revoked or scoped incorrectly"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
- name: Link molecule-platform project
|
||||
if: steps.secret_check.outputs.have_secret == 'true'
|
||||
env:
|
||||
RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
|
||||
# Project ID from reference_production_stack: molecule-platform
|
||||
# / 7ccc8c68-61f4-42ab-9be5-586eeee11768. Linking is per-process,
|
||||
# so we re-link in this CI shell (the audit script comment says
|
||||
# it deliberately doesn't chdir for you because the linked
|
||||
# project's identity matters).
|
||||
run: |
|
||||
set -euo pipefail
|
||||
railway link --project 7ccc8c68-61f4-42ab-9be5-586eeee11768
|
||||
|
||||
- name: Run drift audit
|
||||
if: steps.secret_check.outputs.have_secret == 'true'
|
||||
id: audit
|
||||
env:
|
||||
RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
|
||||
run: |
|
||||
# Errors are handled explicitly below: the audit's exit code is data here,
# not a reason to abort the step early.
set +e
bash scripts/ops/audit-railway-sha-pins.sh 2>&1 | tee /tmp/audit.log
# PIPESTATUS must be read immediately after the pipeline; index 0 is the
# audit script's own exit code (tee's status is irrelevant).
rc=${PIPESTATUS[0]}
echo "rc=$rc" >> "$GITHUB_OUTPUT"
# Capture the audit log for the issue body.
# Use a per-run random delimiter: with a fixed one, a log line equal to the
# delimiter would end the multiline value early and let the remaining log
# lines be parsed as step outputs (e.g. overriding `rc`).
delim="AUDIT_EOF_$(od -An -N16 -tx1 /dev/urandom | tr -d ' \n')"
{
  echo "log<<${delim}"
  cat /tmp/audit.log
  echo "${delim}"
} >> "$GITHUB_OUTPUT"
# Exit codes from the script:
#   0 — no drift; workflow goes green
#   1 — drift detected; we'll file an issue and fail the run
#   2 — railway CLI unauthenticated / project unlinked; fail
# Anything else: also fail.
case "$rc" in
  0) exit 0 ;;
  1) echo "::warning::Drift-prone pin(s) detected — issue will be filed"; exit 1 ;;
  2) echo "::error::Railway CLI auth/link failed mid-script — token or project ID drift"; exit 2 ;;
  *) echo "::error::Unexpected audit rc=$rc"; exit 1 ;;
esac
|
||||
|
||||
- name: Open / update drift issue
|
||||
if: failure() && steps.audit.outputs.rc == '1'
|
||||
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
|
||||
env:
|
||||
AUDIT_LOG: ${{ steps.audit.outputs.log }}
|
||||
with:
|
||||
script: |
|
||||
const title = "🚨 Railway env-var drift detected";
|
||||
const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
|
||||
const body =
|
||||
`Daily Railway pin audit found drift-prone image-tag pins in the molecule-platform Railway project.\n\n` +
|
||||
`**What this means:** an env var (likely on \`controlplane\`) is pinned to a SHA-shaped or semver tag instead of a floating tag. ` +
|
||||
`Same pattern that caused the 2026-04-24 TENANT_IMAGE incident — fix-PRs land but the running service doesn't pick them up.\n\n` +
|
||||
`**Recovery:** open the Railway dashboard, replace the flagged value with a floating tag (\`:staging-latest\`, \`:main\`) unless the pin is intentional and documented in the ops runbook.\n\n` +
|
||||
`**Audit output:**\n\n\`\`\`\n${process.env.AUDIT_LOG || '(log unavailable)'}\n\`\`\`\n\n` +
|
||||
`Run: ${runURL}\n\n` +
|
||||
`Closes automatically when a subsequent daily run reports clean.`;
|
||||
|
||||
const { data: existing } = await github.rest.issues.listForRepo({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
state: 'open', labels: 'railway-drift',
|
||||
});
|
||||
const match = existing.find(i => i.title === title);
|
||||
if (match) {
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
issue_number: match.number,
|
||||
body: `Still drifting. ${runURL}\n\n\`\`\`\n${process.env.AUDIT_LOG || '(log unavailable)'}\n\`\`\``,
|
||||
});
|
||||
} else {
|
||||
await github.rest.issues.create({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
title, body,
|
||||
labels: ['railway-drift', 'bug', 'priority-high'],
|
||||
});
|
||||
}
|
||||
|
||||
- name: Close stale drift issue on clean run
|
||||
# When a previously-flagged drift gets fixed by an operator,
|
||||
# the next daily run goes green. Close any open `railway-drift`
|
||||
# issue with a confirmation comment so the queue doesn't carry
|
||||
# stale ones.
|
||||
if: success() && steps.audit.outputs.rc == '0'
|
||||
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
|
||||
with:
|
||||
script: |
|
||||
const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
|
||||
const { data: existing } = await github.rest.issues.listForRepo({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
state: 'open', labels: 'railway-drift',
|
||||
});
|
||||
for (const issue of existing) {
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
issue_number: issue.number,
|
||||
body: `Daily audit clean — drift resolved. ${runURL}`,
|
||||
});
|
||||
await github.rest.issues.update({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
issue_number: issue.number,
|
||||
state: 'closed',
|
||||
state_reason: 'completed',
|
||||
});
|
||||
}
|
||||
@@ -1,91 +0,0 @@
|
||||
name: Runtime Pin Compatibility
|
||||
|
||||
# CI gate that prevents the 5-hour staging outage from 2026-04-24 from
|
||||
# recurring (controlplane#253). The original failure mode:
|
||||
# 1. molecule-ai-workspace-runtime 0.1.13 declared `a2a-sdk<1.0` in its
|
||||
# requires_dist metadata (incorrect — it actually imports
|
||||
# a2a.server.routes which only exists in a2a-sdk 1.0+)
|
||||
# 2. `pip install molecule-ai-workspace-runtime` resolved cleanly
|
||||
# 3. `from molecule_runtime.main import main_sync` raised ImportError
|
||||
# 4. Every tenant workspace crashed; the canary tenant caught it but
|
||||
# only after 5 hours of degraded staging
|
||||
#
|
||||
# This workflow installs the CURRENTLY PUBLISHED runtime from PyPI on
|
||||
# top of `workspace/requirements.txt` and smoke-imports. Catches:
|
||||
# - Upstream PyPI yanks
|
||||
# - Bad re-releases of molecule-ai-workspace-runtime
|
||||
# - Already-shipped wheels that stop importing because a transitive
|
||||
# dep moved underneath
|
||||
#
|
||||
# This is the "PyPI artifact health" half of pin compatibility. The
|
||||
# companion workflow `runtime-prbuild-compat.yml` covers the
|
||||
# "PR-introduced breakage" half by building the wheel from THIS PR's
|
||||
# workspace/ source. Splitting the two means each gets a narrow
|
||||
# `paths:` filter — the pypi-latest job no longer fires on doc-only
|
||||
# workspace/ edits whose content can't change what's currently on PyPI.
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main, staging]
|
||||
paths:
|
||||
# Narrow filter: pypi-latest is sensitive only to changes that
|
||||
# affect what we're INSTALLING (requirements.txt) or WHAT THE
|
||||
# CHECK ITSELF DOES (this workflow file). Edits to workspace/
|
||||
# source code don't change what's on PyPI right now, so they
|
||||
# don't change this gate's verdict.
|
||||
- 'workspace/requirements.txt'
|
||||
- '.github/workflows/runtime-pin-compat.yml'
|
||||
pull_request:
|
||||
branches: [main, staging]
|
||||
paths:
|
||||
- 'workspace/requirements.txt'
|
||||
- '.github/workflows/runtime-pin-compat.yml'
|
||||
# Daily catch for upstream PyPI publishes that break the pin combo
|
||||
# without any change in our repo (e.g. someone re-yanks an a2a-sdk
|
||||
# release or molecule-ai-workspace-runtime publishes a bad bump).
|
||||
schedule:
|
||||
- cron: '0 13 * * *' # 06:00 PT
|
||||
workflow_dispatch:
|
||||
# Required-check support: when this becomes a branch-protection gate,
|
||||
# merge_group runs let the queue green-check this in addition to PRs.
|
||||
merge_group:
|
||||
types: [checks_requested]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
pypi-latest-install:
|
||||
name: PyPI-latest install + import smoke
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
||||
with:
|
||||
python-version: '3.11'
|
||||
cache: pip
|
||||
cache-dependency-path: workspace/requirements.txt
|
||||
- name: Install runtime + workspace requirements
|
||||
# Install order is load-bearing: install the runtime FIRST so pip
|
||||
# honors whatever a2a-sdk constraint the runtime metadata declares
|
||||
# (this is the surface that broke in 2026-04-24 — runtime declared
|
||||
# `a2a-sdk<1.0` but actually needed >=1.0). The follow-up install
|
||||
# of workspace/requirements.txt then upgrades a2a-sdk to the
|
||||
# constraint our runtime image actually pins. The import smoke
|
||||
# below verifies the upgraded combination is consistent.
|
||||
run: |
|
||||
python -m venv /tmp/venv
|
||||
/tmp/venv/bin/pip install --upgrade pip
|
||||
/tmp/venv/bin/pip install molecule-ai-workspace-runtime
|
||||
/tmp/venv/bin/pip install -r workspace/requirements.txt
|
||||
/tmp/venv/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
|
||||
| grep -E '^(Name|Version):'
|
||||
- name: Smoke import — fail if metadata declares deps that don't satisfy real imports
|
||||
# WORKSPACE_ID is validated at import time by platform_auth.py — EC2
|
||||
# user-data sets it from the cloud-init template; set a placeholder
|
||||
# here so the import smoke doesn't trip on the env-var guard.
|
||||
env:
|
||||
WORKSPACE_ID: 00000000-0000-0000-0000-000000000001
|
||||
run: |
|
||||
/tmp/venv/bin/python -c "from molecule_runtime.main import main_sync; print('runtime imports OK')"
|
||||
@@ -1,152 +0,0 @@
|
||||
name: Runtime PR-Built Compatibility
|
||||
|
||||
# Companion to `runtime-pin-compat.yml`. That workflow tests what's
|
||||
# CURRENTLY PUBLISHED on PyPI; this workflow tests what WOULD BE
|
||||
# PUBLISHED if THIS PR merges.
|
||||
#
|
||||
# Why two workflows: the chicken-and-egg #128 fix added a "PR-built
|
||||
# wheel" job to the original runtime-pin-compat.yml, but both jobs
|
||||
# shared a `paths:` filter that was the union of their needs
|
||||
# (`workspace/**`). That meant the PyPI-latest job ran on every doc
|
||||
# edit even though the upstream PyPI artifact can't change with our
|
||||
# workspace/ source. Splitting the two means each gets a narrow
|
||||
# `paths:` filter that matches the inputs it actually depends on.
|
||||
#
|
||||
# Catches the failure mode where a PR adds an import requiring a newer
|
||||
# SDK than `workspace/requirements.txt` pins:
|
||||
# 1. Pip resolves the existing PyPI wheel + the old SDK pin → smoke
|
||||
# passes (it imports the OLD main.py from the wheel, not the PR's
|
||||
# new main.py).
|
||||
# 2. Merge → publish-runtime.yml ships a wheel WITH the new import.
|
||||
# 3. Tenant images redeploy → all crash on first boot with
|
||||
# ImportError.
|
||||
#
|
||||
# By building from the PR's source and smoke-importing THAT wheel, we
|
||||
# fail at PR-time instead of after publish.
|
||||
#
|
||||
# Required-check shape (2026-05-01): the workflow runs on EVERY push +
|
||||
# PR + merge_group event with no top-level `paths:` filter, then uses a
|
||||
# detect-changes job + per-step `if:` gates inside ONE always-running
|
||||
# job named `PR-built wheel + import smoke`. PRs that don't touch
|
||||
# wheel-relevant paths get a no-op SUCCESS check run, satisfying branch
|
||||
# protection without re-running the heavy build. Same pattern as
|
||||
# e2e-api.yml — see its comment for the full rationale + the 2026-04-29
|
||||
# PR #2264 incident that motivated the always-run-with-if-gates shape.
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main, staging]
|
||||
pull_request:
|
||||
branches: [main, staging]
|
||||
workflow_dispatch:
|
||||
merge_group:
|
||||
types: [checks_requested]
|
||||
|
||||
concurrency:
|
||||
# Include event_name so a PR sync (event=pull_request) and the
|
||||
# subsequent staging push (event=push) on the SAME merge SHA don't
|
||||
# collide in one group. Without event_name, both runs hashed to
|
||||
# the same key and cancel-in-progress=true cancelled whichever
|
||||
# arrived second — usually the push run, which staging branch-
|
||||
# protection then sees as a CANCELLED required check and refuses
|
||||
# to mark merged. Caught 2026-05-05 across PR #2869's runs (run
|
||||
# ids 25371863455 / 25371811486 / 25371078157 / 25370403142 — every
|
||||
# staging push run cancelled, every matching PR run green).
|
||||
#
|
||||
# Per memory `feedback_concurrency_group_per_sha.md` — same drift
|
||||
# class that broke auto-promote-staging on 2026-04-28. Pin invariant:
|
||||
# event_name + sha is the minimum unique key for these workflows.
|
||||
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
detect-changes:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
wheel: ${{ steps.decide.outputs.wheel }}
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
|
||||
id: filter
|
||||
with:
|
||||
filters: |
|
||||
wheel:
|
||||
- 'workspace/**'
|
||||
- 'scripts/build_runtime_package.py'
|
||||
- 'scripts/wheel_smoke.py'
|
||||
- '.github/workflows/runtime-prbuild-compat.yml'
|
||||
- id: decide
|
||||
# Always run real work for manual dispatch + merge_group — no
|
||||
# diff-against-base in those contexts, and the gate exists to
|
||||
# validate the to-be-merged state regardless of which paths it
|
||||
# touched (paths-filter would default to "no changes" which is
|
||||
# the wrong answer when the queue is composing many PRs).
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "${{ github.event_name }}" = "merge_group" ]; then
|
||||
echo "wheel=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "wheel=${{ steps.filter.outputs.wheel }}" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
# ONE job (no job-level `if:`) that always runs and reports under the
|
||||
# required-check name `PR-built wheel + import smoke`. Real work is
|
||||
# gated per-step on `needs.detect-changes.outputs.wheel`. Same shape
|
||||
# as e2e-api.yml's e2e-api job — see its comment block for the full
|
||||
# rationale (SKIPPED check runs block branch protection even with
|
||||
# SUCCESS siblings; collapsing to one always-run job emits exactly
|
||||
# one SUCCESS check run).
|
||||
local-build-install:
|
||||
needs: detect-changes
|
||||
name: PR-built wheel + import smoke
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: No-op pass (paths filter excluded this commit)
|
||||
if: needs.detect-changes.outputs.wheel != 'true'
|
||||
run: |
|
||||
echo "No workspace/ / scripts/{build_runtime_package,wheel_smoke}.py / workflow changes — wheel gate satisfied without rebuilding."
|
||||
echo "::notice::PR-built wheel + import smoke no-op pass (paths filter excluded this commit)."
|
||||
- if: needs.detect-changes.outputs.wheel == 'true'
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- if: needs.detect-changes.outputs.wheel == 'true'
|
||||
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
||||
with:
|
||||
python-version: '3.11'
|
||||
cache: pip
|
||||
cache-dependency-path: workspace/requirements.txt
|
||||
- name: Install build tooling
|
||||
if: needs.detect-changes.outputs.wheel == 'true'
|
||||
run: pip install build
|
||||
- name: Build wheel from PR source (mirrors publish-runtime.yml)
|
||||
if: needs.detect-changes.outputs.wheel == 'true'
|
||||
# Use a fixed test version so the wheel filename is predictable.
|
||||
# Doesn't reach PyPI — this build is local-only for the smoke.
|
||||
# Use the SAME build script with the SAME args as
|
||||
# publish-runtime.yml's build step. The temp dir path differs
|
||||
# (`/tmp/runtime-build` here vs `${{ runner.temp }}/runtime-build`
|
||||
# in publish-runtime.yml — they coincide on ubuntu-latest but
|
||||
# the call sites are not byte-identical). The smoke step below
|
||||
# runs scripts/wheel_smoke.py, which its own comment describes as
|
||||
# the same script publish-runtime.yml runs, so this gate is no
|
||||
# longer narrower than publish's pre-PyPI smoke.
|
||||
run: |
|
||||
python scripts/build_runtime_package.py \
|
||||
--version "0.0.0.dev0+pin-compat" \
|
||||
--out /tmp/runtime-build
|
||||
cd /tmp/runtime-build && python -m build
|
||||
- name: Install built wheel + workspace requirements
|
||||
if: needs.detect-changes.outputs.wheel == 'true'
|
||||
run: |
|
||||
python -m venv /tmp/venv-built
|
||||
/tmp/venv-built/bin/pip install --upgrade pip
|
||||
/tmp/venv-built/bin/pip install /tmp/runtime-build/dist/*.whl
|
||||
/tmp/venv-built/bin/pip install -r workspace/requirements.txt
|
||||
/tmp/venv-built/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
|
||||
| grep -E '^(Name|Version):'
|
||||
- name: Smoke import the PR-built wheel
|
||||
if: needs.detect-changes.outputs.wheel == 'true'
|
||||
# Same script publish-runtime.yml runs against the to-be-PyPI wheel.
|
||||
# Closes the PR-time vs publish-time gap: a PR adding a new SDK
|
||||
# call-shape no longer passes here (narrow `import main_sync`) only
|
||||
# to fail post-merge in publish-runtime's broader smoke.
|
||||
run: |
|
||||
/tmp/venv-built/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
|
||||
@@ -1,58 +0,0 @@
|
||||
name: SECRET_PATTERNS drift lint
|
||||
|
||||
# Detects when the canonical SECRET_PATTERNS array in
|
||||
# .github/workflows/secret-scan.yml diverges from known consumer
|
||||
# mirrors (workspace-runtime's bundled pre-commit hook today; more
|
||||
# can be added as the consumer set grows).
|
||||
#
|
||||
# Why this exists: every side that scans for credentials has its own
|
||||
# copy of the pattern list. They drift — most recently the runtime
|
||||
# hook lagged the canonical by one pattern (sk-cp- / MiniMax F1088),
|
||||
# so a developer's local pre-commit would let a sk-cp- token through
|
||||
# while the org-wide CI scan would refuse it. The cost of that drift
|
||||
# is dev confusion + delayed feedback; the fix is automated detection.
|
||||
#
|
||||
# Triggers:
|
||||
# - schedule: daily 05:00 UTC. Catches drift introduced by edits
|
||||
# to a consumer copy that didn't update canonical here.
|
||||
# - push to main/staging where the canonical or this lint changed:
|
||||
# catches the inverse — canonical updated but consumers not yet
|
||||
# bumped. The lint will fail the push; that's intentional, the
|
||||
# person editing canonical is the right person to also update
|
||||
# the consumer.
|
||||
# - workflow_dispatch: ad-hoc operator runs.
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# 05:00 UTC = 22:00 PT / 01:00 ET. Quiet hours so a failure
|
||||
# email lands when humans are starting their day, not
|
||||
# interrupting it.
|
||||
- cron: "0 5 * * *"
|
||||
push:
|
||||
branches: [main, staging]
|
||||
paths:
|
||||
- ".github/workflows/secret-scan.yml"
|
||||
- ".github/workflows/secret-pattern-drift.yml"
|
||||
- ".github/scripts/lint_secret_pattern_drift.py"
|
||||
- ".githooks/pre-commit"
|
||||
workflow_dispatch:
|
||||
|
||||
# GITHUB_TOKEN scoped to read-only. The lint only does git checkout
|
||||
# + HTTPS GETs to public consumer files; no writes to anything.
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
name: Detect SECRET_PATTERNS drift
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 5
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
||||
with:
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Run drift lint
|
||||
run: python3 .github/scripts/lint_secret_pattern_drift.py
|
||||
@@ -1,129 +0,0 @@
|
||||
name: Sweep stale AWS Secrets Manager secrets
|
||||
|
||||
# Janitor for per-tenant AWS Secrets Manager secrets
|
||||
# (`molecule/tenant/<org_id>/bootstrap`) whose backing tenant no
|
||||
# longer exists. Parallel-shape to sweep-cf-tunnels.yml and
|
||||
# sweep-cf-orphans.yml — different cloud, same justification.
|
||||
#
|
||||
# Why this exists separately from a long-term reconciler integration:
|
||||
# - molecule-controlplane's tenant_resources audit table (mig 024)
|
||||
# currently tracks four resource kinds: CloudflareTunnel,
|
||||
# CloudflareDNS, EC2Instance, SecurityGroup. SecretsManager is
|
||||
# not in the list, so the existing reconciler doesn't catch
|
||||
# orphan secrets.
|
||||
# - At ~$0.40/secret/month the cost grew to ~$19/month before this
|
||||
# sweeper was written, indicating ~45+ orphan secrets from
|
||||
# crashed provisions and incomplete deprovision flows.
|
||||
# - The proper fix (KindSecretsManagerSecret + recorder hook +
|
||||
# reconciler enumerator) is filed as a separate controlplane
|
||||
# issue. This sweeper is the immediate cost-relief stopgap.
|
||||
#
|
||||
# IAM principal: AWS_JANITOR_ACCESS_KEY_ID / AWS_JANITOR_SECRET_ACCESS_KEY.
|
||||
# This is a DEDICATED principal — the production `molecule-cp` IAM
|
||||
# user lacks `secretsmanager:ListSecrets` (it only has
|
||||
# Get/Create/Update/Delete on specific resources, scoped to its
|
||||
# operational needs). The janitor needs ListSecrets across the
|
||||
# `molecule/tenant/*` prefix, which warrants a separate principal so
|
||||
# we don't broaden the prod-CP policy.
|
||||
#
|
||||
# Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring
|
||||
# sweep-cf-orphans.yml — tenant secrets are durable by design, unlike
|
||||
# the mostly-orphan tunnels) refuses to nuke past the threshold.
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Hourly at :30 — offsets from sweep-cf-orphans (:15) and
|
||||
# sweep-cf-tunnels (:45) so the three janitors don't burst the
|
||||
# CP admin endpoints at the same minute.
|
||||
- cron: '30 * * * *'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry run only — list what would be deleted, no deletion"
|
||||
required: false
|
||||
type: boolean
|
||||
default: true
|
||||
max_delete_pct:
|
||||
description: "Override safety gate (default 50, set higher only for major cleanup)"
|
||||
required: false
|
||||
default: "50"
|
||||
grace_hours:
|
||||
description: "Skip secrets created within this many hours (default 24)"
|
||||
required: false
|
||||
default: "24"
|
||||
|
||||
# Don't let two sweeps race the same AWS account.
|
||||
concurrency:
|
||||
group: sweep-aws-secrets
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
sweep:
|
||||
name: Sweep AWS Secrets Manager
|
||||
runs-on: ubuntu-latest
|
||||
# 30 min cap, mirroring sweep-cf-tunnels. AWS DeleteSecret is
|
||||
# fast (~0.3s/call) so even a 100+ backlog drains in seconds
|
||||
# under the 8-way xargs parallelism, but the cap is set generously
|
||||
# to leave headroom for any actual API hang.
|
||||
timeout-minutes: 30
|
||||
env:
|
||||
AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_JANITOR_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_JANITOR_SECRET_ACCESS_KEY }}
|
||||
CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }}
|
||||
CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }}
|
||||
MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
|
||||
GRACE_HOURS: ${{ github.event.inputs.grace_hours || '24' }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify required secrets present
|
||||
id: verify
|
||||
# Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
|
||||
# and sweep-cf-tunnels (hardened 2026-04-28). Same principle:
|
||||
# - schedule → exit 1 on missing secrets (red CI surfaces it)
|
||||
# - workflow_dispatch → exit 0 with warning (operator-driven,
|
||||
# they already accepted the repo state)
|
||||
run: |
|
||||
missing=()
|
||||
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN; do
|
||||
if [ -z "${!var:-}" ]; then
|
||||
missing+=("$var")
|
||||
fi
|
||||
done
|
||||
if [ ${#missing[@]} -gt 0 ]; then
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
|
||||
echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
|
||||
echo "::warning::AWS_JANITOR_* must belong to a principal with secretsmanager:ListSecrets and secretsmanager:DeleteSecret on molecule/tenant/* (the prod molecule-cp principal lacks ListSecrets)."
|
||||
echo "skip=true" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
|
||||
echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
|
||||
echo "::error::AWS_JANITOR_* must belong to a principal with secretsmanager:ListSecrets and secretsmanager:DeleteSecret on molecule/tenant/*."
|
||||
exit 1
|
||||
fi
|
||||
echo "All required secrets present ✓"
|
||||
echo "skip=false" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Run sweep
|
||||
if: steps.verify.outputs.skip != 'true'
|
||||
# Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-tunnels:
|
||||
# - Scheduled: input empty → "false" → --execute (the whole
|
||||
# point of an hourly janitor).
|
||||
# - Manual workflow_dispatch: input default true → dry-run;
|
||||
# operator must flip it to actually delete.
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
|
||||
echo "Running in dry-run mode — no deletions"
|
||||
bash scripts/ops/sweep-aws-secrets.sh
|
||||
else
|
||||
echo "Running with --execute — will delete identified orphans"
|
||||
bash scripts/ops/sweep-aws-secrets.sh --execute
|
||||
fi
|
||||
@@ -1,146 +0,0 @@
|
||||
name: Sweep stale Cloudflare DNS records
|
||||
|
||||
# Janitor for Cloudflare DNS records whose backing tenant/workspace no
|
||||
# longer exists. Without this loop, every short-lived E2E or canary
|
||||
# leaves a CF record on the moleculesai.app zone — the zone has a
|
||||
# 200-record quota (controlplane#239 hit it 2026-04-23+) and provisions
|
||||
# start failing with code 81045 once exhausted.
|
||||
#
|
||||
# Why a separate workflow vs sweep-stale-e2e-orgs.yml:
|
||||
# - That workflow operates at the CP layer (DELETE /cp/admin/tenants/:slug
|
||||
# drives the cascade). It assumes CP has the org row to drive the
|
||||
# deprovision from. It doesn't catch records left behind when CP
|
||||
# itself never knew about the tenant (canary scratch, manual ops
|
||||
# experiments) or when the cascade's CF-delete branch failed.
|
||||
# - sweep-cf-orphans.sh enumerates the CF zone directly and matches
|
||||
# each record against live CP slugs + AWS EC2 names. It catches
|
||||
# leaks the CP-driven sweep can't.
|
||||
#
|
||||
# Safety: the script's own MAX_DELETE_PCT gate refuses to nuke more
|
||||
# than 50% of records in a single run. If something has gone weird
|
||||
# (CP admin endpoint returns no orgs → every tenant looks orphan) the
|
||||
# gate halts before damage. Decision-function unit tests in
|
||||
# scripts/ops/test_sweep_cf_decide.py (#2027) cover the rule
|
||||
# classifier.
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Hourly. Mirrors sweep-stale-e2e-orgs cadence so the two janitors
|
||||
# converge on the same tick. CF API rate budget is generous (1200
|
||||
# req/5min); a single sweep makes ~1 list + N deletes (N<=quota/2).
|
||||
- cron: '15 * * * *' # offset from sweep-stale-e2e-orgs (top of hour)
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry run only — list what would be deleted, no deletion"
|
||||
required: false
|
||||
type: boolean
|
||||
default: true
|
||||
max_delete_pct:
|
||||
description: "Override safety gate (default 50, set higher only for major cleanup)"
|
||||
required: false
|
||||
default: "50"
|
||||
# No `merge_group:` trigger on purpose. This is a janitor — it doesn't
|
||||
# need to gate merges, and including it as written before #2088 fired
|
||||
# the full sweep job (or its secret-check) on every PR going through
|
||||
# the merge queue, generating one red CI run per merge-queue eval. If
|
||||
# this workflow is ever wired up as a required check, re-add
|
||||
# merge_group: { types: [checks_requested] }
|
||||
# AND gate the sweep step with `if: github.event_name != 'merge_group'`
|
||||
# so merge-queue evals report success without actually running.
|
||||
|
||||
# Don't let two sweeps race the same zone. workflow_dispatch during a
|
||||
# scheduled run would otherwise issue duplicate DELETE calls.
|
||||
concurrency:
|
||||
group: sweep-cf-orphans
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
sweep:
|
||||
name: Sweep CF orphans
|
||||
runs-on: ubuntu-latest
|
||||
# 3 min surfaces hangs (CF API stall, AWS describe-instances stuck)
|
||||
# within one cron interval instead of burning a full tick. Realistic
|
||||
# worst case is ~2 min: 4 sequential curls + 1 aws + N×CF-DELETE
|
||||
# each individually capped at 10s by the script's curl -m flag.
|
||||
timeout-minutes: 3
|
||||
env:
|
||||
CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
|
||||
CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }}
|
||||
CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }}
|
||||
CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify required secrets present
|
||||
id: verify
|
||||
# Schedule-vs-dispatch behaviour split (hardened 2026-04-28
|
||||
# after the silent-no-op incident below):
|
||||
#
|
||||
# The earlier soft-skip-on-schedule policy hid a real leak. All
|
||||
# six secrets were unset on this repo for an unknown duration;
|
||||
# every hourly run printed a yellow ::warning:: and exited 0,
|
||||
# so the workflow registered as "passing" while doing nothing.
|
||||
# CF orphans accumulated to 152/200 (~76% of the zone quota
|
||||
# gone) before a manual `dig`-driven audit caught it. Anything
|
||||
# that runs as a janitor and reports green while idle is
|
||||
# indistinguishable from "the janitor is healthy" — so we now
|
||||
# treat schedule (and any future workflow_run/push triggers)
|
||||
# as a hard-fail when secrets are missing.
|
||||
#
|
||||
# - schedule / workflow_run / push → exit 1 (red CI run
|
||||
# surfaces the misconfiguration the next tick)
|
||||
# - workflow_dispatch → exit 0 with a warning
|
||||
# (an operator ran this ad-hoc; they already accepted the
|
||||
# state of the repo and want the workflow to short-circuit
|
||||
# so they can rerun after fixing the secret)
|
||||
run: |
|
||||
missing=()
|
||||
for var in CF_API_TOKEN CF_ZONE_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
|
||||
if [ -z "${!var:-}" ]; then
|
||||
missing+=("$var")
|
||||
fi
|
||||
done
|
||||
if [ ${#missing[@]} -gt 0 ]; then
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
|
||||
echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
|
||||
echo "skip=true" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
|
||||
echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
|
||||
echo "::error::a silent skip masked an active CF DNS leak (152/200 zone records) caught only by a manual audit on 2026-04-28; this gate exists to make the gap visible."
|
||||
exit 1
|
||||
fi
|
||||
echo "All required secrets present ✓"
|
||||
echo "skip=false" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Run sweep
|
||||
if: steps.verify.outputs.skip != 'true'
|
||||
# Schedule-vs-dispatch dry-run asymmetry (intentional):
|
||||
# - Scheduled runs: github.event.inputs.dry_run is empty →
|
||||
# defaults to "false" below → script runs with --execute
|
||||
# (the whole point of an hourly janitor).
|
||||
# - Manual workflow_dispatch: input default is true (line 38)
|
||||
# so an ad-hoc operator-triggered run is dry-run by default;
|
||||
# they have to flip the toggle to actually delete.
|
||||
# The script's MAX_DELETE_PCT gate (default 50%) is the second
|
||||
# line of defense regardless of mode.
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
|
||||
echo "Running in dry-run mode — no deletions"
|
||||
bash scripts/ops/sweep-cf-orphans.sh
|
||||
else
|
||||
echo "Running with --execute — will delete identified orphans"
|
||||
bash scripts/ops/sweep-cf-orphans.sh --execute
|
||||
fi
|
||||
@@ -1,124 +0,0 @@
|
||||
name: Sweep stale Cloudflare Tunnels
|
||||
|
||||
# Janitor for Cloudflare Tunnels whose backing tenant no longer
|
||||
# exists. Parallel-shape to sweep-cf-orphans.yml (which sweeps DNS
|
||||
# records); same justification, different CF resource.
|
||||
#
|
||||
# Why this exists separately from sweep-cf-orphans:
|
||||
# - DNS records live on the zone (`/zones/<id>/dns_records`).
|
||||
# - Tunnels live on the account (`/accounts/<id>/cfd_tunnel`).
|
||||
# - Different CF API surface, different scopes; the existing CF
|
||||
# token might not have `account:cloudflare_tunnel:edit`. Splitting
|
||||
# the workflows keeps each one's secret-presence gate independent
|
||||
# so neither silent-skips when the other's secret is missing.
|
||||
# - Cleaner blast radius — operators can disable one without the
|
||||
# other if a regression surfaces.
|
||||
#
|
||||
# Safety: the script's MAX_DELETE_PCT gate (default 90% — higher than
|
||||
# the DNS sweep's 50% because tenant-shaped tunnels are mostly
|
||||
# orphans by design) refuses to nuke past the threshold.
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Hourly at :45 — offset from sweep-cf-orphans (:15) so the two
|
||||
# janitors don't issue parallel CF API bursts at the same minute.
|
||||
- cron: '45 * * * *'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry run only — list what would be deleted, no deletion"
|
||||
required: false
|
||||
type: boolean
|
||||
default: true
|
||||
max_delete_pct:
|
||||
description: "Override safety gate (default 90, set higher only for major cleanup)"
|
||||
required: false
|
||||
default: "90"
|
||||
|
||||
# Don't let two sweeps race the same account.
|
||||
concurrency:
|
||||
group: sweep-cf-tunnels
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
sweep:
|
||||
name: Sweep CF tunnels
|
||||
runs-on: ubuntu-latest
|
||||
# 30 min cap. Was 5 min on the theory that the only thing that
|
||||
# could take >5min is a CF-API hang — but on 2026-05-02 a backlog
|
||||
# of 672 stale tunnels accumulated (large staging E2E run + delayed
|
||||
# sweep) and the serial `curl -X DELETE` loop (~0.7s/tunnel) needed
|
||||
# ~7-8min to drain. The 5-min cap killed the run mid-sweep
|
||||
# (cancelled at 424/672, see run 25248788312); a manual rerun
|
||||
# finished the remainder fine.
|
||||
#
|
||||
# The fix is two-part: parallelize the delete loop (8-way xargs in
|
||||
# the script — see scripts/ops/sweep-cf-tunnels.sh), AND raise the
|
||||
# cap so a one-off backlog doesn't trip a hangs-detector that
|
||||
# turned out to be a real-job-too-slow detector. With 8-way
|
||||
# parallelism, 600+ tunnels drains in ~60s; 30 min is generous
|
||||
# headroom for actual hangs to still surface (and is in line with
|
||||
# the sweep-aws-secrets job; sweep-cf-orphans keeps a 3 min cap).
|
||||
timeout-minutes: 30
|
||||
env:
|
||||
CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
|
||||
CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
|
||||
CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }}
|
||||
CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }}
|
||||
MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '90' }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify required secrets present
|
||||
id: verify
|
||||
# Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
|
||||
# (hardened 2026-04-28 after the silent-no-op incident: the
|
||||
# janitor reported green while doing nothing because secrets
|
||||
# were unset, masking a 152/200 zone-record leak). Same
|
||||
# principle applies here:
|
||||
# - schedule → exit 1 on missing secrets (red CI surfaces it)
|
||||
# - workflow_dispatch → exit 0 with warning (operator-driven,
|
||||
# they already accepted the repo state)
|
||||
run: |
|
||||
missing=()
|
||||
for var in CF_API_TOKEN CF_ACCOUNT_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN; do
|
||||
if [ -z "${!var:-}" ]; then
|
||||
missing+=("$var")
|
||||
fi
|
||||
done
|
||||
if [ ${#missing[@]} -gt 0 ]; then
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
|
||||
echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
|
||||
echo "::warning::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope (separate from the zone:dns:edit scope used by sweep-cf-orphans)."
|
||||
echo "skip=true" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
|
||||
echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
|
||||
echo "::error::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope."
|
||||
exit 1
|
||||
fi
|
||||
echo "All required secrets present ✓"
|
||||
echo "skip=false" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Run sweep
|
||||
if: steps.verify.outputs.skip != 'true'
|
||||
# Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-orphans:
|
||||
# - Scheduled: input empty → "false" → --execute (the whole
|
||||
# point of an hourly janitor).
|
||||
# - Manual workflow_dispatch: input default true → dry-run;
|
||||
# operator must flip it to actually delete.
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
|
||||
echo "Running in dry-run mode — no deletions"
|
||||
bash scripts/ops/sweep-cf-tunnels.sh
|
||||
else
|
||||
echo "Running with --execute — will delete identified orphans"
|
||||
bash scripts/ops/sweep-cf-tunnels.sh --execute
|
||||
fi
|
||||
@@ -1,239 +0,0 @@
|
||||
name: Sweep stale e2e-* orgs (staging)
|
||||
|
||||
# Janitor for staging tenants left behind when E2E cleanup didn't run:
|
||||
# CI cancellations, runner crashes, transient AWS errors mid-cascade,
|
||||
# bash trap missed (signal 9), etc. Without this loop, every failed
|
||||
# teardown leaks an EC2 + DNS + DB row until manual ops cleanup —
|
||||
# 2026-04-23 staging hit the 64 vCPU AWS quota from ~27 such orphans.
|
||||
#
|
||||
# Why not rely on per-test-run teardown:
|
||||
# - Per-run teardown is best-effort by definition. Any process death
|
||||
# after the test starts but before the trap fires leaves debris.
|
||||
# - GH Actions cancellation kills the runner without grace period.
|
||||
# The workflow's `if: always()` step usually catches this, but it
|
||||
# too can fail (CP transient 5xx, runner network issue at the
|
||||
# wrong moment).
|
||||
# - Even when teardown runs, the CP cascade is best-effort in places
|
||||
# (cascadeTerminateWorkspaces logs+continues; DNS deletion same).
|
||||
# - This sweep is the catch-all that converges staging back to clean
|
||||
# regardless of which specific path leaked.
|
||||
#
|
||||
# The PROPER fix is making CP cleanup transactional + verify-after-
|
||||
# terminate (filed separately as cleanup-correctness work). This
|
||||
# workflow is the safety net that catches everything else AND any
|
||||
# future leak source we haven't yet identified.
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Every 15 min. E2E orgs are short-lived (~8-25 min wall clock from
|
||||
# create to teardown — canary is ~8 min, full SaaS ~25 min). The
|
||||
# previous hourly + 120-min stale threshold meant a leaked tenant
|
||||
# could keep an EC2 alive for up to 2 hours, eating ~2 vCPU per
|
||||
# leak. Tightening the cadence + threshold reduces the worst-case
|
||||
# leak window from 120 min to ~45 min (15-min sweep cadence + 30-min
|
||||
# threshold) without risk of catching in-progress runs (the longest
|
||||
# e2e run is the ~25-min full SaaS run, under the 30-min threshold).
|
||||
# See molecule-controlplane#420 for the leak-class accounting that
|
||||
# motivated this tightening.
|
||||
- cron: '*/15 * * * *'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
max_age_minutes:
|
||||
description: "Delete e2e-* orgs older than N minutes (default 30)"
|
||||
required: false
|
||||
default: "30"
|
||||
dry_run:
|
||||
description: "Dry run only — list what would be deleted"
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
# Don't let two sweeps fight. Cron + workflow_dispatch could overlap
|
||||
# on a manual trigger; queue rather than parallel-delete.
|
||||
concurrency:
|
||||
group: sweep-stale-e2e-orgs
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
sweep:
|
||||
name: Sweep e2e orgs
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
||||
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '30' }}
|
||||
DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
|
||||
# Refuse to delete more than this many orgs in one tick. If the
|
||||
# CP DB is briefly empty (or the admin endpoint goes weird and
|
||||
# returns no created_at), every e2e- org would look stale.
|
||||
# Bailing protects against runaway nukes.
|
||||
SAFETY_CAP: 50
|
||||
|
||||
steps:
|
||||
- name: Verify admin token present
|
||||
run: |
|
||||
if [ -z "$ADMIN_TOKEN" ]; then
|
||||
echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set"
|
||||
exit 2
|
||||
fi
|
||||
echo "Admin token present ✓"
|
||||
|
||||
- name: Identify stale e2e orgs
|
||||
id: identify
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Fetch into a file so the python step reads it via stdin —
|
||||
# cleaner than embedding $(curl ...) into a heredoc.
|
||||
curl -sS --fail-with-body --max-time 30 \
|
||||
"$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
> orgs.json
|
||||
|
||||
# Filter:
|
||||
# 1. slug starts with one of the ephemeral test prefixes:
|
||||
# - 'e2e-' — covers e2e-canary-, e2e-canvas-*, etc.
|
||||
# - 'rt-e2e-' — runtime-test harness fixtures (RFC #2251);
|
||||
# missing this prefix left two such tenants
|
||||
# orphaned 8h on staging (2026-05-03), then
|
||||
# hard-failed redeploy-tenants-on-staging
|
||||
# and broke the staging→main auto-promote
|
||||
# chain. Kept in sync with the EPHEMERAL_PREFIX_RE
|
||||
# regex in redeploy-tenants-on-staging.yml.
|
||||
# 2. created_at is older than MAX_AGE_MINUTES ago
|
||||
# Output one slug per line to a file the next step reads.
|
||||
python3 > stale_slugs.txt <<'PY'
|
||||
import json, os
|
||||
from datetime import datetime, timezone, timedelta
|
||||
# SSOT for this list lives in the controlplane Go code:
|
||||
# molecule-controlplane/internal/slugs/ephemeral.go
|
||||
# (var EphemeralPrefixes). The redeploy-fleet auto-rollout
|
||||
# also reads from there to SKIP these slugs — without that
|
||||
# filter, fleet redeploy SSM-failed in-flight E2E tenants
|
||||
# whose containers were still booting, breaking the test
|
||||
# that just spun them up (molecule-controlplane#493).
|
||||
# Update both files together.
|
||||
EPHEMERAL_PREFIXES = ("e2e-", "rt-e2e-")
|
||||
with open("orgs.json") as f:
|
||||
data = json.load(f)
|
||||
max_age = int(os.environ["MAX_AGE_MINUTES"])
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age)
|
||||
for o in data.get("orgs", []):
|
||||
slug = o.get("slug", "")
|
||||
if not slug.startswith(EPHEMERAL_PREFIXES):
|
||||
continue
|
||||
created = o.get("created_at")
|
||||
if not created:
|
||||
# Defensively skip rows without created_at — better
|
||||
# to leave one orphan than nuke a brand-new row
|
||||
# whose timestamp didn't render.
|
||||
continue
|
||||
# Python 3.11+ handles RFC3339 with Z directly via
|
||||
# fromisoformat; older runners need the trailing Z swap.
|
||||
created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
|
||||
if created_dt < cutoff:
|
||||
print(slug)
|
||||
PY
|
||||
|
||||
count=$(wc -l < stale_slugs.txt | tr -d ' ')
|
||||
echo "Found $count stale e2e org(s) older than ${MAX_AGE_MINUTES}m"
|
||||
if [ "$count" -gt 0 ]; then
|
||||
echo "First 20:"
|
||||
head -20 stale_slugs.txt | sed 's/^/ /'
|
||||
fi
|
||||
echo "count=$count" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Safety gate
|
||||
if: steps.identify.outputs.count != '0'
|
||||
run: |
|
||||
count="${{ steps.identify.outputs.count }}"
|
||||
if [ "$count" -gt "$SAFETY_CAP" ]; then
|
||||
echo "::error::Refusing to delete $count orgs in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means the CP admin API returned no created_at or returned a degraded result. Re-run with workflow_dispatch + max_age_minutes if intentional."
|
||||
exit 1
|
||||
fi
|
||||
echo "Within safety cap ($count ≤ $SAFETY_CAP) ✓"
|
||||
|
||||
- name: Delete stale orgs
|
||||
if: steps.identify.outputs.count != '0' && env.DRY_RUN != 'true'
|
||||
run: |
|
||||
set -uo pipefail
|
||||
deleted=0
|
||||
failed=0
|
||||
while IFS= read -r slug; do
|
||||
[ -z "$slug" ] && continue
|
||||
# The DELETE handler requires {"confirm": "<slug>"} matching
|
||||
# the URL slug — fat-finger guard. Idempotent: re-issuing
|
||||
# picks up via org_purges.last_step.
|
||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
||||
set +e
|
||||
curl -sS -o /tmp/del_resp -w "%{http_code}" \
|
||||
--max-time 60 \
|
||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$slug\"}" >/tmp/del_code
|
||||
set -e
|
||||
# Stderr from curl (-sS shows dial errors etc.) goes to runner log.
|
||||
http_code=$(cat /tmp/del_code 2>/dev/null || echo "000")
|
||||
if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
|
||||
deleted=$((deleted+1))
|
||||
echo " deleted: $slug"
|
||||
else
|
||||
failed=$((failed+1))
|
||||
echo " FAILED ($http_code): $slug — $(cat /tmp/del_resp 2>/dev/null | head -c 200)"
|
||||
fi
|
||||
done < stale_slugs.txt
|
||||
echo ""
|
||||
echo "Sweep summary: deleted=$deleted failed=$failed"
|
||||
# Don't fail the workflow on per-org delete errors — the
|
||||
# sweeper is best-effort. Next 15-min tick re-attempts. We
|
||||
# only fail loud at the safety-cap gate above.
|
||||
|
||||
- name: Sweep orphan tunnels
|
||||
# Stale-org cleanup deletes the org (which cascades to tunnel
|
||||
# delete inside the CP). But when that cascade fails partway —
|
||||
# CP transient 5xx after the org row is deleted but before the
|
||||
# CF tunnel delete completes — the tunnel persists with no
|
||||
# matching org row. The reconciler in internal/sweep flags this
|
||||
# as `cf_tunnel kind=orphan`, but nothing automatically reaps it.
|
||||
#
|
||||
# `/cp/admin/orphan-tunnels/cleanup` is the operator-triggered
|
||||
# reaper. Calling it here at the end of every sweep tick
|
||||
# converges the staging CF account to clean even when CP
|
||||
# cascades half-fail.
|
||||
#
|
||||
# PR #492 made the underlying DeleteTunnel actually check
|
||||
# status — pre-fix it silent-succeeded on CF code 1022
|
||||
# ("active connections"), so this step would have been a no-op
|
||||
# against stuck connectors. Post-fix the cleanup invokes
|
||||
# CleanupTunnelConnections + retry, which actually clears the
|
||||
# 1022 case. (#2987)
|
||||
#
|
||||
# Best-effort. Failure here doesn't fail the workflow — next
|
||||
# tick re-attempts. Errors flow to step output for ops review.
|
||||
if: env.DRY_RUN != 'true'
|
||||
run: |
|
||||
set +e
|
||||
curl -sS -o /tmp/cleanup_resp -w "%{http_code}" \
|
||||
--max-time 60 \
|
||||
-X POST "$MOLECULE_CP_URL/cp/admin/orphan-tunnels/cleanup" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" >/tmp/cleanup_code
|
||||
set -e
|
||||
http_code=$(cat /tmp/cleanup_code 2>/dev/null || echo "000")
|
||||
body=$(cat /tmp/cleanup_resp 2>/dev/null | head -c 500)
|
||||
if [ "$http_code" = "200" ]; then
|
||||
count=$(echo "$body" | python3 -c "import sys,json; d=json.loads(sys.stdin.read() or '{}'); print(d.get('deleted_count', 0))" 2>/dev/null || echo "0")
|
||||
failed_n=$(echo "$body" | python3 -c "import sys,json; d=json.loads(sys.stdin.read() or '{}'); print(len(d.get('failed') or {}))" 2>/dev/null || echo "0")
|
||||
echo "Orphan-tunnel sweep: deleted=$count failed=$failed_n"
|
||||
else
|
||||
echo "::warning::orphan-tunnels cleanup returned HTTP $http_code — body: $body"
|
||||
fi
|
||||
|
||||
- name: Dry-run summary
|
||||
if: env.DRY_RUN == 'true'
|
||||
run: |
|
||||
echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s) AND triggered orphan-tunnels cleanup. Re-run with dry_run=false to actually delete."
|
||||
@@ -1,52 +0,0 @@
|
||||
name: Ops Scripts Tests
|
||||
|
||||
# Runs the unittest suite for scripts/ on every PR + push that touches
|
||||
# anything under scripts/. Kept separate from the main CI so a script-only
|
||||
# change doesn't trigger the heavier Go/Canvas/Python pipelines.
|
||||
#
|
||||
# Discovery layout: tests sit alongside the code they test (see
|
||||
# scripts/ops/test_sweep_cf_decide.py for the pattern; scripts/
|
||||
# test_build_runtime_package.py for the rewriter coverage). The job
|
||||
# below runs `unittest discover` TWICE — once from `scripts/`, once
|
||||
# from `scripts/ops/` — because neither dir has an `__init__.py`, so
|
||||
# a single discover from `scripts/` doesn't recurse into the ops
|
||||
# subdir. Two passes is simpler than retrofitting namespace packages.
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main, staging]
|
||||
paths:
|
||||
- 'scripts/**'
|
||||
- '.github/workflows/test-ops-scripts.yml'
|
||||
pull_request:
|
||||
branches: [main, staging]
|
||||
paths:
|
||||
- 'scripts/**'
|
||||
- '.github/workflows/test-ops-scripts.yml'
|
||||
merge_group:
|
||||
types: [checks_requested]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: Ops scripts (unittest)
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
||||
with:
|
||||
python-version: '3.11'
|
||||
- name: Run scripts/ unittests (build_runtime_package, …)
|
||||
# Top-level scripts/ tests live alongside their target file
|
||||
# (e.g. scripts/test_build_runtime_package.py exercises
|
||||
# scripts/build_runtime_package.py). discover from scripts/
|
||||
# picks up only top-level test_*.py because scripts/ops/ has
|
||||
# no __init__.py — that's intentional, so we run two passes.
|
||||
working-directory: scripts
|
||||
run: python -m unittest discover -t . -p 'test_*.py' -v
|
||||
- name: Run scripts/ops/ unittests (sweep_cf_decide, …)
|
||||
working-directory: scripts/ops
|
||||
run: python -m unittest discover -p 'test_*.py' -v
|
||||
@@ -68,14 +68,103 @@ export function Toolbar() {
|
||||
return c;
|
||||
}, [nodes]);
|
||||
|
||||
/**
|
||||
* Stop All - task #377 fix.
|
||||
*
|
||||
* BEFORE this PR: directly POSTed `/workspaces/:id/restart`, which tears
|
||||
* the container down and back up. That kills in-flight tool subprocesses
|
||||
* (e.g. `bash -c 'sleep 600'`) but is heavy and discards any in-progress
|
||||
* agent state. It also bypasses the runtime-side fast cancel path (task
|
||||
* #377 PR#40 in template-claude-code) - meaning flipping
|
||||
* `MOLECULE_STOP_PROPAGATE=true` would produce zero canary signal because
|
||||
* nothing ever invokes `executor.cancel()` in production.
|
||||
*
|
||||
* AFTER this PR (two-phase polite cancel):
|
||||
*
|
||||
* 1. POST `tasks/cancel` (A2A JSON-RPC) to each active workspace's
|
||||
* `/workspaces/:id/a2a` proxy. The platform proxies the envelope to
|
||||
* the workspace runtime; the a2a-sdk framework dispatches `tasks/cancel`
|
||||
* to `AgentExecutor.cancel()` (a2a-sdk 1.0.3
|
||||
* `a2a/compat/v0_3/types.py` line 1125 pins the wire literal as
|
||||
* `Literal["tasks/cancel"]`; A2A protocol spec section 9.4.5 maps the
|
||||
* abstract `CancelTask` operation to that wire string). The runtime's
|
||||
* executor cancel path signals the CLI subprocess group with
|
||||
* SIGTERM/grace/SIGKILL (template-claude-code PR#40 `stop_propagate.py`).
|
||||
*
|
||||
* 2. Poll the canvas store (the platform pushes `TASK_UPDATED` over WS
|
||||
* on `active_tasks` changes - `canvas-events.ts` line 400) for up to
|
||||
* `STOP_ALL_DRAIN_TIMEOUT_MS`. A workspace whose `activeTasks` drops
|
||||
* to 0 is considered drained and is NOT restarted.
|
||||
*
|
||||
* 3. For any workspace that DID NOT drain inside the timeout - runtime
|
||||
* is on an old image without the cancel path, or the cancel
|
||||
* propagation is stuck - fall back to the original heavy
|
||||
* `/workspaces/:id/restart`. The original behavior is preserved as a
|
||||
* floor so a stuck workspace still gets stopped; the polite path is
|
||||
* a fast top-up that lets well-behaved workspaces cancel without
|
||||
* losing context.
|
||||
*
|
||||
* The polite-cancel envelope mirrors `ScheduleTab.handleRunNow` (line 168)
|
||||
* which is the only other place in canvas that POSTs `/workspaces/:id/a2a`
|
||||
* directly. Method string `tasks/cancel` and empty `params` match the
|
||||
* a2a-sdk shape verified above. The proxy adds `jsonrpc:"2.0"` and `id`
|
||||
* via `normalizeA2APayload` server-side, so the canvas envelope omits them.
|
||||
*/
|
||||
const stopAll = useCallback(async () => {
  // Drain window and poll cadence for phase 2. Per the original author's
  // note the timeout is sized to cover the runtime's SIGTERM grace plus a
  // small WS round-trip buffer for the TASK_UPDATED push.
  const STOP_ALL_DRAIN_TIMEOUT_MS = 8000;
  const STOP_ALL_POLL_INTERVAL_MS = 250;

  setStopping(true);
  try {
    // Snapshot the ids of every workspace that has work in flight at the
    // moment the button was pressed.
    const activeIds = nodes
      .filter((n) => (n.data.activeTasks as number) > 0)
      .map((n) => n.id);

    // Phase 1 - polite cancel on every active workspace in parallel.
    // Rejections are swallowed: a failed tasks/cancel just means that
    // workspace never drains and falls through to /restart in phase 3.
    await Promise.all(
      activeIds.map((id) =>
        api
          .post(`/workspaces/${id}/a2a`, {
            method: "tasks/cancel",
            params: {},
          })
          .catch(() => {})
      )
    );

    // Phase 2 - poll the live store until every workspace reports
    // activeTasks === 0 or the hard deadline passes.
    const deadline = Date.now() + STOP_ALL_DRAIN_TIMEOUT_MS;
    let undrained = new Set(activeIds);
    while (undrained.size > 0 && Date.now() < deadline) {
      await new Promise((r) => setTimeout(r, STOP_ALL_POLL_INTERVAL_MS));
      // Read through getState() rather than the closed-over `nodes`, which
      // is frozen at the value from the render that created this callback.
      const fresh = useCanvasStore.getState().nodes;
      const stillActive = new Set<string>();
      for (const id of undrained) {
        const n = fresh.find((x) => x.id === id);
        // Missing node (workspace deleted mid-cancel) is treated as
        // drained - there's nothing left to restart.
        if (n && (n.data.activeTasks as number) > 0) stillActive.add(id);
      }
      undrained = stillActive;
    }

    // Phase 3 - hard-restart anything that did not drain. Same call shape
    // as the pre-fix Stop All; drained workspaces are spared.
    if (undrained.size > 0) {
      await Promise.all(
        Array.from(undrained).map((id) =>
          api.post(`/workspaces/${id}/restart`).catch(() => {})
        )
      );
    }
  } finally {
    // Always release the button, even if a phase throws unexpectedly;
    // otherwise the toolbar would stay in its "stopping" state for good.
    setStopping(false);
  }
}, [nodes]);
|
||||
|
||||
|
||||
@@ -131,14 +131,30 @@ const defaultStore = {
|
||||
batchDelete: vi.fn(() => Promise.resolve()),
|
||||
};
|
||||
|
||||
vi.mock("@/store/canvas", () => ({
|
||||
useCanvasStore: vi.fn((selector: (s: typeof defaultStore) => unknown) =>
|
||||
vi.mock("@/store/canvas", () => {
|
||||
// useCanvasStore is used in two shapes:
|
||||
// 1. As a hook: `useCanvasStore((s) => s.x)` — selector path.
|
||||
// 2. As a static accessor: `useCanvasStore.getState().nodes` —
|
||||
// used by stopAll's drain-poll loop (task #377 Toolbar fix) and
|
||||
// restartAll's success-clear loop. Both read the LIVE
|
||||
// defaultStore object so tests that mutate `defaultStore.nodes`
|
||||
// mid-flight (e.g. simulating a TASK_UPDATED that drops
|
||||
// activeTasks to 0) see the update on the next poll tick.
|
||||
const hook = vi.fn((selector: (s: typeof defaultStore) => unknown) =>
|
||||
selector(defaultStore)
|
||||
),
|
||||
}));
|
||||
) as unknown as ((selector: (s: typeof defaultStore) => unknown) => unknown) & {
|
||||
getState: () => typeof defaultStore;
|
||||
};
|
||||
hook.getState = () => defaultStore;
|
||||
return { useCanvasStore: hook };
|
||||
});
|
||||
|
||||
// ── Component under test ───────────────────────────────────────────────────────
|
||||
import { Toolbar } from "../Toolbar";
|
||||
// Imported AFTER vi.mock("@/lib/api", ...) above (hoisted) so this
|
||||
// resolves to the mock module; gives the new task #377 tests a typed
|
||||
// handle on api.post without a CJS require() (Vitest runs ESM).
|
||||
import { api as mockedApi } from "@/lib/api";
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -315,3 +331,157 @@ describe("Toolbar — ? shortcut opens shortcuts dialog", () => {
|
||||
expect(screen.queryByTestId("shortcuts-dialog")).toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
// ── Toolbar — Stop All polite-cancel flow (task #377) ───────────────────────
|
||||
|
||||
describe("Toolbar — Stop All polite cancel before restart (#377)", () => {
|
||||
// `api` resolves to the top-level vi.mock factory's mocked `post`.
|
||||
// We type-cast so TS allows mockReset/mockResolvedValue/mockImplementation
|
||||
// calls without leaking the mock surface into the production type.
|
||||
const api = mockedApi as unknown as { post: ReturnType<typeof vi.fn> };
|
||||
|
||||
/**
|
||||
* Build a working set of two active workspaces so the assertions can
|
||||
* distinguish per-id behavior (drained vs undrained) within one test.
|
||||
*/
|
||||
const seedTwoActive = () => {
|
||||
defaultStore.nodes = toStoreNodes(makeNodes(["online", "online"], [2, 2]));
|
||||
};
|
||||
|
||||
/**
|
||||
* Drive an async useCallback handler to completion. Vitest's fake
|
||||
* timers don't see microtasks unless we yield between advances; the
|
||||
* helper interleaves `vi.advanceTimersByTimeAsync` with microtask
|
||||
* yields so pending fetch resolutions and setTimeout callbacks both
|
||||
* settle before the assertion runs.
|
||||
*/
|
||||
const advanceUntilSettled = async (ms: number) => {
|
||||
await vi.advanceTimersByTimeAsync(ms);
|
||||
// One extra tick lets any chained .then() after a setTimeout
|
||||
// resolution fire before the test moves on.
|
||||
await Promise.resolve();
|
||||
};
|
||||
|
||||
beforeEach(() => {
|
||||
vi.useFakeTimers();
|
||||
api.post.mockReset();
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it("phase 1: issues tasks/cancel via /workspaces/:id/a2a BEFORE any /restart", async () => {
|
||||
seedTwoActive();
|
||||
// Hold both tasks/cancel responses open so the click handler is
|
||||
// observably paused at phase 1. We don't actually need to resolve
|
||||
// them for the order assertion — just inspect the call log.
|
||||
let resolveCancels!: () => void;
|
||||
const cancelGate = new Promise<void>((r) => { resolveCancels = r; });
|
||||
api.post.mockImplementation(async (path: string) => {
|
||||
if (path.endsWith("/a2a")) {
|
||||
await cancelGate;
|
||||
}
|
||||
return undefined;
|
||||
});
|
||||
|
||||
render(<Toolbar />);
|
||||
const btn = screen.getByRole("button", { name: /stop all running tasks/i });
|
||||
fireEvent.click(btn);
|
||||
|
||||
// Yield once so the click handler enters phase 1 and dispatches the
|
||||
// two /a2a POSTs.
|
||||
await Promise.resolve();
|
||||
await Promise.resolve();
|
||||
|
||||
const a2aCalls = api.post.mock.calls.filter((c) => String(c[0]).endsWith("/a2a"));
|
||||
const restartCalls = api.post.mock.calls.filter((c) => String(c[0]).endsWith("/restart"));
|
||||
expect(a2aCalls.length).toBe(2);
|
||||
expect(restartCalls.length).toBe(0);
|
||||
|
||||
// Each /a2a POST carries the canonical tasks/cancel envelope.
|
||||
for (const call of a2aCalls) {
|
||||
expect(call[1]).toEqual({ method: "tasks/cancel", params: {} });
|
||||
}
|
||||
|
||||
// Release the gate so the test cleanup doesn't dangle.
|
||||
resolveCancels();
|
||||
await advanceUntilSettled(10_000);
|
||||
});
|
||||
|
||||
it("phase 2: when activeTasks drains to 0 during the poll window, /restart is NOT called", async () => {
|
||||
seedTwoActive();
|
||||
api.post.mockResolvedValue(undefined);
|
||||
|
||||
render(<Toolbar />);
|
||||
fireEvent.click(screen.getByRole("button", { name: /stop all running tasks/i }));
|
||||
|
||||
// Let phase 1 fire (the two tasks/cancel calls).
|
||||
await Promise.resolve();
|
||||
await Promise.resolve();
|
||||
|
||||
// Simulate the platform pushing TASK_UPDATED with active_tasks=0
|
||||
// on both workspaces — emulate by mutating the store directly,
|
||||
// which is what canvas-events.ts does in production.
|
||||
defaultStore.nodes = toStoreNodes(makeNodes(["online", "online"], [0, 0]));
|
||||
|
||||
// Advance past the first poll interval (250ms) so the loop sees
|
||||
// the drained store and exits early.
|
||||
await advanceUntilSettled(400);
|
||||
// Drain any remaining timers so the handler returns cleanly.
|
||||
await advanceUntilSettled(10_000);
|
||||
|
||||
const restartCalls = api.post.mock.calls.filter((c) => String(c[0]).endsWith("/restart"));
|
||||
expect(restartCalls.length).toBe(0);
|
||||
});
|
||||
|
||||
it("phase 3: when activeTasks does NOT drain inside the timeout, falls through to /restart for each stuck workspace", async () => {
|
||||
seedTwoActive();
|
||||
api.post.mockResolvedValue(undefined);
|
||||
|
||||
render(<Toolbar />);
|
||||
fireEvent.click(screen.getByRole("button", { name: /stop all running tasks/i }));
|
||||
|
||||
// Phase 1 dispatch.
|
||||
await Promise.resolve();
|
||||
await Promise.resolve();
|
||||
|
||||
// Do NOT drain — activeTasks stays at 2 for both. Advance past the
|
||||
// 8000ms drain timeout plus a buffer so phase 3's /restart POSTs fire.
|
||||
await advanceUntilSettled(9_000);
|
||||
await advanceUntilSettled(1_000);
|
||||
|
||||
const a2aCalls = api.post.mock.calls.filter((c) => String(c[0]).endsWith("/a2a"));
|
||||
const restartCalls = api.post.mock.calls.filter((c) => String(c[0]).endsWith("/restart"));
|
||||
expect(a2aCalls.length).toBe(2);
|
||||
expect(restartCalls.length).toBe(2);
|
||||
|
||||
// Order check: every /a2a call comes before every /restart call.
|
||||
const lastA2AIdx = Math.max(
|
||||
...api.post.mock.calls.map((c, i) => (String(c[0]).endsWith("/a2a") ? i : -1))
|
||||
);
|
||||
const firstRestartIdx = Math.min(
|
||||
...api.post.mock.calls.map((c, i) => (String(c[0]).endsWith("/restart") ? i : Infinity))
|
||||
);
|
||||
expect(lastA2AIdx).toBeLessThan(firstRestartIdx);
|
||||
});
|
||||
|
||||
it("phase 3 selective: drains only one of two workspaces — /restart is called only for the stuck one", async () => {
|
||||
seedTwoActive();
|
||||
api.post.mockResolvedValue(undefined);
|
||||
|
||||
render(<Toolbar />);
|
||||
fireEvent.click(screen.getByRole("button", { name: /stop all running tasks/i }));
|
||||
|
||||
await Promise.resolve();
|
||||
await Promise.resolve();
|
||||
|
||||
// ws-0 drains immediately, ws-1 stays stuck for the full timeout.
|
||||
defaultStore.nodes = toStoreNodes(makeNodes(["online", "online"], [0, 2]));
|
||||
await advanceUntilSettled(9_500);
|
||||
|
||||
const restartCalls = api.post.mock.calls.filter((c) => String(c[0]).endsWith("/restart"));
|
||||
expect(restartCalls.length).toBe(1);
|
||||
expect(restartCalls[0][0]).toBe("/workspaces/ws-1/restart");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -0,0 +1,181 @@
|
||||
'use client';
|
||||
|
||||
import { useCallback, useEffect, useState } from 'react';
|
||||
import { api } from '@/lib/api';
|
||||
import { fetchSession, type Session } from '@/lib/auth';
|
||||
import { getTenantSlug } from '@/lib/tenant';
|
||||
import { Spinner } from '@/components/Spinner';
|
||||
|
||||
/**
|
||||
* Organization-identity surface inside SettingsPanel.
|
||||
*
|
||||
* Closes a chronic UX gap where users (and our own AI agents) had to
|
||||
* call /cp/auth/me or /cp/orgs from browser devtools to read their
|
||||
* org_id UUID. Now: a copy-buttoned view of name + slug + UUID for the
|
||||
* currently-active org, plus a switcher list when the user belongs to
|
||||
* multiple orgs.
|
||||
*
|
||||
* Data path:
|
||||
* 1. fetchSession() → /cp/auth/me → current org_id
|
||||
* 2. api.get('/cp/orgs') → list of all orgs the user belongs to
|
||||
* 3. Match by id === session.org_id; fall back to host-slug match
|
||||
* if the session probe loses the race.
|
||||
*
|
||||
* Read-only — this tab never mutates. Org creation/switching lives at
|
||||
* /orgs (the post-signup landing page).
|
||||
*/
|
||||
|
||||
interface Org {
|
||||
id: string;
|
||||
slug: string;
|
||||
name: string;
|
||||
status?: string;
|
||||
}
|
||||
|
||||
// /cp/orgs may return a bare array or {orgs: []} — see orgs/page.tsx
|
||||
// for the same defensive unwrap.
|
||||
type OrgsResponse = Org[] | { orgs?: Org[] };
|
||||
|
||||
export function OrgInfoTab() {
  const [orgs, setOrgs] = useState<Org[] | null>(null);
  const [session, setSession] = useState<Session | null>(null);
  const [error, setError] = useState<string | null>(null);
  const [loading, setLoading] = useState(true);

  // One-shot load on mount: session probe and org list are fetched in
  // parallel. A failed session probe is tolerated (resolves to null); a
  // failed /cp/orgs request surfaces as the error banner.
  useEffect(() => {
    let active = true;

    const load = async () => {
      try {
        const sessionProbe = fetchSession().catch(() => null);
        const orgsRequest = api.get<OrgsResponse>('/cp/orgs');
        const [sess, body] = await Promise.all([sessionProbe, orgsRequest]);
        if (!active) return;
        setSession(sess);
        // Accept both response shapes: bare array or { orgs: [...] }.
        setOrgs(Array.isArray(body) ? body : body.orgs ?? []);
      } catch (e) {
        if (active) {
          setError(e instanceof Error ? e.message : 'Failed to load org info');
        }
      } finally {
        if (active) setLoading(false);
      }
    };

    void load();

    // Guard against state updates after the tab has unmounted.
    return () => {
      active = false;
    };
  }, []);

  // Resolve the current org: prefer the session's org_id, then fall back
  // to matching the tenant slug.
  const tenantSlug = getTenantSlug();
  let currentOrg: Org | null = null;
  if (orgs) {
    const bySession = session ? orgs.find((o) => o.id === session.org_id) : undefined;
    const bySlug = tenantSlug ? orgs.find((o) => o.slug === tenantSlug) : undefined;
    currentOrg = bySession ?? bySlug ?? null;
  }
  const otherOrgs = (orgs ?? []).filter((o) => o.id !== currentOrg?.id);

  if (loading) {
    return (
      <div
        role="status"
        aria-live="polite"
        className="flex items-center justify-center gap-2 py-6 text-ink-mid text-xs"
      >
        <Spinner /> Loading organization…
      </div>
    );
  }

  if (error) {
    return (
      <div className="p-4">
        <div className="px-3 py-2 bg-red-950/40 border border-red-800/50 rounded-lg text-[10px] text-bad">
          {error}
        </div>
      </div>
    );
  }

  if (!currentOrg) {
    return (
      <div className="p-4">
        <p className="text-xs text-ink-mid">
          No organization found for this session. If this is unexpected, sign out and back in, or visit{' '}
          <a href="/orgs" className="underline">/orgs</a>.
        </p>
      </div>
    );
  }

  const hasOtherOrgs = otherOrgs.length > 0;

  return (
    <div className="p-4 space-y-4">
      <div>
        <h3 className="text-sm font-semibold text-ink mb-1">Current Organization</h3>
        <p className="text-[10px] text-ink-mid leading-relaxed">
          IDs you can paste into API calls, support tickets, or CLI arguments. The UUID never changes;
          the slug is the URL subdomain.
        </p>
      </div>
      <OrgIdentityCard org={currentOrg} highlighted />
      {hasOtherOrgs && (
        <div className="space-y-2 pt-2">
          <h4 className="text-[11px] font-semibold text-ink-mid uppercase tracking-wider">
            Your other organizations ({otherOrgs.length})
          </h4>
          {otherOrgs.map((o) => (
            <OrgIdentityCard key={o.id} org={o} />
          ))}
        </div>
      )}
    </div>
  );
}
|
||||
|
||||
/**
 * Card showing one organization's name, optional status, slug and UUID.
 * `highlighted` switches the border/background to the accent treatment.
 */
function OrgIdentityCard({ org, highlighted }: { org: Org; highlighted?: boolean }) {
  const tone = highlighted
    ? 'border-accent/40 bg-accent-strong/5'
    : 'border-line/40 bg-surface-card/40';

  return (
    <div
      className={`rounded-lg border p-3 space-y-2 ${tone}`}
      data-testid={`org-card-${org.slug}`}
    >
      <div className="flex items-baseline justify-between gap-2">
        <span className="text-[12px] font-medium text-ink truncate">{org.name}</span>
        {/* Status badge is only rendered when the API supplied one. */}
        {org.status ? (
          <span className="text-[9px] text-ink-mid uppercase tracking-wider shrink-0">{org.status}</span>
        ) : null}
      </div>
      <IdentityRow label="Slug" value={org.slug} />
      <IdentityRow label="UUID" value={org.id} mono />
    </div>
  );
}
|
||||
|
||||
/**
 * One labelled, copyable identifier row (label + value + Copy button).
 *
 * @param label  Short caption shown to the left and used in the button's
 *               accessible name (`Copy ${label}`).
 * @param value  The text displayed and written to the clipboard.
 * @param mono   Render the value in a monospace font.
 */
function IdentityRow({ label, value, mono }: { label: string; value: string; mono?: boolean }) {
  const [copied, setCopied] = useState(false);

  // Flip the "Copied" label back after 2s. Driving the timer from an effect
  // means it is cancelled if the row unmounts before it fires.
  useEffect(() => {
    if (!copied) return;
    const timer = setTimeout(() => setCopied(false), 2000);
    return () => clearTimeout(timer);
  }, [copied]);

  const onCopy = useCallback(() => {
    // writeText is invoked synchronously inside the click handler (the code
    // before the first await runs immediately). Awaiting it inside try/catch
    // covers both failure modes: a synchronous throw (clipboard API missing)
    // and an asynchronous rejection (permission denied).
    void (async () => {
      try {
        await navigator.clipboard.writeText(value);
        setCopied(true);
      } catch {
        /* copy failed - leave the label alone; user can still triple-click select */
      }
    })();
  }, [value]);

  return (
    <div className="flex items-center gap-2">
      <span className="text-[10px] text-ink-mid w-10 shrink-0">{label}</span>
      <code
        className={`flex-1 text-[11px] text-ink bg-surface-sunken/60 px-2 py-1 rounded select-all break-all ${
          mono ? 'font-mono' : ''
        }`}
      >
        {value}
      </code>
      <button
        type="button"
        onClick={onCopy}
        aria-label={`Copy ${label}`}
        className="shrink-0 px-2 py-1 bg-surface-card/60 hover:bg-surface-card border border-line/40 rounded text-[10px] text-ink-mid hover:text-ink transition-colors focus:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
      >
        {copied ? 'Copied' : 'Copy'}
      </button>
    </div>
  );
}
|
||||
@@ -8,6 +8,7 @@ import { useKeyboardShortcut } from '@/hooks/use-keyboard-shortcut';
|
||||
import { SecretsTab } from './SecretsTab';
|
||||
import { TokensTab } from './TokensTab';
|
||||
import { OrgTokensTab } from './OrgTokensTab';
|
||||
import { OrgInfoTab } from './OrgInfoTab';
|
||||
import { UnsavedChangesGuard } from './UnsavedChangesGuard';
|
||||
|
||||
/** Module-level ref so TopBar's SettingsButton can receive focus back on close. */
|
||||
@@ -116,6 +117,9 @@ export function SettingsPanel({ workspaceId }: SettingsPanelProps) {
|
||||
<Tabs.Trigger value="org-tokens" className="settings-panel__tab">
|
||||
Org API Keys
|
||||
</Tabs.Trigger>
|
||||
<Tabs.Trigger value="org-info" className="settings-panel__tab">
|
||||
Organization
|
||||
</Tabs.Trigger>
|
||||
</Tabs.List>
|
||||
|
||||
<Tabs.Content value="api-keys" className="settings-panel__content">
|
||||
@@ -129,6 +133,10 @@ export function SettingsPanel({ workspaceId }: SettingsPanelProps) {
|
||||
<Tabs.Content value="org-tokens" className="settings-panel__content">
|
||||
<OrgTokensTab />
|
||||
</Tabs.Content>
|
||||
|
||||
<Tabs.Content value="org-info" className="settings-panel__content">
|
||||
<OrgInfoTab />
|
||||
</Tabs.Content>
|
||||
</Tabs.Root>
|
||||
|
||||
<div className="settings-panel__footer">
|
||||
|
||||
@@ -0,0 +1,207 @@
|
||||
// @vitest-environment jsdom
|
||||
/**
|
||||
* Tests for OrgInfoTab — surfaces current org name/slug/UUID with copy
|
||||
* buttons, plus a list of the user's other orgs when applicable.
|
||||
*
|
||||
* Covers (≥3 cases per the closing-the-UX-gap brief):
|
||||
* - Loading state (spinner + aria-live)
|
||||
* - Renders current org matched by session.org_id, with UUID + slug + name
|
||||
* - Copy button writes the UUID to navigator.clipboard
|
||||
* - Falls back to host-slug match when session lookup fails
|
||||
* - Lists other orgs when user belongs to multiple
|
||||
* - Error banner when /cp/orgs throws
|
||||
* - Empty/no-match state renders the recovery hint, not a crash
|
||||
*/
|
||||
import React from "react";
|
||||
import { render, screen, fireEvent, cleanup, act, waitFor } from "@testing-library/react";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { OrgInfoTab } from "../OrgInfoTab";
|
||||
|
||||
const mockGet = vi.fn();
|
||||
const mockFetchSession = vi.fn();
|
||||
const mockGetTenantSlug = vi.fn();
|
||||
|
||||
vi.mock("@/lib/api", () => ({
|
||||
api: { get: (...args: unknown[]) => mockGet(...args) },
|
||||
}));
|
||||
vi.mock("@/lib/auth", () => ({
|
||||
fetchSession: (...args: unknown[]) => mockFetchSession(...args),
|
||||
}));
|
||||
vi.mock("@/lib/tenant", () => ({
|
||||
getTenantSlug: (...args: unknown[]) => mockGetTenantSlug(...args),
|
||||
}));
|
||||
|
||||
// Stub clipboard
|
||||
vi.stubGlobal("navigator", {
|
||||
clipboard: { writeText: vi.fn().mockResolvedValue(undefined) },
|
||||
});
|
||||
|
||||
// Reset every mock to a known baseline before each test.
beforeEach(() => {
  vi.useRealTimers();
  mockGet.mockReset();
  mockFetchSession.mockReset();
  mockGetTenantSlug.mockReset();
  mockGetTenantSlug.mockReturnValue("");
  // mockReset() also discards the mockResolvedValue configured when the
  // clipboard was stubbed, which would leave writeText returning undefined
  // instead of a Promise. Re-arm it so the stub keeps the real API's shape.
  const writeText = vi.mocked(navigator.clipboard.writeText);
  writeText.mockReset();
  writeText.mockResolvedValue(undefined);
});
|
||||
|
||||
afterEach(() => {
|
||||
cleanup();
|
||||
});
|
||||
|
||||
async function flush() {
|
||||
await act(async () => {
|
||||
await Promise.resolve();
|
||||
await Promise.resolve();
|
||||
});
|
||||
}
|
||||
|
||||
const AGENTS_TEAM = {
|
||||
id: "2355b568-0799-4cc7-9e7f-806747f9958c",
|
||||
slug: "agents-team",
|
||||
name: "Agents Team",
|
||||
status: "running",
|
||||
};
|
||||
const OTHER_ORG = {
|
||||
id: "11111111-1111-4111-8111-111111111111",
|
||||
slug: "skunkworks",
|
||||
name: "Skunkworks",
|
||||
status: "running",
|
||||
};
|
||||
|
||||
// ─── Loading ─────────────────────────────────────────────────────────────────
|
||||
|
||||
describe("OrgInfoTab — loading", () => {
|
||||
it("shows spinner while fetching", () => {
|
||||
mockGet.mockImplementation(() => new Promise(() => {}));
|
||||
mockFetchSession.mockImplementation(() => new Promise(() => {}));
|
||||
render(<OrgInfoTab />);
|
||||
const status = screen.getByRole("status");
|
||||
expect(status).toBeTruthy();
|
||||
expect(status.getAttribute("aria-live")).toBe("polite");
|
||||
expect(status.textContent).toContain("Loading organization");
|
||||
});
|
||||
});
|
||||
|
||||
// ─── Current org renders + copy ──────────────────────────────────────────────
|
||||
|
||||
describe("OrgInfoTab — current org", () => {
|
||||
it("renders the org matched by session.org_id with name, slug, UUID", async () => {
|
||||
mockFetchSession.mockResolvedValue({
|
||||
user_id: "u-1",
|
||||
org_id: AGENTS_TEAM.id,
|
||||
email: "hongming@moleculesai.app",
|
||||
});
|
||||
mockGet.mockResolvedValue([AGENTS_TEAM, OTHER_ORG]);
|
||||
|
||||
render(<OrgInfoTab />);
|
||||
await flush();
|
||||
await waitFor(() => screen.getByText("Current Organization"));
|
||||
|
||||
// Name shown
|
||||
expect(screen.getByText("Agents Team")).toBeTruthy();
|
||||
// Slug shown
|
||||
expect(screen.getByText("agents-team")).toBeTruthy();
|
||||
// UUID shown
|
||||
expect(screen.getByText(AGENTS_TEAM.id)).toBeTruthy();
|
||||
});
|
||||
|
||||
it("copy-UUID button writes the UUID to navigator.clipboard", async () => {
|
||||
mockFetchSession.mockResolvedValue({
|
||||
user_id: "u-1",
|
||||
org_id: AGENTS_TEAM.id,
|
||||
email: "hongming@moleculesai.app",
|
||||
});
|
||||
mockGet.mockResolvedValue([AGENTS_TEAM]);
|
||||
|
||||
render(<OrgInfoTab />);
|
||||
await flush();
|
||||
await waitFor(() => screen.getByText(AGENTS_TEAM.id));
|
||||
|
||||
const copyUuid = screen.getByRole("button", { name: /Copy UUID/i });
|
||||
fireEvent.click(copyUuid);
|
||||
|
||||
expect(navigator.clipboard.writeText).toHaveBeenCalledWith(AGENTS_TEAM.id);
|
||||
// Optimistic "Copied" label flip
|
||||
await waitFor(() =>
|
||||
expect(
|
||||
screen.getByRole("button", { name: /Copy UUID/i }).textContent,
|
||||
).toContain("Copied"),
|
||||
);
|
||||
});
|
||||
|
||||
it("copy-Slug button writes the slug to navigator.clipboard", async () => {
|
||||
mockFetchSession.mockResolvedValue({
|
||||
user_id: "u-1",
|
||||
org_id: AGENTS_TEAM.id,
|
||||
email: "hongming@moleculesai.app",
|
||||
});
|
||||
mockGet.mockResolvedValue([AGENTS_TEAM]);
|
||||
|
||||
render(<OrgInfoTab />);
|
||||
await flush();
|
||||
await waitFor(() => screen.getByText(AGENTS_TEAM.slug));
|
||||
|
||||
fireEvent.click(screen.getByRole("button", { name: /Copy Slug/i }));
|
||||
expect(navigator.clipboard.writeText).toHaveBeenCalledWith(AGENTS_TEAM.slug);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── Fallback: host-slug match when session fails ────────────────────────────
|
||||
|
||||
describe("OrgInfoTab — fallbacks", () => {
|
||||
it("falls back to host-slug match when fetchSession rejects", async () => {
|
||||
mockFetchSession.mockRejectedValue(new Error("session probe failed"));
|
||||
mockGetTenantSlug.mockReturnValue("agents-team");
|
||||
mockGet.mockResolvedValue({ orgs: [AGENTS_TEAM, OTHER_ORG] }); // wrapped shape
|
||||
|
||||
render(<OrgInfoTab />);
|
||||
await flush();
|
||||
await waitFor(() => screen.getByText("Current Organization"));
|
||||
|
||||
expect(screen.getByText("Agents Team")).toBeTruthy();
|
||||
expect(screen.getByText(AGENTS_TEAM.id)).toBeTruthy();
|
||||
});
|
||||
|
||||
it("lists other orgs the user belongs to under a separate header", async () => {
|
||||
mockFetchSession.mockResolvedValue({
|
||||
user_id: "u-1",
|
||||
org_id: AGENTS_TEAM.id,
|
||||
email: "hongming@moleculesai.app",
|
||||
});
|
||||
mockGet.mockResolvedValue([AGENTS_TEAM, OTHER_ORG]);
|
||||
|
||||
render(<OrgInfoTab />);
|
||||
await flush();
|
||||
await waitFor(() => screen.getByText(/Your other organizations/));
|
||||
|
||||
expect(screen.getByText("Skunkworks")).toBeTruthy();
|
||||
expect(screen.getByText(OTHER_ORG.id)).toBeTruthy();
|
||||
});
|
||||
});
|
||||
|
||||
// ─── Error + empty handling ──────────────────────────────────────────────────
|
||||
|
||||
describe("OrgInfoTab — error + empty", () => {
|
||||
it("renders an error banner when /cp/orgs throws", async () => {
|
||||
mockFetchSession.mockResolvedValue(null);
|
||||
mockGet.mockRejectedValue(new Error("API GET /cp/orgs: 500 boom"));
|
||||
|
||||
render(<OrgInfoTab />);
|
||||
await flush();
|
||||
await waitFor(() => screen.getByText(/500 boom/));
|
||||
expect(screen.queryByText("Current Organization")).toBeNull();
|
||||
});
|
||||
|
||||
it("renders the recovery hint when no org matches (no crash)", async () => {
|
||||
mockFetchSession.mockResolvedValue(null);
|
||||
mockGetTenantSlug.mockReturnValue("");
|
||||
mockGet.mockResolvedValue([]);
|
||||
|
||||
render(<OrgInfoTab />);
|
||||
await flush();
|
||||
await waitFor(() =>
|
||||
screen.getByText(/No organization found for this session/),
|
||||
);
|
||||
});
|
||||
});
|
||||
@@ -8,3 +8,4 @@ export { SearchBar } from './SearchBar';
|
||||
export { EmptyState } from './EmptyState';
|
||||
export { DeleteConfirmDialog } from './DeleteConfirmDialog';
|
||||
export { UnsavedChangesGuard } from './UnsavedChangesGuard';
|
||||
export { OrgInfoTab } from './OrgInfoTab';
|
||||
|
||||
@@ -649,7 +649,17 @@ function WaitingBubbles({ visible }: { visible: CommMessage[] }) {
|
||||
if (!prev || m.timestamp > prev.timestamp) tailByPeer.set(m.peerId, m);
|
||||
}
|
||||
const waitingPeers = Array.from(tailByPeer.values()).filter(
|
||||
(m) => m.flow === "out" && (m.status === "pending" || m.status === "queued"),
|
||||
// Task #227 — also light the indicator for status="dispatched": that's
|
||||
// the platform's marker for a poll-mode delegation that's been
|
||||
// recorded into the peer's inbox but not yet picked up. Without this
|
||||
// arm, external/MCP peer threads showed an outbound bubble and then
|
||||
// dead silence until the eventual reply landed — no parity with the
|
||||
// native push-path "pending" indicator.
|
||||
(m) =>
|
||||
m.flow === "out" &&
|
||||
(m.status === "pending" ||
|
||||
m.status === "queued" ||
|
||||
m.status === "dispatched"),
|
||||
);
|
||||
if (waitingPeers.length === 0) return null;
|
||||
return (
|
||||
@@ -688,7 +698,9 @@ function WaitingBubbles({ visible }: { visible: CommMessage[] }) {
|
||||
<span className="text-[10px]">
|
||||
{m.status === "queued"
|
||||
? `${m.peerName} is busy — reply will arrive when they're free`
|
||||
: `Waiting for ${m.peerName}…`}
|
||||
: m.status === "dispatched"
|
||||
? `Queued — ${m.peerName} will pick up on next poll`
|
||||
: `Waiting for ${m.peerName}…`}
|
||||
</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
@@ -41,6 +41,19 @@ describe("inferA2AErrorHint", () => {
|
||||
expect(inferA2AErrorHint("RuntimeException in tool call")).toMatch(/runtime threw an exception/);
|
||||
});
|
||||
|
||||
it("points at the Activity tab (the real in-product logs surface), not 'workspace/container logs' (internal#212)", () => {
|
||||
// Pre-#212 these hints sent users to "workspace logs" / "container
|
||||
// logs" — neither has a UI affordance in the canvas. Activity tab
|
||||
// is the in-product surface where the full row lives. Lock the
|
||||
// copy so a future refactor cannot re-introduce the dangling
|
||||
// pointer.
|
||||
expect(inferA2AErrorHint("Agent error: boom")).toMatch(/Activity tab/);
|
||||
expect(inferA2AErrorHint("some completely novel error nobody has matched yet")).toMatch(/Activity tab/);
|
||||
// And the two strings together must not regress to the old text.
|
||||
expect(inferA2AErrorHint("Agent error: boom")).not.toMatch(/container logs/);
|
||||
expect(inferA2AErrorHint("some novel error")).not.toMatch(/workspace logs/);
|
||||
});
|
||||
|
||||
it("recognises peer-unreachable cases (Activity-tab originals)", () => {
|
||||
expect(inferA2AErrorHint("workspace not found")).toMatch(/can't be reached/);
|
||||
expect(inferA2AErrorHint("not accessible")).toMatch(/can't be reached/);
|
||||
@@ -53,7 +66,8 @@ describe("inferA2AErrorHint", () => {
|
||||
|
||||
it("returns a generic fallback for unrecognised text", () => {
|
||||
const hint = inferA2AErrorHint("some completely novel error nobody has matched yet");
|
||||
expect(hint).toMatch(/Check the workspace logs|delivery failure/);
|
||||
// Fallback now sends the user to the Activity tab (post-#212).
|
||||
expect(hint).toMatch(/Activity tab|delivery failure/);
|
||||
});
|
||||
|
||||
it("Claude SDK wedge wins over the more general timeout pattern", () => {
|
||||
|
||||
@@ -38,7 +38,11 @@ export function inferA2AErrorHint(detail: string): string {
|
||||
return "The connection to the remote agent dropped before a reply arrived. Usually a transient network blip — retry once. If it repeats, the remote container may have crashed mid-request; check its logs.";
|
||||
}
|
||||
if (t.includes("agent error") || t.includes("exception")) {
|
||||
return "The remote agent's runtime threw an exception. Check the workspace's container logs for the traceback. Restart usually clears transient runtime crashes.";
|
||||
// internal#212 closeout: end users have no "container logs" surface
|
||||
// in the canvas; the Activity tab IS the user-visible logs surface
|
||||
// (full row carries request/response body + error_detail). Point
|
||||
// there so the hint is actionable from inside the product.
|
||||
return "The remote agent's runtime threw an exception. Open the Activity tab for the full row (request body, response, error_detail) — Restart usually clears transient runtime crashes.";
|
||||
}
|
||||
if (
|
||||
t.includes("not found") ||
|
||||
@@ -50,5 +54,9 @@ export function inferA2AErrorHint(detail: string): string {
|
||||
if (detail === "") {
|
||||
return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). A workspace restart is the safe first move.";
|
||||
}
|
||||
return "The remote agent reported a delivery failure. Check the workspace logs or try restarting.";
|
||||
// internal#212 closeout: "workspace logs" pointed at a tab that does
|
||||
// not exist — Activity tab is the in-product logs surface. Keep the
|
||||
// hint generic enough for the unrecognised-detail fallback but point
|
||||
// the user at a real affordance.
|
||||
return "The remote agent reported a delivery failure. Open the Activity tab for the full row, or try restarting the workspace.";
|
||||
}
|
||||
|
||||
@@ -0,0 +1,178 @@
|
||||
// @vitest-environment jsdom
|
||||
//
|
||||
// Task #227 — external/MCP workspace progress UX parity.
|
||||
//
|
||||
// ws-server's `proxyA2ARequest` poll-mode short-circuit
|
||||
// (workspace-server/internal/handlers/a2a_proxy.go:402-432) returns a
|
||||
// synthetic `{status:"queued", delivery_mode:"poll", method:"message/send"}`
|
||||
// HTTP 200 within ~50ms when the target workspace is registered with
|
||||
// `delivery_mode=poll` — i.e. an operator's laptop running
|
||||
// `molecule-mcp-claude-channel`, a hermes/codex MCP bridge, or a Cursor
|
||||
// MCP client. The real agent reply arrives separately via the
|
||||
// AGENT_MESSAGE WebSocket event after the agent's next
|
||||
// `wait_for_message` poll (could be 1s, could be 60s).
|
||||
//
|
||||
// Pre-#227 behaviour: useChatSend treated the queued-200 as a successful
|
||||
// round-trip — extractReplyText returned "", no agent bubble was
|
||||
// created, `releaseSendGuards` flipped `sending` off, and the user saw
|
||||
// dead silence between their user bubble and the eventual reply with
|
||||
// NO progress indicator. That's the user-reported gap this task fixes.
|
||||
//
|
||||
// These tests pin the new behaviour: on a queued-200, the hook MUST NOT
|
||||
// call onAgentMessage (no empty bubble) AND MUST NOT call
|
||||
// releaseSendGuards (spinner persists). The eventual AGENT_MESSAGE WS
|
||||
// event is what clears the spinner — that path is covered by
|
||||
// useChatSocket.test.tsx already.
|
||||
|
||||
import { describe, it, expect, vi, beforeEach } from "vitest";
|
||||
import { renderHook, act } from "@testing-library/react";
|
||||
|
||||
// Capture the api.post invocations + control responses per-test.
|
||||
const apiPostMock = vi.fn<
|
||||
(url: string, body?: unknown, opts?: unknown) => Promise<unknown>
|
||||
>();
|
||||
vi.mock("@/lib/api", () => ({
|
||||
api: {
|
||||
post: (url: string, body?: unknown, opts?: unknown) =>
|
||||
apiPostMock(url, body, opts),
|
||||
get: vi.fn(),
|
||||
},
|
||||
}));
|
||||
|
||||
// uploads — tests don't go through the upload path; stub the helpers
|
||||
// useChatSend imports so the module loads.
|
||||
vi.mock("../../uploads", () => ({
|
||||
uploadChatFiles: vi.fn(),
|
||||
FileTooLargeError: class FileTooLargeError extends Error {},
|
||||
}));
|
||||
|
||||
// types — intentionally NOT mocked: the real module (including the
|
||||
// createMessage helper) is used unchanged; only the uploads stub above is needed.
|
||||
import { useChatSend } from "../useChatSend";
|
||||
|
||||
beforeEach(() => {
|
||||
apiPostMock.mockReset();
|
||||
});
|
||||
|
||||
describe("useChatSend — poll-mode (external/MCP) queued-200 handling — task #227", () => {
  // Builds a fresh copy of the synthetic poll-mode envelope for each test so
  // no object is shared between cases. `extra` lets a test add the `method`
  // field that the full short-circuit response carries.
  const queuedReply = (extra: Record<string, string> = {}) => ({
    status: "queued",
    delivery_mode: "poll",
    ...extra,
  });

  it("does NOT call onAgentMessage when the synthetic {status:'queued'} response lands (no empty bubble)", async () => {
    // Full envelope shape, including `method`, as the header comment of this
    // file describes it.
    apiPostMock.mockResolvedValueOnce(queuedReply({ method: "message/send" }));

    const userSpy = vi.fn();
    const agentSpy = vi.fn();

    const hook = renderHook(() =>
      useChatSend("ws-poll-target", {
        getHistoryMessages: () => [],
        onUserMessage: userSpy,
        onAgentMessage: agentSpy,
      }),
    );

    await act(async () => {
      await hook.result.current.sendMessage("hello external workspace");
      // One extra microtask so any chained continuation has run.
      await Promise.resolve();
    });

    // The user's own bubble is unconditional.
    expect(userSpy).toHaveBeenCalledTimes(1);
    // Contract under test: a queued envelope carries no reply text, so the
    // hook must not synthesise an (empty) agent bubble for it.
    expect(agentSpy).not.toHaveBeenCalled();
  });

  it("keeps `sending` true after a queued-200 — the spinner must persist until the real AGENT_MESSAGE arrives", async () => {
    apiPostMock.mockResolvedValueOnce(queuedReply({ method: "message/send" }));

    const hook = renderHook(() =>
      useChatSend("ws-poll-target", {
        getHistoryMessages: () => [],
      }),
    );

    await act(async () => {
      await hook.result.current.sendMessage("waiting for the operator laptop");
      await Promise.resolve();
    });

    // `sending` drives the spinner. Clearing it on a queued-200 is exactly
    // the behaviour task #227 removes: the spinner would collapse before the
    // agent has even seen the message.
    expect(hook.result.current.sending).toBe(true);
  });

  it("ALSO keeps `sending` true even after a follow-up microtask flush — guards against an accidental late release", async () => {
    // Defence in depth: no chained .then / .finally may release the send
    // guards on the queued path. This envelope deliberately omits `method`.
    apiPostMock.mockResolvedValueOnce(queuedReply());

    const hook = renderHook(() =>
      useChatSend("ws-poll-target", {
        getHistoryMessages: () => [],
      }),
    );

    await act(async () => {
      await hook.result.current.sendMessage("late-release-guard");
      // Drain several microtask ticks before re-asserting.
      for (let tick = 0; tick < 3; tick += 1) {
        await Promise.resolve();
      }
    });

    expect(hook.result.current.sending).toBe(true);
  });

  it("push-mode (real reply parts) still flips sending=false + creates an agent bubble — non-regression for the default path", async () => {
    // Non-regression for the default path: a body with real `result.parts`
    // must still produce an agent bubble and end the `sending` state. This
    // catches an over-eager early return that would treat every response
    // like the queued envelope.
    apiPostMock.mockResolvedValueOnce({
      result: {
        parts: [{ kind: "text", text: "hi from native workspace" }],
      },
    });

    const agentSpy = vi.fn();
    const hook = renderHook(() =>
      useChatSend("ws-native-push", {
        getHistoryMessages: () => [],
        onAgentMessage: agentSpy,
      }),
    );

    await act(async () => {
      await hook.result.current.sendMessage("native push test");
      await Promise.resolve();
    });

    expect(agentSpy).toHaveBeenCalledTimes(1);
    const delivered = agentSpy.mock.calls[0][0] as {
      role: string;
      content: string;
    };
    expect(delivered.role).toBe("agent");
    expect(delivered.content).toBe("hi from native workspace");
    expect(hook.result.current.sending).toBe(false);
  });
});
|
||||
@@ -116,6 +116,77 @@ describe("useChatSocket — surface error_detail to onSendError (internal#212)",
|
||||
expect(reason.length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
// Task #227 — external/MCP (poll-mode) workspace progress UX.
|
||||
//
|
||||
// ws-server's `proxyA2ARequest` poll-mode short-circuit fires the
|
||||
// ACTIVITY_LOGGED a2a_receive with status="ok" and NO duration_ms (no
|
||||
// reply yet — the request is queued for the agent's next poll). Before
|
||||
// task #227 the (status==="ok" && durationMs) guard silently dropped
|
||||
// this row, so the chat UI had ZERO progress signal between "user
|
||||
// typed" and "agent eventually polled and replied". Lock the queued
|
||||
// line in so future refactors don't regress to the silent-drop state.
|
||||
it("emits a 'queued — will pick up on next poll' activity line when a2a_receive status=ok has no duration_ms (poll-mode)", () => {
|
||||
const onActivityLog = vi.fn();
|
||||
renderHook(() =>
|
||||
useChatSocket("ws-self", {
|
||||
onActivityLog,
|
||||
}),
|
||||
);
|
||||
|
||||
expect(capturedHandler).not.toBeNull();
|
||||
act(() => {
|
||||
capturedHandler!({
|
||||
event: "ACTIVITY_LOGGED",
|
||||
workspace_id: "ws-self",
|
||||
payload: {
|
||||
activity_type: "a2a_receive",
|
||||
method: "message/send",
|
||||
status: "ok",
|
||||
target_id: "ws-self",
|
||||
// No duration_ms — this is the queued-for-poll signal.
|
||||
},
|
||||
timestamp: "2026-05-20T00:00:00Z",
|
||||
});
|
||||
});
|
||||
|
||||
expect(onActivityLog).toHaveBeenCalledTimes(1);
|
||||
const line = onActivityLog.mock.calls[0][0] as string;
|
||||
// The line MUST be present (not the empty-string silent-drop pattern)
|
||||
// and MUST mention the queued state so the user has actionable signal.
|
||||
expect(line.length).toBeGreaterThan(0);
|
||||
expect(line.toLowerCase()).toMatch(/queued|poll/);
|
||||
});
|
||||
|
||||
// Pair with the above: poll-mode acknowledgement must NOT prematurely
|
||||
// call onSendComplete — the spinner has to stay up until the actual
|
||||
// AGENT_MESSAGE reply lands. (The reply-success path with duration_ms
|
||||
// still calls onSendComplete; that's the push-mode case.)
|
||||
it("does NOT call onSendComplete on a poll-mode queued a2a_receive (spinner must persist)", () => {
|
||||
const onSendComplete = vi.fn();
|
||||
renderHook(() =>
|
||||
useChatSocket("ws-self", {
|
||||
onSendComplete,
|
||||
}),
|
||||
);
|
||||
|
||||
act(() => {
|
||||
capturedHandler!({
|
||||
event: "ACTIVITY_LOGGED",
|
||||
workspace_id: "ws-self",
|
||||
payload: {
|
||||
activity_type: "a2a_receive",
|
||||
method: "message/send",
|
||||
status: "ok",
|
||||
target_id: "ws-self",
|
||||
// No duration_ms.
|
||||
},
|
||||
timestamp: "2026-05-20T00:00:00Z",
|
||||
});
|
||||
});
|
||||
|
||||
expect(onSendComplete).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("ignores errors targeted at a different workspace's peer", () => {
|
||||
// Defense against a race where the WS hub fans out to all clients —
|
||||
// each chat panel must only react when target_id matches its own
|
||||
|
||||
@@ -22,6 +22,28 @@ interface A2AResponse {
|
||||
parts?: A2APart[];
|
||||
artifacts?: Array<{ parts: A2APart[] }>;
|
||||
};
|
||||
/** Set by ws-server's poll-mode short-circuit in `proxyA2ARequest`
|
||||
* (a2a_proxy.go:416-431) when the target workspace is registered as
|
||||
* `delivery_mode=poll` — e.g. an operator's laptop running
|
||||
* `molecule-mcp-claude-channel`, a hermes/codex MCP bridge, or a
|
||||
* Cursor MCP client. The HTTP 200 carries the synthetic envelope
|
||||
* `{status:"queued", delivery_mode:"poll", method:"message/send"}`
|
||||
* immediately (~50ms), BEFORE the agent has produced a reply.
|
||||
*
|
||||
* Task #227 routing: when this field is "queued" the caller must NOT
|
||||
* treat the 200 as "agent done" — there are no `result.parts` yet
|
||||
* (the reply will arrive separately via the AGENT_MESSAGE WS event
|
||||
* after the agent's next poll). Keep the spinner up; the eventual
|
||||
* AGENT_MESSAGE flips `sending` off via the existing useChatSocket
|
||||
* `onSendComplete` path. Without this distinction the spinner
|
||||
* disappeared immediately and external/MCP workspaces had no progress
|
||||
* UX between send and reply. */
|
||||
status?: string;
|
||||
/** Companion to `status` — "poll" when the queued short-circuit fired.
|
||||
* Defensive: we key the poll-mode-skip decision on status==="queued"
|
||||
* (the canonical signal) rather than on this field, but it's surfaced
|
||||
* here so future debugging / tests can assert on the full envelope. */
|
||||
delivery_mode?: string;
|
||||
}
|
||||
|
||||
export function extractReplyText(resp: A2AResponse): string {
|
||||
@@ -195,6 +217,30 @@ export function useChatSend(workspaceId: string, options: UseChatSendOptions) {
|
||||
sendInFlightRef.current = false;
|
||||
return;
|
||||
}
|
||||
// Task #227 — poll-mode (external/MCP workspace) queued-200
|
||||
// short-circuit. ws-server's `proxyA2ARequest` returns
|
||||
// `{status:"queued", delivery_mode:"poll", ...}` immediately
|
||||
// when the target has no URL (delivery_mode=poll), BEFORE the
|
||||
// agent has produced any reply. There is no `result.parts`
|
||||
// payload here — the actual reply will arrive separately via
|
||||
// the AGENT_MESSAGE WebSocket event after the agent's next
|
||||
// `wait_for_message` poll.
|
||||
//
|
||||
// Keep the spinner up by deliberately NOT calling
|
||||
// releaseSendGuards: the user-facing "thinking" state must
|
||||
// persist until the AGENT_MESSAGE lands (handled by the
|
||||
// useChatSocket `onAgentMessage`/`onSendComplete` path) or an
|
||||
// explicit error fires (`onSendError` from an ACTIVITY_LOGGED
|
||||
// status="error"). Don't synthesise an empty agent bubble.
|
||||
//
|
||||
// sendInFlightRef stays true intentionally — it's the dedup
|
||||
// guard for the user typing two messages back-to-back; for
|
||||
// poll mode the second message would race the first agent's
|
||||
// reply, so blocking is correct (matches push-mode behaviour
|
||||
// where `sending` blocks the textarea).
|
||||
if (resp?.status === "queued") {
|
||||
return;
|
||||
}
|
||||
const replyText = extractReplyText(resp);
|
||||
const replyFiles = extractFilesFromTask(
|
||||
(resp?.result ?? {}) as Record<string, unknown>,
|
||||
|
||||
@@ -62,6 +62,25 @@ export function useChatSocket(
|
||||
line = `← ${targetName} responded (${sec}s)`;
|
||||
const own = (targetId || msg.workspace_id) === workspaceId;
|
||||
if (own) callbacksRef.current.onSendComplete?.();
|
||||
} else if (status === "ok" && !durationMs) {
|
||||
// Task #227 — poll-mode (external/MCP workspace) queued receipt.
|
||||
// ws-server `logA2AReceiveQueued` writes a "received but no
|
||||
// reply yet" row with status="ok" and NO duration_ms, then
|
||||
// immediately returns the synthetic {status:"queued"} 200 to
|
||||
// the caller. Before this branch the row was silently dropped
|
||||
// by the (status==="ok" && durationMs) guard above — leaving
|
||||
// the chat UI with zero progress signal for the entire window
|
||||
// between "user typed" and "agent eventually polled and
|
||||
// replied". Surface the queued state explicitly so the user
|
||||
// sees acknowledgement (matches the queued-delegation
|
||||
// indicator in AgentCommsPanel.WaitingBubbles).
|
||||
//
|
||||
// We intentionally do NOT call onSendComplete here: the
|
||||
// outbound is not done — only acknowledged. The MyChatPanel
|
||||
// spinner stays up until the actual AGENT_MESSAGE reply lands
|
||||
// (poll path) or an explicit error fires (which still hits
|
||||
// the status==="error" branch below).
|
||||
line = `⧗ ${targetName} queued — agent will pick up on next poll`;
|
||||
} else if (status === "error") {
|
||||
line = `⚠ ${targetName} error`;
|
||||
const own = (targetId || msg.workspace_id) === workspaceId;
|
||||
|
||||
@@ -523,6 +523,9 @@ export function buildNodesAndEdges(
|
||||
// that don't yet include these columns in the GET response.
|
||||
broadcastEnabled: ws.broadcast_enabled ?? false,
|
||||
talkToUserEnabled: ws.talk_to_user_enabled ?? true,
|
||||
// A2A delivery mode (task #227). Absent on older ws-server builds
|
||||
// — leave undefined so the chat UI's "?? 'push'" fallback applies.
|
||||
deliveryMode: ws.delivery_mode,
|
||||
},
|
||||
};
|
||||
if (hasParent) {
|
||||
|
||||
@@ -106,6 +106,28 @@ export interface WorkspaceNodeData extends Record<string, unknown> {
|
||||
* send_message_to_user / POST /notify return 403 and the canvas
|
||||
* shows a "not enabled" state with a button to re-enable. Default true. */
|
||||
talkToUserEnabled?: boolean;
|
||||
/** A2A inbound delivery mode for this workspace — "push" (default —
|
||||
* synchronous HTTP dispatch by ws-server `proxyA2ARequest`) or "poll"
|
||||
* (workspace has no URL; ws-server logs the request and the agent
|
||||
* consumes it via `wait_for_message` / GET /activity?since_id=).
|
||||
*
|
||||
* Why surfaced to the UI: poll-mode targets (external/MCP workspaces:
|
||||
* `molecule-mcp-claude-channel` on an operator laptop, hermes/codex
|
||||
* bridge clients, Cursor MCP) acknowledge a canvas `message/send` with
|
||||
* a synthetic `{status:"queued"}` 200 within ~50ms. Without this flag
|
||||
* the chat UI cannot tell that gap from a real round-trip — the
|
||||
* spinner disappears immediately and the user sees dead silence until
|
||||
* the agent eventually polls and replies via the AGENT_MESSAGE WS
|
||||
* event (could be seconds, could be minutes). Task #227 — render a
|
||||
* "queued — agent will pick up on next poll" state for poll-mode
|
||||
* sends so external/MCP workspaces have progress UX parity with
|
||||
* native runtimes (claude-code / codex / hermes / openclaw).
|
||||
*
|
||||
* Sourced from the GET /workspaces response (`delivery_mode` snake_case
|
||||
* field, mapped here in canvas-topology.ts). Absent on older platform
|
||||
* builds — that fallthrough is treated as "push" to match
|
||||
* ws-server's `lookupDeliveryMode` default. */
|
||||
deliveryMode?: string;
|
||||
}
|
||||
|
||||
export type PanelTab = "details" | "skills" | "chat" | "terminal" | "config" | "schedule" | "channels" | "files" | "memory" | "traces" | "events" | "activity" | "audit";
|
||||
|
||||
@@ -342,6 +342,16 @@ export interface WorkspaceData {
|
||||
/** Workspace ability flags (migration 20260514). */
|
||||
broadcast_enabled?: boolean;
|
||||
talk_to_user_enabled?: boolean;
|
||||
/** A2A delivery mode for inbound messages — "push" (default, synchronous
|
||||
* HTTP dispatch to `url`) or "poll" (queued to activity_logs, agent
|
||||
* picks up via `wait_for_message` / GET /activity?since_id=). Surfaced
|
||||
* in the GET /workspaces response since #2339 PR 1; older platform
|
||||
* versions return it absent so the canvas treats absent as "push" (the
|
||||
* documented default in `lookupDeliveryMode`). Used by the chat UI to
|
||||
* render an "agent will pick up on next poll" indicator instead of
|
||||
* collapsing the spinner the moment the synchronous queued-200 returns
|
||||
* (task #227 — external/MCP workspaces had no progress UX). */
|
||||
delivery_mode?: string;
|
||||
}
|
||||
|
||||
let socket: ReconnectingSocket | null = null;
|
||||
|
||||
+1
-2
@@ -37,8 +37,7 @@
|
||||
{"name": "free-beats-all", "repo": "molecule-ai/molecule-ai-org-template-free-beats-all", "ref": "main"},
|
||||
{"name": "medo-smoke", "repo": "molecule-ai/molecule-ai-org-template-medo-smoke", "ref": "main"},
|
||||
{"name": "molecule-worker-gemini", "repo": "molecule-ai/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"},
|
||||
{"name": "ux-ab-lab", "repo": "molecule-ai/molecule-ai-org-template-ux-ab-lab", "ref": "main"},
|
||||
{"name": "mock-bigorg", "repo": "molecule-ai/molecule-ai-org-template-mock-bigorg", "ref": "main"}
|
||||
{"name": "ux-ab-lab", "repo": "molecule-ai/molecule-ai-org-template-ux-ab-lab", "ref": "main"}
|
||||
]
|
||||
}
|
||||
// Triggered by Integration Tester at 2026-05-10T08:52Z
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
def workflow_on(path: Path):
    """Return the trigger section (`on:`) of the workflow file at *path*.

    PyYAML resolves scalars with YAML 1.1 rules, under which a bare ``on``
    key is loaded as the boolean ``True`` rather than the string ``"on"``,
    so both spellings are looked up.

    Returns an empty dict when the file is empty or declares no triggers, so
    callers' ``"event" in result`` checks fail as ordinary assertions instead
    of raising ``TypeError`` / ``AttributeError``.
    """
    # safe_load returns None for an empty document; treat that as "no keys".
    doc = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    return doc.get("on") or doc.get(True) or {}
|
||||
|
||||
|
||||
def test_browser_e2e_workflows_are_not_unconditional_pr_heavy_lanes():
    """Pin the trigger and gating shape of the browser E2E workflows.

    Each listed workflow must declare the ``workflow_dispatch`` and
    ``schedule`` triggers and must contain the merge-queue label gate
    (the label lookup URL and the "not in merge-queue" message).
    """
    workflows = [
        ROOT / ".gitea/workflows/e2e-chat.yml",
        ROOT / ".gitea/workflows/e2e-staging-canvas.yml",
    ]
    required_triggers = ("workflow_dispatch", "schedule")
    required_snippets = (
        "merge-queue",
        "/issues/${{ github.event.pull_request.number }}/labels",
        "PR is not in merge-queue",
    )

    for path in workflows:
        text = path.read_text(encoding="utf-8")
        # Tolerate a missing trigger section so the failure below is a
        # readable assertion rather than a TypeError on `in None`.
        events = workflow_on(path) or {}

        # Every assertion names the workflow file so a failure is actionable
        # without re-running under a debugger.
        for trigger in required_triggers:
            assert trigger in events, f"{path.name}: missing `{trigger}` trigger"
        for snippet in required_snippets:
            assert snippet in text, f"{path.name}: expected to contain {snippet!r}"
|
||||
@@ -38,7 +38,11 @@ set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
WORKFLOWS_DIR="$REPO_ROOT/.github/workflows"
|
||||
# Gitea is the SSOT for CI on molecule-core per task #347 / memory
|
||||
# reference_molecule_core_actions_gitea_only — workflows live in
|
||||
# .gitea/workflows/ exclusively. The legacy .github/workflows/ tree was
|
||||
# deleted in SSOT-Instance-4 (task #331).
|
||||
WORKFLOWS_DIR="$REPO_ROOT/.gitea/workflows"
|
||||
APPLY_SH="$SCRIPT_DIR/apply.sh"
|
||||
|
||||
if [[ ! -f "$APPLY_SH" ]]; then
|
||||
@@ -46,7 +50,7 @@ if [[ ! -f "$APPLY_SH" ]]; then
|
||||
exit 2
|
||||
fi
|
||||
if [[ ! -d "$WORKFLOWS_DIR" ]]; then
|
||||
echo "check_name_parity: missing .github/workflows at $WORKFLOWS_DIR" >&2
|
||||
echo "check_name_parity: missing .gitea/workflows at $WORKFLOWS_DIR" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
|
||||
@@ -33,12 +33,14 @@ trap '[[ -n "$TMPDIR_FOR_CASE" && -d "$TMPDIR_FOR_CASE" ]] && rm -rf "$TMPDIR_FO
|
||||
|
||||
# Build a synthetic repo at $1 with apply.sh listing $2 (one name per
|
||||
# line) as the staging required set + zero main required, then write
|
||||
# whatever .github/workflows/* files the test case adds.
|
||||
# whatever .gitea/workflows/* files the test case adds. (Pre-SSOT-4
|
||||
# this was .github/workflows; molecule-core switched to Gitea-SSOT in
|
||||
# task #331 and the script now reads from .gitea/workflows/.)
|
||||
make_fake_repo() {
|
||||
local root="$1"
|
||||
local checks="$2"
|
||||
mkdir -p "$root/tools/branch-protection"
|
||||
mkdir -p "$root/.github/workflows"
|
||||
mkdir -p "$root/.gitea/workflows"
|
||||
cat > "$root/tools/branch-protection/apply.sh" <<EOF
|
||||
#!/usr/bin/env bash
|
||||
# Stub apply.sh — only the heredoc-shaped check lists matter for the
|
||||
@@ -54,7 +56,7 @@ EOF2
|
||||
EOF
|
||||
chmod +x "$root/tools/branch-protection/apply.sh"
|
||||
# Place the script-under-test alongside its sibling apply.sh so the
|
||||
# script's REPO_ROOT walk finds the synthetic .github/workflows/.
|
||||
# script's REPO_ROOT walk finds the synthetic .gitea/workflows/.
|
||||
cp "$SCRIPT_UNDER_TEST" "$root/tools/branch-protection/check_name_parity.sh"
|
||||
}
|
||||
|
||||
@@ -67,7 +69,7 @@ run_case() {
|
||||
local expected_stderr_substring="$6"
|
||||
TMPDIR_FOR_CASE=$(mktemp -d)
|
||||
make_fake_repo "$TMPDIR_FOR_CASE" "$checks"
|
||||
printf '%s' "$workflow_yaml" > "$TMPDIR_FOR_CASE/.github/workflows/$workflow_filename"
|
||||
printf '%s' "$workflow_yaml" > "$TMPDIR_FOR_CASE/.gitea/workflows/$workflow_filename"
|
||||
local stderr_file
|
||||
stderr_file=$(mktemp)
|
||||
local actual_exit=0
|
||||
|
||||
@@ -0,0 +1,143 @@
|
||||
package db
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Regression pin for RFC internal#617 / task #335.
|
||||
//
|
||||
// The drop-runtime_image_pins migration MUST honor the care zone documented
|
||||
// in the RFC: drop the `runtime_image_pins` table but PRESERVE the column
|
||||
// `workspaces.runtime_image_digest` and its partial index
|
||||
// `idx_workspaces_runtime_image_digest`.
|
||||
//
|
||||
// This is a static-file lint, not a DB-execution test. Running the actual
|
||||
// migration is out of scope for unit tests (the migration test infra in
|
||||
// postgres_schema_migrations_test.go already proves the apply mechanism
|
||||
// works for any forward file). What we pin here is the *content shape* of
|
||||
// the new migration:
|
||||
//
|
||||
// - up.sql DROPs runtime_image_pins (the dead table)
|
||||
// - up.sql does NOT touch runtime_image_digest (the care-zone column)
|
||||
// - up.sql does NOT touch idx_workspaces_runtime_image_digest (care-zone index)
|
||||
// - down.sql recreates runtime_image_pins (idempotent rollback)
|
||||
//
|
||||
// If a future cleanup PR wants to also drop the column, it should be a
|
||||
// separate migration with its own RFC — this test catches accidental
|
||||
// scope creep at PR time, before it ships to tenant DBs.
|
||||
func TestMigration20260520_DropsRuntimeImagePins_PreservesDigestColumn(t *testing.T) {
|
||||
// Locate the migrations dir relative to this test file's package dir.
|
||||
// /workspace-server/internal/db/ → ../../migrations/
|
||||
const migDir = "../../migrations"
|
||||
const upFile = "20260520120000_drop_runtime_image_pins.up.sql"
|
||||
const downFile = "20260520120000_drop_runtime_image_pins.down.sql"
|
||||
|
||||
upPath := filepath.Join(migDir, upFile)
|
||||
downPath := filepath.Join(migDir, downFile)
|
||||
|
||||
upBytes, err := os.ReadFile(upPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read %s: %v", upPath, err)
|
||||
}
|
||||
downBytes, err := os.ReadFile(downPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read %s: %v", downPath, err)
|
||||
}
|
||||
|
||||
// Strip single-line SQL comments (`-- ...`) before assertion so the
|
||||
// rationale prose in the migration headers can mention the care-zone
|
||||
// column by name without tripping the DDL-touch guard. The guard is
|
||||
// specifically about DDL statements that act on the column.
|
||||
upDDL := stripSQLLineComments(strings.ToLower(string(upBytes)))
|
||||
downDDL := stripSQLLineComments(strings.ToLower(string(downBytes)))
|
||||
|
||||
// up.sql MUST drop the dead table.
|
||||
if !strings.Contains(upDDL, "drop table") || !strings.Contains(upDDL, "runtime_image_pins") {
|
||||
t.Errorf("up.sql must DROP TABLE runtime_image_pins; got DDL:\n%s\n(full file:\n%s)", upDDL, upBytes)
|
||||
}
|
||||
|
||||
// CARE ZONE: up.sql DDL MUST NOT touch the workspaces.runtime_image_digest
|
||||
// column or its index. A DDL statement that references either name is a
|
||||
// scope-creep defect — file a separate RFC.
|
||||
if strings.Contains(upDDL, "runtime_image_digest") {
|
||||
t.Errorf("up.sql DDL references runtime_image_digest — care-zone column must NOT be touched by this migration. See RFC internal#617 §3. DDL:\n%s\n(full file:\n%s)", upDDL, upBytes)
|
||||
}
|
||||
if strings.Contains(upDDL, "idx_workspaces_runtime_image_digest") {
|
||||
t.Errorf("up.sql DDL references idx_workspaces_runtime_image_digest — care-zone index must NOT be touched by this migration. See RFC internal#617 §3. DDL:\n%s\n(full file:\n%s)", upDDL, upBytes)
|
||||
}
|
||||
|
||||
// down.sql MUST recreate the table (rollback path).
|
||||
if !strings.Contains(downDDL, "create table") || !strings.Contains(downDDL, "runtime_image_pins") {
|
||||
t.Errorf("down.sql must CREATE TABLE runtime_image_pins (rollback path); got DDL:\n%s\n(full file:\n%s)", downDDL, downBytes)
|
||||
}
|
||||
|
||||
// down.sql DDL also must not touch the care-zone column (symmetry —
|
||||
// we never added the column in the up so we cannot drop or recreate it
|
||||
// in the down either).
|
||||
if strings.Contains(downDDL, "runtime_image_digest") {
|
||||
t.Errorf("down.sql DDL references runtime_image_digest — should be a no-op for the care-zone column. DDL:\n%s\n(full file:\n%s)", downDDL, downBytes)
|
||||
}
|
||||
}
|
||||
|
||||
// stripSQLLineComments removes `-- ...` line comments from a SQL string,
|
||||
// leaving only DDL statements + whitespace. Used by the migration-content
|
||||
// guards so descriptive prose in the migration header doesn't false-flag.
|
||||
//
|
||||
// This is intentionally minimal — does NOT handle `/* */` block comments
|
||||
// (the migration files don't use them) or string-literal embedded `--`
|
||||
// (DDL doesn't use that shape). Good enough for static-content lint.
|
||||
func stripSQLLineComments(s string) string {
	var cleaned strings.Builder
	for i, line := range strings.Split(s, "\n") {
		// Re-insert the separator that Split consumed so the line count and
		// line positions of the input are preserved.
		if i > 0 {
			cleaned.WriteByte('\n')
		}
		// Drop everything from the first `--` onward. No SQL parsing is done:
		// a `--` inside a string literal would be cut as well.
		if cut := strings.Index(line, "--"); cut >= 0 {
			line = line[:cut]
		}
		cleaned.WriteString(line)
	}
	return cleaned.String()
}
|
||||
|
||||
// TestMigration20260520_PairExists is a belt-and-braces guard that the
|
||||
// up + down files both exist and aren't empty. RunMigrations only consumes
|
||||
// the up but a missing down breaks the dev-side rollback workflow silently.
|
||||
func TestMigration20260520_PairExists(t *testing.T) {
	const migDir = "../../migrations"
	pair := [...]string{
		"20260520120000_drop_runtime_image_pins.up.sql",
		"20260520120000_drop_runtime_image_pins.down.sql",
	}
	for i := range pair {
		path := filepath.Join(migDir, pair[i])
		st, statErr := os.Stat(path)
		switch {
		case statErr != nil:
			// Missing or unreadable: report it and move on to the next file.
			t.Errorf("expected migration file %s to exist: %v", path, statErr)
		case st.Size() < 50:
			// Present but under 50 bytes is treated as effectively empty.
			t.Errorf("migration file %s is suspiciously small (%d bytes) — header comment missing?", path, st.Size())
		}
	}
}
|
||||
|
||||
// TestMigration20260520_DeadReaderIsGone pins the deletion of the dead
|
||||
// runtime_image_pin.go reader. If anyone reintroduces it (e.g., a cherry-
|
||||
// pick from a stale branch), this catches it in unit tests before it hits
|
||||
// review. The reader is provably dead under CP-as-SSOT — re-adding it
|
||||
// reopens the divergence the RFC closed.
|
||||
func TestMigration20260520_DeadReaderIsGone(t *testing.T) {
	const (
		readerPath = "../handlers/runtime_image_pin.go"
		testPath   = "../handlers/runtime_image_pin_test.go"
	)

	// present reports whether os.Stat succeeds for path. Any Stat error is
	// treated as "file is gone", which is the state this test wants.
	present := func(path string) bool {
		_, err := os.Stat(path)
		return err == nil
	}

	if present(readerPath) {
		t.Errorf("dead reader %s reappeared — RFC internal#617 retired it. If you really need a per-tenant pin path, file a follow-up RFC; do not just re-add the reader.", readerPath)
	}
	if present(testPath) {
		t.Errorf("dead reader test %s reappeared — should have been removed alongside the implementation.", testPath)
	}
}
|
||||
|
||||
@@ -49,6 +49,7 @@ import (
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/pendinguploads"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/uploads"
|
||||
)
|
||||
|
||||
// ChatFilesHandler serves file upload + download for chat. Holds a
|
||||
@@ -112,19 +113,26 @@ func (h *ChatFilesHandler) WithPendingUploads(storage pendinguploads.Storage, br
|
||||
}
|
||||
|
||||
// chatUploadMaxBytes caps the full multipart request body so a
|
||||
// malicious / runaway client can't OOM the proxy hop. 100 MB matches
|
||||
// the workspace-side limit; anything larger is rejected at the
|
||||
// network boundary before forwarding.
|
||||
// malicious / runaway client can't OOM the proxy hop. Derived from the
|
||||
// SSOT in internal/uploads (task #320): bumping the cap is one edit in
|
||||
// internal/uploads/limits.go, never a synchronized 5-surface bump.
|
||||
//
|
||||
// CANVAS_MIRROR: keep aligned with canvas/src/components/tabs/chat/
|
||||
// uploads.ts MAX_UPLOAD_BYTES. The canvas constant exists so the
|
||||
// pre-flight size check can fail immediately (before network I/O)
|
||||
// with the actionable "File too large (got X MB) — limit is 100MB"
|
||||
// message. Bumping one side without the other yields the wrong-reason
|
||||
// surface that motivated this constant pair (CTO 2026-05-19 directive
|
||||
// on forensic a99ab0a1: file-size cause MUST surface as file-size,
|
||||
// NOT as a downstream timeout).
|
||||
const chatUploadMaxBytes = 100 * 1024 * 1024
|
||||
// CANVAS_MIRROR: canvas/src/components/tabs/chat/uploads.ts reads the
|
||||
// live value via GET /uploads/limits at app init (Phase 2 follow-up;
|
||||
// today still a literal 100 MB pinned by an assertion test, which the
|
||||
// migration PR will replace). The canvas constant exists so the
|
||||
// pre-flight size check can fail immediately (before network I/O) with
|
||||
// the actionable "File too large (got X MB) — limit is 100MB" message.
|
||||
// Bumping one side without the other yields the wrong-reason surface
|
||||
// that motivated this constant pair (CTO 2026-05-19 directive on
|
||||
// forensic a99ab0a1: file-size cause MUST surface as file-size, NOT as
|
||||
// a downstream timeout). Once the canvas migrates to the live fetch,
|
||||
// drift becomes structurally impossible.
|
||||
//
|
||||
// Why "var" instead of "const": Go disallows initializing a const from
|
||||
// a function call. The DefaultUploadLimits() body is pure literals so
|
||||
// the runtime cost is zero.
|
||||
var chatUploadMaxBytes = uploads.DefaultUploadLimits().PerRequestBytes
|
||||
|
||||
// resolveWorkspaceForwardCreds resolves the workspace's URL +
|
||||
// platform_inbound_secret for an /internal/* forward, applying
|
||||
@@ -620,7 +628,7 @@ func (h *ChatFilesHandler) uploadPollMode(c *gin.Context, ctx context.Context, w
|
||||
prepReady := make([]prepped, 0, len(headers))
|
||||
items := make([]pendinguploads.PutItem, 0, len(headers))
|
||||
for _, fh := range headers {
|
||||
if fh.Size > pendinguploads.MaxFileBytes {
|
||||
if fh.Size > int64(pendinguploads.MaxFileBytes) {
|
||||
log.Printf("chat_files uploadPollMode: per-file cap exceeded for %s: %s (%d bytes)",
|
||||
workspaceID, fh.Filename, fh.Size)
|
||||
c.JSON(http.StatusRequestEntityTooLarge, gin.H{
|
||||
|
||||
@@ -586,7 +586,7 @@ func TestPollUpload_PerFileCapPreStorage_413(t *testing.T) {
|
||||
// next bumped above MaxFileBytes (e.g., RFC for the SSOT
|
||||
// GET /uploads/limits endpoint reshapes the layering) this test
|
||||
// can run again.
|
||||
if pendinguploads.MaxFileBytes >= chatUploadMaxBytes {
|
||||
if int64(pendinguploads.MaxFileBytes) >= chatUploadMaxBytes {
|
||||
t.Skipf("per-file cap %d >= body cap %d; the body MaxBytesReader 400s before the per-file 413 branch is reachable. Re-enable when body cap > per-file cap.",
|
||||
pendinguploads.MaxFileBytes, chatUploadMaxBytes)
|
||||
}
|
||||
@@ -686,7 +686,7 @@ func TestPollUpload_AtomicRollbackOnSecondFileTooLarge(t *testing.T) {
|
||||
// a real Postgres without depending on the body-cap arithmetic
|
||||
// here. Re-enable this handler-level test when body cap exceeds
|
||||
// per-file cap again.
|
||||
if pendinguploads.MaxFileBytes >= chatUploadMaxBytes {
|
||||
if int64(pendinguploads.MaxFileBytes) >= chatUploadMaxBytes {
|
||||
t.Skipf("per-file cap %d >= body cap %d; the body MaxBytesReader 400s before the per-file 413 branch is reachable. Storage-level atomicity covered by integration test. Re-enable when body cap > per-file cap.",
|
||||
pendinguploads.MaxFileBytes, chatUploadMaxBytes)
|
||||
}
|
||||
|
||||
@@ -122,8 +122,22 @@ func (h *DelegationHandler) Delegate(c *gin.Context) {
|
||||
|
||||
// #548 — prevent self-delegation: a workspace delegating to itself
|
||||
// acquires _run_lock twice on the same mutex, deadlocking permanently.
|
||||
//
|
||||
// #383 — the error message is the agent-visible string when this 400
|
||||
// fires on the SDK's _delegate_sync_via_polling path. The previous
|
||||
// terse "self-delegation not permitted" was correct but indistinct
|
||||
// from a transient rate-limit or auth failure, so the LLM would
|
||||
// re-attempt every 2-3s in a tight loop (chloe-dong tenant external
|
||||
// workspace, 2026-05-20). The expanded message is explicit about
|
||||
// (a) what just happened, (b) why it cannot succeed, (c) what to do
|
||||
// instead — so the agent's retry heuristic recognizes the path as
|
||||
// terminal and stops.
|
||||
if sourceID == body.TargetID {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "self-delegation not permitted"})
|
||||
c.JSON(http.StatusBadRequest, gin.H{
|
||||
"error": "self-delegation not permitted",
|
||||
"reason": "the source workspace and target workspace are the same; you cannot delegate a task to yourself",
|
||||
"hint": "do the work yourself, or pick a different peer via list_peers — retrying with the same target_id will fail every time",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -256,24 +256,43 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) {
|
||||
peers = append(peers, siblings...)
|
||||
}
|
||||
|
||||
// Children
|
||||
// Children — exclude self defensively. A child row whose parent_id
|
||||
// equals the requesting workspaceID can never legitimately be the
|
||||
// caller (a workspace can't be its own child), but a future data-
|
||||
// integrity defect (e.g. self-loop introduced by a buggy register
|
||||
// path) would otherwise smuggle the caller back into its own peer
|
||||
// list. The agent then attempts `delegate_task(<own_id>)`, which
|
||||
// either deadlocks _run_lock (sync path) or hits the platform's
|
||||
// self-delegation 400 in a tight loop (#383). The `w.id != $2`
|
||||
// clause makes self-delegation-via-peer-list impossible regardless
|
||||
// of DB state.
|
||||
children, _ := queryPeerMaps(`
|
||||
SELECT w.id, w.name, COALESCE(w.role, ''), w.tier, w.status,
|
||||
COALESCE(w.agent_card, 'null'::jsonb), COALESCE(w.url, ''),
|
||||
w.parent_id, w.active_tasks
|
||||
FROM workspaces w WHERE w.parent_id = $1 AND w.status != 'removed'`, workspaceID)
|
||||
FROM workspaces w WHERE w.parent_id = $1 AND w.id != $2 AND w.status != 'removed'`,
|
||||
workspaceID, workspaceID)
|
||||
peers = append(peers, children...)
|
||||
|
||||
// Parent
|
||||
// Parent — same defense-in-depth. A workspace whose parent_id points
|
||||
// to itself is data corruption, but the peer-list endpoint must not
|
||||
// propagate that corruption back to the agent as a "peer who is also
|
||||
// you" entry.
|
||||
if parentID.Valid {
|
||||
parent, _ := queryPeerMaps(`
|
||||
SELECT w.id, w.name, COALESCE(w.role, ''), w.tier, w.status,
|
||||
COALESCE(w.agent_card, 'null'::jsonb), COALESCE(w.url, ''),
|
||||
w.parent_id, w.active_tasks
|
||||
FROM workspaces w WHERE w.id = $1 AND w.status != 'removed'`, parentID.String)
|
||||
FROM workspaces w WHERE w.id = $1 AND w.id != $2 AND w.status != 'removed'`,
|
||||
parentID.String, workspaceID)
|
||||
peers = append(peers, parent...)
|
||||
}
|
||||
|
||||
// #383 final-line defense: even if a future code path adds a query
|
||||
// that doesn't filter self, strip the caller's own row before
|
||||
// returning. Cheap O(n) over a peer set bounded at <50 rows.
|
||||
peers = excludeSelfFromPeers(peers, workspaceID)
|
||||
|
||||
peers = filterPeersByQuery(peers, c.Query("q"))
|
||||
|
||||
if peers == nil {
|
||||
@@ -282,6 +301,32 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, peers)
|
||||
}
|
||||
|
||||
// excludeSelfFromPeers strips any peer entry whose ``id`` equals
|
||||
// ``workspaceID`` (the caller's own row). Final-line defense for #383
|
||||
// (self-delegation 400-loop on external workspaces): a peer-list that
|
||||
// includes the requester's own row is the root mechanism by which an
|
||||
// agent ends up delegating to itself. The pre-DB filters in Peers
|
||||
// already enforce `w.id != $caller` on each branch; this function
|
||||
// guarantees the contract holds regardless of which query path
|
||||
// returned the row, including future ones added without a self-filter.
|
||||
//
|
||||
// O(n) over a peer set bounded at <50 rows per `Peers` comment — well
|
||||
// below the hot-path overhead of the existing filterPeersByQuery.
|
||||
func excludeSelfFromPeers(peers []map[string]interface{}, workspaceID string) []map[string]interface{} {
	// Nothing to filter: hand the input back untouched (nil stays nil).
	if len(peers) == 0 {
		return peers
	}

	// A fresh slice is always allocated for non-empty input, so the result is
	// non-nil even when every row is filtered out, and the caller's backing
	// array is never modified.
	kept := make([]map[string]interface{}, 0, len(peers))
	for _, peer := range peers {
		// Only a row whose "id" is actually a string can collide with the
		// caller. Checking `ok` keeps rows with a missing or non-string id
		// from being mistaken for "self" when workspaceID is the empty
		// string (the zero value a failed assertion would otherwise yield).
		if id, ok := peer["id"].(string); ok && id == workspaceID {
			continue
		}
		kept = append(kept, peer)
	}
	return kept
}
|
||||
|
||||
// filterPeersByQuery returns peers whose name or role case-insensitively
|
||||
// contains q. Whitespace-trimmed empty q is a no-op (returns input unchanged).
|
||||
func filterPeersByQuery(peers []map[string]interface{}, q string) []map[string]interface{} {
|
||||
|
||||
@@ -125,14 +125,14 @@ func TestPeers_WithParent(t *testing.T) {
|
||||
WillReturnRows(sqlmock.NewRows(peerCols).
|
||||
AddRow("ws-sibling-2", "Sibling Two", "worker", 1, "online", []byte("null"), "http://localhost:8002", "ws-parent", 0))
|
||||
|
||||
// Expect children query
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.status").
|
||||
WithArgs("ws-sibling-1").
|
||||
// Expect children query — #383 added explicit `w.id != $2` self-filter
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status").
|
||||
WithArgs("ws-sibling-1", "ws-sibling-1").
|
||||
WillReturnRows(sqlmock.NewRows(peerCols))
|
||||
|
||||
// Expect parent query
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.status").
|
||||
WithArgs("ws-parent").
|
||||
// Expect parent query — #383 added explicit `w.id != $2` self-filter
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.id != \\$2 AND w.status").
|
||||
WithArgs("ws-parent", "ws-sibling-1").
|
||||
WillReturnRows(sqlmock.NewRows(peerCols).
|
||||
AddRow("ws-parent", "Parent PM", "manager", 2, "online", []byte("null"), "http://localhost:8001", nil, 1))
|
||||
|
||||
@@ -228,9 +228,9 @@ func TestPeers_RootWorkspace_NoPeers(t *testing.T) {
|
||||
WithArgs("ws-root-alone").
|
||||
WillReturnRows(sqlmock.NewRows(peerCols))
|
||||
|
||||
// Children — none
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1").
|
||||
WithArgs("ws-root-alone").
|
||||
// Children — none. #383 added explicit `w.id != $2` self-filter.
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2").
|
||||
WithArgs("ws-root-alone", "ws-root-alone").
|
||||
WillReturnRows(sqlmock.NewRows(peerCols))
|
||||
|
||||
// No parent query since parent_id is NULL
|
||||
@@ -282,12 +282,14 @@ func peersFilterFixture(t *testing.T) (*DiscoveryHandler, sqlmock.Sqlmock) {
|
||||
AddRow("ws-alpha", "Alpha Researcher", "researcher", 1, "online", []byte("null"), "http://a", "ws-pm", 0).
|
||||
AddRow("ws-beta", "Beta Designer", "designer", 1, "online", []byte("null"), "http://b", "ws-pm", 0))
|
||||
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.status").
|
||||
WithArgs("ws-self").
|
||||
// #383 — children query gained explicit `w.id != $2` self-filter.
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status").
|
||||
WithArgs("ws-self", "ws-self").
|
||||
WillReturnRows(sqlmock.NewRows(cols))
|
||||
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.status").
|
||||
WithArgs("ws-pm").
|
||||
// #383 — parent query gained explicit `w.id != $2` self-filter.
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.id != \\$2 AND w.status").
|
||||
WithArgs("ws-pm", "ws-self").
|
||||
WillReturnRows(sqlmock.NewRows(cols).
|
||||
AddRow("ws-pm", "PM Workspace", "manager", 2, "online", []byte("null"), "http://pm", nil, 1))
|
||||
|
||||
@@ -966,8 +968,9 @@ func TestPeers_DevModeFailOpen_AllowsBearerlessRequest(t *testing.T) {
|
||||
mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id IS NULL AND w.id").
|
||||
WithArgs("ws-dev").
|
||||
WillReturnRows(sqlmock.NewRows(peerCols))
|
||||
mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id = \\$1 AND w.status").
|
||||
WithArgs("ws-dev").
|
||||
// #383 — children query gained explicit `w.id != $2` self-filter.
|
||||
mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status").
|
||||
WithArgs("ws-dev", "ws-dev").
|
||||
WillReturnRows(sqlmock.NewRows(peerCols))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
@@ -1030,3 +1033,183 @@ func TestPeers_DevModeFailOpen_ClosedInProduction(t *testing.T) {
|
||||
t.Fatalf("expected 401 in production, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== Peers — #383 self never appears in result ====================
|
||||
|
||||
// TestPeers_ExcludeSelf_DefenseInDepth verifies the final-line filter in
|
||||
// Peers strips any row whose id matches the caller. The pre-DB SQL filters
|
||||
// already do this, but a future code path that omits the `w.id != $caller`
|
||||
// clause must not be able to smuggle a self-row through. This test
|
||||
// simulates that future-defect case by mocking the children query to
|
||||
// (incorrectly) return a row whose id matches the caller, and asserts the
|
||||
// final filter still drops it.
|
||||
//
|
||||
// Root cause class for #383: an agent that sees its own row in /peers
|
||||
// proceeds to delegate_task to itself, hitting the platform's
|
||||
// self-delegation 400 in a tight loop. The fix in discovery.go is
|
||||
// defense-in-depth: even if the SQL filter regresses, this handler-level
|
||||
// filter prevents the 400-loop from materializing.
|
||||
func TestPeers_ExcludeSelf_DefenseInDepth(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
const selfID = "ws-xiaodong"
|
||||
|
||||
// parent_id lookup — workspace has a parent.
|
||||
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
|
||||
WithArgs(selfID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow("ws-parent"))
|
||||
|
||||
peerCols := []string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"}
|
||||
|
||||
// Siblings — returns one legitimate sibling. The SQL filter excludes
|
||||
// self at the source.
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2").
|
||||
WithArgs("ws-parent", selfID).
|
||||
WillReturnRows(sqlmock.NewRows(peerCols).
|
||||
AddRow("ws-sibling", "Sibling", "worker", 1, "online", []byte("null"), "http://localhost:8002", "ws-parent", 0))
|
||||
|
||||
// Children — simulates the data-integrity defect class: the DB
|
||||
// (incorrectly) returns the caller's own row in the children set.
|
||||
// In real production this would require a workspace whose
|
||||
// parent_id points to itself — corruption only, but the handler
|
||||
// must not propagate it.
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status").
|
||||
WithArgs(selfID, selfID).
|
||||
WillReturnRows(sqlmock.NewRows(peerCols).
|
||||
AddRow(selfID, "Self As Child", "worker", 1, "online", []byte("null"), "http://localhost:8001", selfID, 0).
|
||||
AddRow("ws-child", "Real Child", "worker", 1, "online", []byte("null"), "http://localhost:8003", selfID, 0))
|
||||
|
||||
// Parent — explicit `w.id != $2` clause so the parent path is also
|
||||
// self-filtered. parentID.String = "ws-parent" != selfID, so the
|
||||
// row is included.
|
||||
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.id != \\$2 AND w.status").
|
||||
WithArgs("ws-parent", selfID).
|
||||
WillReturnRows(sqlmock.NewRows(peerCols).
|
||||
AddRow("ws-parent", "Parent", "manager", 2, "online", []byte("null"), "http://localhost:8004", nil, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: selfID}}
|
||||
c.Request = httptest.NewRequest("GET", "/registry/"+selfID+"/peers", nil)
|
||||
|
||||
handler.Peers(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
var peers []map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &peers); err != nil {
|
||||
t.Fatalf("failed to parse response: %v", err)
|
||||
}
|
||||
|
||||
// The defense-in-depth filter must drop the self row even though
|
||||
// the (mocked-defective) children query returned it.
|
||||
for _, p := range peers {
|
||||
if id, _ := p["id"].(string); id == selfID {
|
||||
t.Fatalf("peer list contains caller's own id %q — self-delegation defense regressed; full list: %+v", selfID, peers)
|
||||
}
|
||||
}
|
||||
|
||||
// Sanity: the three legitimate peers (sibling, real child, parent)
|
||||
// must all be present. Catches an over-aggressive filter that
|
||||
// strips legitimate rows.
|
||||
expectedIDs := map[string]bool{"ws-sibling": false, "ws-child": false, "ws-parent": false}
|
||||
for _, p := range peers {
|
||||
if id, _ := p["id"].(string); id != "" {
|
||||
if _, ok := expectedIDs[id]; ok {
|
||||
expectedIDs[id] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
for id, found := range expectedIDs {
|
||||
if !found {
|
||||
t.Errorf("legitimate peer %q missing from response; got %+v", id, peers)
|
||||
}
|
||||
}
|
||||
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestExcludeSelfFromPeers_Unit exercises the helper directly so the
|
||||
// defense-in-depth contract is asserted independently of SQL mocking.
|
||||
// Pure-function tests run in microseconds and pin the filter shape
|
||||
// (empty input, no-match passthrough, single-row drop, multi-row drop,
|
||||
// preserves order) so future edits to the helper can't silently
|
||||
// regress to "returns input unchanged".
|
||||
func TestExcludeSelfFromPeers_Unit(t *testing.T) {
|
||||
t.Run("empty input returns empty slice", func(t *testing.T) {
|
||||
out := excludeSelfFromPeers(nil, "ws-self")
|
||||
if len(out) != 0 {
|
||||
t.Errorf("expected empty, got %+v", out)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("no self in list passes through unchanged", func(t *testing.T) {
|
||||
in := []map[string]interface{}{
|
||||
{"id": "ws-a", "name": "A"},
|
||||
{"id": "ws-b", "name": "B"},
|
||||
}
|
||||
out := excludeSelfFromPeers(in, "ws-self")
|
||||
if len(out) != 2 {
|
||||
t.Fatalf("expected 2, got %d (%+v)", len(out), out)
|
||||
}
|
||||
if out[0]["id"] != "ws-a" || out[1]["id"] != "ws-b" {
|
||||
t.Errorf("order not preserved: %+v", out)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("self row dropped, others preserved", func(t *testing.T) {
|
||||
in := []map[string]interface{}{
|
||||
{"id": "ws-a", "name": "A"},
|
||||
{"id": "ws-self", "name": "Me"},
|
||||
{"id": "ws-b", "name": "B"},
|
||||
}
|
||||
out := excludeSelfFromPeers(in, "ws-self")
|
||||
if len(out) != 2 {
|
||||
t.Fatalf("expected 2, got %d (%+v)", len(out), out)
|
||||
}
|
||||
if out[0]["id"] != "ws-a" || out[1]["id"] != "ws-b" {
|
||||
t.Errorf("expected [ws-a, ws-b], got %+v", out)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("multiple self rows all dropped", func(t *testing.T) {
|
||||
// Pathological — should never happen, but the contract is
|
||||
// "no row with id==workspaceID survives", not "at most one
|
||||
// such row is dropped". Pin it.
|
||||
in := []map[string]interface{}{
|
||||
{"id": "ws-self", "name": "Me1"},
|
||||
{"id": "ws-a", "name": "A"},
|
||||
{"id": "ws-self", "name": "Me2"},
|
||||
}
|
||||
out := excludeSelfFromPeers(in, "ws-self")
|
||||
if len(out) != 1 {
|
||||
t.Fatalf("expected 1, got %d (%+v)", len(out), out)
|
||||
}
|
||||
if out[0]["id"] != "ws-a" {
|
||||
t.Errorf("expected [ws-a], got %+v", out)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("row with missing id key is preserved (not a self-collision)", func(t *testing.T) {
|
||||
// A peer row with no "id" key shouldn't be silently dropped
|
||||
// by the self-filter — it's a malformed row class that
|
||||
// belongs to a different defect.
|
||||
in := []map[string]interface{}{
|
||||
{"name": "no-id-row"},
|
||||
{"id": "ws-self", "name": "Me"},
|
||||
}
|
||||
out := excludeSelfFromPeers(in, "ws-self")
|
||||
if len(out) != 1 {
|
||||
t.Fatalf("expected 1, got %d (%+v)", len(out), out)
|
||||
}
|
||||
if out[0]["name"] != "no-id-row" {
|
||||
t.Errorf("expected no-id-row preserved, got %+v", out)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -432,9 +432,10 @@ func TestExtended_Peers(t *testing.T) {
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"}).
|
||||
AddRow("ws-sibling", "Sibling Agent", "worker", 1, "online", []byte("null"), "http://localhost:9001", nil, 0))
|
||||
|
||||
// Expect children query (workspaces with parent_id = ws-peer)
|
||||
// Expect children query (workspaces with parent_id = ws-peer, excluding self)
|
||||
// Query now binds (parent_id, self_id) for the self-filter guard added in #383.
|
||||
mock.ExpectQuery("SELECT w.id, w.name").
|
||||
WithArgs("ws-peer").
|
||||
WithArgs("ws-peer", "ws-peer").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"}))
|
||||
|
||||
// No parent query since workspace is root-level
|
||||
|
||||
@@ -416,10 +416,9 @@ func TestWorkspaceCreate(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestBuildProvisionerConfig_IncludesAwarenessSettings(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
mock.ExpectQuery(`SELECT digest FROM runtime_image_pins`).
|
||||
WithArgs("claude-code").
|
||||
WillReturnError(sql.ErrNoRows)
|
||||
setupTestDB(t)
|
||||
// runtime_image_pins reader removed by RFC internal#617 / task #335
|
||||
// — CP is the SSOT for runtime image pins. No DB lookup here anymore.
|
||||
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", "/tmp/configs")
|
||||
|
||||
@@ -1,67 +0,0 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
)
|
||||
|
||||
// resolveRuntimeImage returns the digest-pinned image ref for a runtime when
|
||||
// an operator has promoted one via the runtime_image_pins table (#2272 layer 1),
|
||||
// otherwise "" so the caller falls back to the legacy `:latest` lookup in
|
||||
// provisioner.RuntimeImages.
|
||||
//
|
||||
// Policy: availability over pinning. Any DB hiccup (sql.ErrNoRows is the
|
||||
// steady-state when nothing is pinned, but transient network blips, table
|
||||
// missing post-rollback, etc.) returns "" and the provision continues on
|
||||
// the moving tag — better one workspace on a slightly-newer image than a
|
||||
// provision-blocked tenant.
|
||||
//
|
||||
// WORKSPACE_IMAGE_LOCAL_OVERRIDE=1 short-circuits the lookup entirely so a
|
||||
// developer rebuilding template images locally gets their fresh build via
|
||||
// `:latest` even when a remote digest is pinned for the same runtime.
|
||||
func resolveRuntimeImage(ctx context.Context, runtime string) string {
|
||||
if runtime == "" {
|
||||
return ""
|
||||
}
|
||||
base, ok := provisioner.RuntimeImages[runtime]
|
||||
if !ok {
|
||||
// Unknown runtime — let provisioner.Start fall through to its own
|
||||
// DefaultImage. Querying the pin table for a runtime that doesn't
|
||||
// exist would only produce noise and a guaranteed ErrNoRows.
|
||||
return ""
|
||||
}
|
||||
if os.Getenv("WORKSPACE_IMAGE_LOCAL_OVERRIDE") != "" {
|
||||
return ""
|
||||
}
|
||||
if db.DB == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
var digest string
|
||||
err := db.DB.QueryRowContext(ctx,
|
||||
`SELECT digest FROM runtime_image_pins WHERE template_name = $1`, runtime,
|
||||
).Scan(&digest)
|
||||
if err != nil {
|
||||
if !errors.Is(err, sql.ErrNoRows) {
|
||||
log.Printf("resolveRuntimeImage: pin lookup for %q failed (%v) — falling back to :latest", runtime, err)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// Strip the moving tag suffix (`:latest`, `:staging`) before appending
|
||||
// the immutable digest. Docker treats `name:tag@sha256:...` as valid
|
||||
// but the tag is ignored; dropping it keeps logs and admin diffs honest
|
||||
// about what's actually being pulled.
|
||||
pinned := base
|
||||
if idx := strings.LastIndex(pinned, ":"); idx > strings.LastIndex(pinned, "/") {
|
||||
pinned = pinned[:idx]
|
||||
}
|
||||
return pinned + "@" + digest
|
||||
}
|
||||
@@ -1,138 +0,0 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
sqlmock "github.com/DATA-DOG/go-sqlmock"
|
||||
)
|
||||
|
||||
// TestResolveRuntimeImage_NoPin: lookup returns sql.ErrNoRows (the steady-
|
||||
// state when an operator hasn't pinned this runtime). resolveRuntimeImage
|
||||
// returns "" so the caller falls back to RuntimeImages[runtime] (legacy
|
||||
// :latest behavior). This is the expected hot path until digest pinning
|
||||
// is opted into per runtime.
|
||||
func TestResolveRuntimeImage_NoPin(t *testing.T) {
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
prev := db.DB
|
||||
db.DB = mockDB
|
||||
defer func() { db.DB = prev }()
|
||||
|
||||
mock.ExpectQuery(`SELECT digest FROM runtime_image_pins WHERE template_name = \$1`).
|
||||
WithArgs("claude-code").
|
||||
WillReturnError(sql.ErrNoRows)
|
||||
|
||||
got := resolveRuntimeImage(context.Background(), "claude-code")
|
||||
if got != "" {
|
||||
t.Errorf("expected empty (no pin = fallback), got %q", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveRuntimeImage_DBError: an unexpected DB failure (transient
|
||||
// network blip, table missing post-rollback, etc.) must NOT block the
|
||||
// provision — log + fall through to the legacy :latest path. This is
|
||||
// the availability-over-pinning policy spelled out in resolveRuntimeImage's
|
||||
// doc comment.
|
||||
func TestResolveRuntimeImage_DBError(t *testing.T) {
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
prev := db.DB
|
||||
db.DB = mockDB
|
||||
defer func() { db.DB = prev }()
|
||||
|
||||
mock.ExpectQuery(`SELECT digest FROM runtime_image_pins WHERE template_name = \$1`).
|
||||
WithArgs("claude-code").
|
||||
WillReturnError(sqlmock.ErrCancelled)
|
||||
|
||||
got := resolveRuntimeImage(context.Background(), "claude-code")
|
||||
if got != "" {
|
||||
t.Errorf("expected empty on DB error (fallback), got %q", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveRuntimeImage_WithPin returns image@sha256:<digest> when row exists.
|
||||
func TestResolveRuntimeImage_WithPin(t *testing.T) {
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
prev := db.DB
|
||||
db.DB = mockDB
|
||||
defer func() { db.DB = prev }()
|
||||
|
||||
digest := "sha256:3d6761a97ed07d7d33cfc19a8fbab81175d9d9179618d493dbc00c5f7ef076a3"
|
||||
mock.ExpectQuery(`SELECT digest FROM runtime_image_pins WHERE template_name = \$1`).
|
||||
WithArgs("claude-code").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"digest"}).AddRow(digest))
|
||||
|
||||
got := resolveRuntimeImage(context.Background(), "claude-code")
|
||||
if !strings.HasSuffix(got, "@"+digest) {
|
||||
t.Errorf("expected suffix @%s, got %q", digest, got)
|
||||
}
|
||||
if !strings.HasPrefix(got, "ghcr.io/molecule-ai/workspace-template-claude-code") {
|
||||
t.Errorf("expected GHCR prefix preserved, got %q", got)
|
||||
}
|
||||
if strings.Contains(got, ":latest") {
|
||||
t.Errorf("expected :latest stripped, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveRuntimeImage_EmptyRuntime short-circuits to "" without DB.
|
||||
func TestResolveRuntimeImage_EmptyRuntime(t *testing.T) {
|
||||
got := resolveRuntimeImage(context.Background(), "")
|
||||
if got != "" {
|
||||
t.Errorf("expected empty for empty runtime, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveRuntimeImage_UnknownRuntime returns "" without DB lookup.
|
||||
func TestResolveRuntimeImage_UnknownRuntime(t *testing.T) {
|
||||
got := resolveRuntimeImage(context.Background(), "no-such-runtime")
|
||||
if got != "" {
|
||||
t.Errorf("expected empty for unknown runtime, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveRuntimeImage_LocalOverride: when WORKSPACE_IMAGE_LOCAL_OVERRIDE
|
||||
// is set, the pin lookup is short-circuited even with a row present —
|
||||
// devs rebuild images locally and want the floating tag to resolve to
|
||||
// their fresh build, not a remote-pinned digest.
|
||||
func TestResolveRuntimeImage_LocalOverride(t *testing.T) {
|
||||
t.Setenv("WORKSPACE_IMAGE_LOCAL_OVERRIDE", "1")
|
||||
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
prev := db.DB
|
||||
db.DB = mockDB
|
||||
defer func() { db.DB = prev }()
|
||||
|
||||
// No expectation set — if resolveRuntimeImage queries the DB despite
|
||||
// the override, sqlmock fails the test via ExpectationsWereMet.
|
||||
got := resolveRuntimeImage(context.Background(), "claude-code")
|
||||
if got != "" {
|
||||
t.Errorf("expected empty under WORKSPACE_IMAGE_LOCAL_OVERRIDE=1, got %q", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("DB queried despite override: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -234,9 +234,13 @@ func (h *TemplatesHandler) ReplaceFiles(c *gin.Context) {
|
||||
"source": "ec2-ssh",
|
||||
})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save / ReplaceFiles fires N PUTs
|
||||
// in a burst; without this each PUT chains into the
|
||||
// coalesceRestart drain loop. The helper still uses goAsync
|
||||
// internally (drains via h.wh.waitAsyncForTest), preserving
|
||||
// RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -270,9 +274,13 @@ func (h *TemplatesHandler) ReplaceFiles(c *gin.Context) {
|
||||
"source": "container",
|
||||
})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save / ReplaceFiles fires N PUTs
|
||||
// in a burst; without this each PUT chains into the
|
||||
// coalesceRestart drain loop. The helper still uses goAsync
|
||||
// internally (drains via h.wh.waitAsyncForTest), preserving
|
||||
// RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -292,8 +300,12 @@ func (h *TemplatesHandler) ReplaceFiles(c *gin.Context) {
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"status": "replaced", "workspace": workspaceID, "files": len(body.Files), "source": "volume"})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save / ReplaceFiles fires N PUTs
|
||||
// in a burst; without this each PUT chains into the
|
||||
// coalesceRestart drain loop. The helper still uses goAsync
|
||||
// internally (drains via h.wh.waitAsyncForTest), preserving
|
||||
// RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -570,9 +570,13 @@ func (h *TemplatesHandler) WriteFile(c *gin.Context) {
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "saved", "path": filePath})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
|
||||
// without this each PUT chains into the coalesceRestart drain
|
||||
// loop and produces back-to-back EC2 recreate cycles. The
|
||||
// helper still uses goAsync internally (drains via
|
||||
// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -586,9 +590,13 @@ func (h *TemplatesHandler) WriteFile(c *gin.Context) {
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "saved", "path": filePath})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
|
||||
// without this each PUT chains into the coalesceRestart drain
|
||||
// loop and produces back-to-back EC2 recreate cycles. The
|
||||
// helper still uses goAsync internally (drains via
|
||||
// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -602,9 +610,13 @@ func (h *TemplatesHandler) WriteFile(c *gin.Context) {
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "saved", "path": filePath})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
|
||||
// without this each PUT chains into the coalesceRestart drain
|
||||
// loop and produces back-to-back EC2 recreate cycles. The
|
||||
// helper still uses goAsync internally (drains via
|
||||
// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -657,9 +669,13 @@ func (h *TemplatesHandler) DeleteFile(c *gin.Context) {
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "deleted", "path": filePath})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
|
||||
// without this each PUT chains into the coalesceRestart drain
|
||||
// loop and produces back-to-back EC2 recreate cycles. The
|
||||
// helper still uses goAsync internally (drains via
|
||||
// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -677,9 +693,13 @@ func (h *TemplatesHandler) DeleteFile(c *gin.Context) {
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "deleted", "path": filePath})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
|
||||
// without this each PUT chains into the coalesceRestart drain
|
||||
// loop and produces back-to-back EC2 recreate cycles. The
|
||||
// helper still uses goAsync internally (drains via
|
||||
// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -692,8 +712,12 @@ func (h *TemplatesHandler) DeleteFile(c *gin.Context) {
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "deleted", "path": filePath})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
|
||||
// without this each PUT chains into the coalesceRestart drain
|
||||
// loop and produces back-to-back EC2 recreate cycles. The
|
||||
// helper still uses goAsync internally (drains via
|
||||
// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -261,7 +261,14 @@ func (h *WorkspaceHandler) buildProvisionerConfig(
|
||||
workspaceAccess := payload.WorkspaceAccess
|
||||
if (workspacePath == "" || workspaceAccess == "") && db.DB != nil {
|
||||
var dbDir, dbAccess string
|
||||
if err := db.DB.QueryRow(
|
||||
// QueryRowContext (not QueryRow) so the provision-timeout ctx
|
||||
// propagates here too. Previously ctx flowed in only to be passed
|
||||
// to resolveRuntimeImage; that dead reader was removed by
|
||||
// RFC internal#617 / task #335. Wiring ctx into the surviving DB
|
||||
// query keeps the parameter load-bearing and is a small correctness
|
||||
// nudge (a 10s ProvisionTimeout now actually bounds this lookup).
|
||||
if err := db.DB.QueryRowContext(
|
||||
ctx,
|
||||
`SELECT COALESCE(workspace_dir, ''), COALESCE(workspace_access, 'none') FROM workspaces WHERE id = $1`,
|
||||
workspaceID,
|
||||
).Scan(&dbDir, &dbAccess); err == nil {
|
||||
@@ -293,7 +300,15 @@ func (h *WorkspaceHandler) buildProvisionerConfig(
|
||||
PlatformURL: h.platformURL,
|
||||
AwarenessURL: os.Getenv("AWARENESS_URL"),
|
||||
AwarenessNamespace: awarenessNamespace,
|
||||
Image: resolveRuntimeImage(ctx, payload.Runtime),
|
||||
// Image left empty — molecule-core's runtime_image_pins table (mig
|
||||
// 047, dead reader removed by RFC internal#617 / task #335) was an
|
||||
// aspirational SSOT that never received a writer. CP's
|
||||
// runtime_image_pins (CP migration 027) is the single SSOT; the
|
||||
// pin is applied at CP's provisioner layer before this code path
|
||||
// runs. Empty here means selectImage() falls back to the legacy
|
||||
// RuntimeImages[Runtime] :latest lookup, which is what the dead
|
||||
// reader's sql.ErrNoRows path was producing already.
|
||||
Image: "",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -854,14 +869,31 @@ func applyRuntimeModelEnv(envVars map[string]string, runtime, model string) {
|
||||
// Returns nil map + error string on decrypt failure. Shared by both Docker
|
||||
// and control plane provisioning paths to avoid duplication.
|
||||
//
|
||||
// The second return value (globalKeys) records which keys originated from
|
||||
// the operator-controlled `global_secrets` table — used by RFC#523 Layer 1
|
||||
// to constrain its forbidden-key check to the operator-bleed channel,
|
||||
// instead of blanket-blocking by name across BOTH provenance channels (the
|
||||
// over-fire that breaks the legitimate user flow of pasting their own
|
||||
// GitHub PAT into the canvas Secrets tab → workspace_secrets row). See
|
||||
// `feedback_upstream_docs_first_before_hypothesizing`: RFC#523's threat
|
||||
// model (issue molecule-ai/internal#523 §"Threat model") names operator-
|
||||
// scope tokens being injected via provision-time env / operator-side
|
||||
// stores — NOT the user's own scoped PAT they explicitly authorized via
|
||||
// the per-workspace Secrets tab.
|
||||
//
|
||||
// The merged map preserves the existing precedence semantic (workspace
|
||||
// rows overwrite global rows on key collision); only the provenance side-
|
||||
// channel is new. Existing single-return callers can ignore globalKeys.
|
||||
//
|
||||
// F1086 / #1206: the returned error string is the SAFE-CANNED message that
|
||||
// gets persisted to workspaces.last_sample_error AND broadcast as the
|
||||
// WORKSPACE_PROVISION_FAILED payload. Internal detail (the secret key name,
|
||||
// the encryption version, the decrypt-error text) is logged here, never
|
||||
// returned to the caller, so it can't leak via the canvas event stream
|
||||
// (cf. TestProvisionWorkspace_NoInternalErrorsInBroadcast).
|
||||
func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]string, string) {
|
||||
func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]string, map[string]struct{}, string) {
|
||||
envVars := map[string]string{}
|
||||
globalKeys := map[string]struct{}{}
|
||||
globalRows, globalErr := db.DB.QueryContext(ctx,
|
||||
`SELECT key, encrypted_value, encryption_version FROM global_secrets`)
|
||||
if globalErr == nil {
|
||||
@@ -874,9 +906,10 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
|
||||
decrypted, decErr := crypto.DecryptVersioned(v, ver)
|
||||
if decErr != nil {
|
||||
log.Printf("Provisioner: FATAL — failed to decrypt global secret %s (version=%d): %v — aborting provision of workspace %s", k, ver, decErr, workspaceID)
|
||||
return nil, "failed to decrypt global secret"
|
||||
return nil, nil, "failed to decrypt global secret"
|
||||
}
|
||||
envVars[k] = string(decrypted)
|
||||
globalKeys[k] = struct{}{}
|
||||
}
|
||||
}
|
||||
if err := globalRows.Err(); err != nil {
|
||||
@@ -895,16 +928,22 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
|
||||
decrypted, decErr := crypto.DecryptVersioned(v, ver)
|
||||
if decErr != nil {
|
||||
log.Printf("Provisioner: FATAL — failed to decrypt workspace secret %s (version=%d) for %s: %v — aborting provision", k, ver, workspaceID, decErr)
|
||||
return nil, "failed to decrypt workspace secret"
|
||||
return nil, nil, "failed to decrypt workspace secret"
|
||||
}
|
||||
envVars[k] = string(decrypted)
|
||||
// User-authored workspace_secrets value supersedes any
|
||||
// global_secrets row of the same key — including dropping
|
||||
// the operator-bleed provenance flag. The user explicitly
|
||||
// re-set the value via the canvas Secrets tab, so it is
|
||||
// no longer "the operator-store version."
|
||||
delete(globalKeys, k)
|
||||
}
|
||||
}
|
||||
if err := wsRows.Err(); err != nil {
|
||||
log.Printf("Provisioner: workspace_secrets rows.Err workspace=%s: %v", workspaceID, err)
|
||||
}
|
||||
}
|
||||
return envVars, ""
|
||||
return envVars, globalKeys, ""
|
||||
}
|
||||
|
||||
// provisionWorkspaceCP provisions a workspace via the control plane API.
|
||||
@@ -970,3 +1009,4 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string
|
||||
|
||||
log.Printf("CPProvisioner: workspace %s started as machine %s via control plane", workspaceID, machineID)
|
||||
}
|
||||
|
||||
|
||||
@@ -135,6 +135,15 @@ func isForbiddenTenantEnvKey(key string) bool {
|
||||
// message and the structured-extra payload that goes to the
|
||||
// canvas Events tab. Sorting makes the message stable across
|
||||
// Go's randomized map iteration.
|
||||
//
|
||||
// PROVENANCE NOTE: this helper checks by env-var name ONLY and is
|
||||
// unaware of where each value came from. Production provision code
|
||||
// uses findForbiddenTenantEnvKeysFromGlobals instead, restricting
|
||||
// the check to keys originating from the operator-controlled
|
||||
// global_secrets table — see the doc-comment on that function and
|
||||
// the RFC#523 Layer 1 block in prepareProvisionContext. This name-
|
||||
// only helper is kept for the workspace_secrets-write CI lint
|
||||
// (Layer 3) and for tests that pin the deny-set definition.
|
||||
func findForbiddenTenantEnvKeys(envVars map[string]string) []string {
|
||||
if len(envVars) == 0 {
|
||||
return []string{}
|
||||
@@ -149,6 +158,48 @@ func findForbiddenTenantEnvKeys(envVars map[string]string) []string {
|
||||
return found
|
||||
}
|
||||
|
||||
// findForbiddenTenantEnvKeysFromGlobals is the provenance-aware
|
||||
// variant used by RFC#523 Layer 1 in prepareProvisionContext. It
|
||||
// restricts the forbidden-key scan to keys whose value originated
|
||||
// from the operator-controlled `global_secrets` table.
|
||||
//
|
||||
// Fixes the over-fire reported by CTO empirical 2026-05-20: a user
|
||||
// who explicitly pastes their own scoped GitHub PAT under
|
||||
// GITHUB_TOKEN into the canvas Secrets tab (a `workspace_secrets`
|
||||
// row) was being blocked alongside the genuine operator-bleed case.
|
||||
// RFC#523's threat model (issue molecule-ai/internal#523 §"Threat
|
||||
// model") names operator-scope tokens injected via operator-side
|
||||
// stores; user-authored workspace_secrets is out of scope.
|
||||
//
|
||||
// globalSecretKeys is the set returned as the second value from
|
||||
// loadWorkspaceSecrets. A key that exists in BOTH stores is treated
|
||||
// as workspace_secrets (user override wins) — loadWorkspaceSecrets
|
||||
// drops the global flag when the workspace row is read.
|
||||
//
|
||||
// Empty/nil globalSecretKeys means no operator-side source was
|
||||
// loaded (e.g. tests, or table empty); the scan returns no hits.
|
||||
// Deterministic sort order, same as findForbiddenTenantEnvKeys.
|
||||
func findForbiddenTenantEnvKeysFromGlobals(envVars map[string]string, globalSecretKeys map[string]struct{}) []string {
|
||||
if len(envVars) == 0 || len(globalSecretKeys) == 0 {
|
||||
return []string{}
|
||||
}
|
||||
found := make([]string, 0)
|
||||
for k := range globalSecretKeys {
|
||||
if _, present := envVars[k]; !present {
|
||||
// Defensive: a key flagged as global-origin must also
|
||||
// be in the resolved env-set. If not, skip — the
|
||||
// loadWorkspaceSecrets contract guarantees this never
|
||||
// happens, but the helper stays total.
|
||||
continue
|
||||
}
|
||||
if isForbiddenTenantEnvKey(k) {
|
||||
found = append(found, k)
|
||||
}
|
||||
}
|
||||
sort.Strings(found)
|
||||
return found
|
||||
}
|
||||
|
||||
// formatForbiddenTenantEnvError builds the safe-canned user-facing
|
||||
// message for a provision aborted because forbidden env keys are
|
||||
// present in the resolved env-set. The message names the
|
||||
|
||||
@@ -150,6 +150,106 @@ func TestFindForbiddenTenantEnvKeys_SingleAndMultipleSorted(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestFindForbiddenTenantEnvKeysFromGlobals pins the provenance-aware
|
||||
// behaviour added 2026-05-20 to fix the RFC#523 Layer 1 over-fire: a
|
||||
// user-set workspace_secrets row with key=GITHUB_TOKEN must NOT be
|
||||
// flagged, while a global_secrets row of the same key MUST be.
|
||||
//
|
||||
// Cross-references the empirical bug: CTO 2026-05-20 hit
|
||||
// `provision aborted: env var "GITHUB_TOKEN" is operator-scope...`
|
||||
// after pasting their own scoped PAT into the canvas Secrets tab
|
||||
// (workspace_secrets) — the original blanket check fired on the
|
||||
// merged env-set regardless of provenance.
|
||||
func TestFindForbiddenTenantEnvKeysFromGlobals_UserSetAllowed(t *testing.T) {
|
||||
// User pasted their own PAT via canvas Secrets tab —
|
||||
// workspace_secrets row only. globalSecretKeys is empty for
|
||||
// this key, so the check MUST not fire.
|
||||
envVars := map[string]string{
|
||||
"GITHUB_TOKEN": "ghp_FAKEUSERPAT_user_set_via_canvas",
|
||||
"ANTHROPIC_API_KEY": "sk-ant-keep",
|
||||
}
|
||||
globalKeys := map[string]struct{}{} // nothing from global_secrets
|
||||
got := findForbiddenTenantEnvKeysFromGlobals(envVars, globalKeys)
|
||||
if len(got) != 0 {
|
||||
t.Errorf("user-set workspace_secrets with GITHUB_TOKEN: got %v; want empty (provenance-allowed)", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindForbiddenTenantEnvKeysFromGlobals_OperatorLeakBlocked(t *testing.T) {
|
||||
// Operator-store bleed — GITHUB_TOKEN sourced from global_secrets.
|
||||
// This is the literal RFC#523 §"Threat model" attack vector.
|
||||
// Check MUST fire and name GITHUB_TOKEN.
|
||||
envVars := map[string]string{
|
||||
"GITHUB_TOKEN": "ghp_OPERATOR_LEAK_from_global_secrets",
|
||||
"ANTHROPIC_API_KEY": "sk-ant-keep",
|
||||
}
|
||||
globalKeys := map[string]struct{}{
|
||||
"GITHUB_TOKEN": {},
|
||||
"ANTHROPIC_API_KEY": {},
|
||||
}
|
||||
got := findForbiddenTenantEnvKeysFromGlobals(envVars, globalKeys)
|
||||
if len(got) != 1 || got[0] != "GITHUB_TOKEN" {
|
||||
t.Errorf("operator-leak GITHUB_TOKEN in global_secrets: got %v; want [GITHUB_TOKEN]", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindForbiddenTenantEnvKeysFromGlobals_UserOverrideOfGlobalAllowed(t *testing.T) {
|
||||
// Both stores have the key; loadWorkspaceSecrets drops the global
|
||||
// flag when the workspace row supersedes (caller contract).
|
||||
// Simulate that here: globalKeys does NOT contain GITHUB_TOKEN
|
||||
// because workspace_secrets re-set it. Allowed.
|
||||
envVars := map[string]string{
|
||||
"GITHUB_TOKEN": "ghp_USER_RESET_after_global_was_present",
|
||||
}
|
||||
globalKeys := map[string]struct{}{} // workspace overrode → flag dropped
|
||||
got := findForbiddenTenantEnvKeysFromGlobals(envVars, globalKeys)
|
||||
if len(got) != 0 {
|
||||
t.Errorf("user-override of global GITHUB_TOKEN: got %v; want empty", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindForbiddenTenantEnvKeysFromGlobals_MultipleOperatorLeaks(t *testing.T) {
|
||||
// Multiple operator-leaked tokens — must return sorted slice.
|
||||
envVars := map[string]string{
|
||||
"GITHUB_TOKEN": "leak1",
|
||||
"CP_ADMIN_API_TOKEN": "leak2",
|
||||
"MOLECULE_OPERATOR_HOST": "leak3",
|
||||
"RAILWAY_TOKEN": "leak4",
|
||||
"ANTHROPIC_API_KEY": "user-allowed",
|
||||
}
|
||||
globalKeys := map[string]struct{}{
|
||||
"GITHUB_TOKEN": {},
|
||||
"CP_ADMIN_API_TOKEN": {},
|
||||
"MOLECULE_OPERATOR_HOST": {},
|
||||
"RAILWAY_TOKEN": {},
|
||||
}
|
||||
got := findForbiddenTenantEnvKeysFromGlobals(envVars, globalKeys)
|
||||
want := []string{"CP_ADMIN_API_TOKEN", "GITHUB_TOKEN", "MOLECULE_OPERATOR_HOST", "RAILWAY_TOKEN"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("operator-leak multi: got %v; want %v", got, want)
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Errorf("operator-leak multi[%d] = %q; want %q (full got=%v)", i, got[i], want[i], got)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindForbiddenTenantEnvKeysFromGlobals_EmptyInputs(t *testing.T) {
|
||||
if got := findForbiddenTenantEnvKeysFromGlobals(nil, nil); len(got) != 0 {
|
||||
t.Errorf("nil/nil: got %v; want empty", got)
|
||||
}
|
||||
if got := findForbiddenTenantEnvKeysFromGlobals(map[string]string{}, map[string]struct{}{}); len(got) != 0 {
|
||||
t.Errorf("empty/empty: got %v; want empty", got)
|
||||
}
|
||||
// Non-empty envVars but no global provenance — nothing came from
|
||||
// global_secrets, so nothing to block (even if a workspace_secrets
|
||||
// row exists for GITHUB_TOKEN).
|
||||
if got := findForbiddenTenantEnvKeysFromGlobals(map[string]string{"GITHUB_TOKEN": "ghp_user"}, map[string]struct{}{}); len(got) != 0 {
|
||||
t.Errorf("workspace-only GITHUB_TOKEN: got %v; want empty", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatForbiddenTenantEnvError_Phrasing(t *testing.T) {
|
||||
// Empty input — defensive total function.
|
||||
if msg := formatForbiddenTenantEnvError(nil); !strings.Contains(msg, "RFC#523") {
|
||||
|
||||
@@ -120,38 +120,52 @@ func (h *WorkspaceHandler) prepareProvisionContext(
|
||||
payload models.CreateWorkspacePayload,
|
||||
resetClaudeSession bool,
|
||||
) (*preparedProvisionContext, *provisionAbort) {
|
||||
envVars, decryptErr := loadWorkspaceSecrets(ctx, workspaceID)
|
||||
envVars, globalSecretKeys, decryptErr := loadWorkspaceSecrets(ctx, workspaceID)
|
||||
if decryptErr != "" {
|
||||
return nil, &provisionAbort{Msg: decryptErr}
|
||||
}
|
||||
|
||||
// RFC#523 Layer 1 (task #146): refuse to start a tenant workspace
|
||||
// when any forbidden operator-scope env var is present in the
|
||||
// resolved secret-load env-set. Runs IMMEDIATELY after
|
||||
// loadWorkspaceSecrets and BEFORE applyAgentGitHTTPCreds — the
|
||||
// per-agent persona injection sets a fallback GITEA_USER/GITEA_TOKEN
|
||||
// pair that the buildContainerEnv forensic #145 guard will strip
|
||||
// later. We want THIS layer to catch leaks from the operator-
|
||||
// controlled stores (global_secrets, workspace_secrets) only, not
|
||||
// the deliberate per-agent platform injection that lives downstream.
|
||||
// RFC#523 Layer 1 (issue molecule-ai/internal#523): refuse to start a
|
||||
// tenant workspace when any forbidden operator-scope env var is
|
||||
// present in the operator-controlled store (global_secrets).
|
||||
//
|
||||
// Threat model is "an upstream secret-writer accidentally widened
|
||||
// the propagation set" — e.g. an operator pastes GITEA_TOKEN into
|
||||
// a workspace_secrets row. Caught here, surfaced loudly to the
|
||||
// canvas Events tab, fail-closed. The existing forensic #145 guard
|
||||
// in provisioner.buildContainerEnv / CPProvisioner.Start stays as
|
||||
// defense-in-depth: it silently strips at container-env-build time.
|
||||
// PROVENANCE-AWARE — fix for the over-fire reported by CTO empirical
|
||||
// 2026-05-20: the original implementation ran this check on the
|
||||
// merged env-set, which conflated two very different sources:
|
||||
//
|
||||
// 1. global_secrets — operator-side store. ANY operator-scope token
|
||||
// here is an upstream bleed (e.g. tenant_secrets_seed.go pre-
|
||||
// 4f45d37 propagating CP-env GITHUB_TOKEN into every fresh
|
||||
// tenant's row). RFC#523's literal threat model.
|
||||
//
|
||||
// 2. workspace_secrets — user-set via the canvas Secrets tab,
|
||||
// authenticated as the workspace owner. If the user pastes
|
||||
// their own scoped GitHub PAT under GITHUB_TOKEN so the agent
|
||||
// can push to their personal repos, that is the system working
|
||||
// as designed — not the leak RFC#523 was written to catch.
|
||||
//
|
||||
// The provenance side-channel from loadWorkspaceSecrets tells us
|
||||
// which keys came from global_secrets (workspace_secrets writes
|
||||
// override and clear the flag, since the user explicitly re-set
|
||||
// the value). We restrict the abort to that set.
|
||||
//
|
||||
// Defense-in-depth NOT removed: provisioner.buildContainerEnv still
|
||||
// runs the forensic #145 silent-strip (lower-confidence late layer),
|
||||
// and workspace/entrypoint.sh has Layer 2 inside the container. If a
|
||||
// real operator-scope token slips into workspace_secrets some other
|
||||
// way, the later layers (and the per-workspace SG, and the per-tenant
|
||||
// VPC isolation) are still in force.
|
||||
//
|
||||
// Key names (not values) are echoed in the user-facing error so
|
||||
// the operator can locate and remove the offending row. Per memory
|
||||
// `feedback_passwords_in_chat_are_burned`, key names are not
|
||||
// secret; values would be.
|
||||
if forbidden := findForbiddenTenantEnvKeys(envVars); len(forbidden) > 0 {
|
||||
if forbidden := findForbiddenTenantEnvKeysFromGlobals(envVars, globalSecretKeys); len(forbidden) > 0 {
|
||||
msg := formatForbiddenTenantEnvError(forbidden)
|
||||
log.Printf("Provisioner: ABORT workspace=%s — forbidden operator-scope env keys present: %v (RFC#523)", workspaceID, forbidden)
|
||||
log.Printf("Provisioner: ABORT workspace=%s — forbidden operator-scope env keys present in global_secrets: %v (RFC#523)", workspaceID, forbidden)
|
||||
return nil, &provisionAbort{
|
||||
Msg: msg,
|
||||
Extra: map[string]interface{}{"error": msg, "forbidden_env_keys": forbidden, "rfc": "523"},
|
||||
Extra: map[string]interface{}{"error": msg, "forbidden_env_keys": forbidden, "rfc": "523", "source": "global_secrets"},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -749,9 +749,8 @@ func TestBuildProvisionerConfig_WorkspacePathFromEnv(t *testing.T) {
|
||||
mock.ExpectQuery(`SELECT COALESCE\(workspace_dir`).
|
||||
WithArgs("ws-env").
|
||||
WillReturnError(sql.ErrNoRows)
|
||||
mock.ExpectQuery(`SELECT digest FROM runtime_image_pins`).
|
||||
WithArgs("claude-code").
|
||||
WillReturnError(sql.ErrNoRows)
|
||||
// runtime_image_pins reader removed by RFC internal#617 / task #335
|
||||
// — CP is the SSOT for runtime image pins. No DB lookup here anymore.
|
||||
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
|
||||
|
||||
@@ -70,6 +70,97 @@ var restartDebounceWindow = 60 * time.Second
|
||||
// workspace-server yet — that's a separate RFC.
|
||||
var restartByIDDropCounter atomic.Uint64
|
||||
|
||||
// fileWriteRestartDebounceWindow is the per-workspace coalescing window for
|
||||
// the file-write → RestartByID trigger fired by templates.go's WriteFile,
|
||||
// DeleteFile, and ReplaceFiles handlers (and template_import.go's variants).
|
||||
//
|
||||
// Background (internal#624 2026-05-20): canvas Save fires N PUT /files
|
||||
// requests in a 30-60s burst (claude-code SEO agent observed 10-17 files in
|
||||
// 60s). Each successful write previously fired `goAsync(RestartByID)`. The
|
||||
// 60s self-fire debounce in RestartByID itself catches calls 1-60s, but
|
||||
// writes at T+65s+ pass the debounce, set pending=true on a still-running
|
||||
// coalesceRestart cycle, and drain immediately into cycle 2 — which DELETEs
|
||||
// + recreates EC2 mid-burst, returning 500 EC2InstanceStateInvalidException
|
||||
// on the in-flight user PUTs.
|
||||
//
|
||||
// 15s is sized to absorb a canvas Save burst (writes typically land within
|
||||
// a 5-10s window) while still letting a deliberate "edit, wait, edit again"
|
||||
// pattern restart twice. Bigger than that would silently swallow legitimate
|
||||
// rapid-iteration edits; smaller would let burst tails leak through.
|
||||
var fileWriteRestartDebounceWindow = 15 * time.Second
|
||||
|
||||
// fileWriteRestartLastFireAt records the last time `maybeRestartAfterFileWrite`
|
||||
// actually fired a restart for each workspace. sync.Map (not RWMutex+map)
|
||||
// because writes happen on every successful file-write handler, reads on
|
||||
// every subsequent file-write handler call — both per-workspace — and the
|
||||
// keys are sparse + long-lived. Stored as int64 unix-nano so the load/store
|
||||
// path can stay lock-free (atomic.Int64 inside sync.Map.Value is fine, but
|
||||
// time.Time itself isn't atomically loadable).
|
||||
var fileWriteRestartLastFireAt sync.Map // map[workspaceID]*atomic.Int64
|
||||
|
||||
// fileWriteRestartDropCounter counts how many file-write restart triggers
|
||||
// were silently coalesced. Same observability rationale as
|
||||
// restartByIDDropCounter — package-level atomic so tests can assert the
|
||||
// drop fired and ops can correlate with "user clicked Save 10 times,
|
||||
// only saw 1 restart cycle".
|
||||
var fileWriteRestartDropCounter atomic.Uint64
|
||||
|
||||
// maybeRestartAfterFileWrite is the call-site debounce wrapper for the 9
|
||||
// file-write trigger sites in templates.go + template_import.go. Replaces
|
||||
// the direct `goAsync(func() { wh.RestartByID(wsID) })` pattern with a
|
||||
// 15s per-workspace coalescing window:
|
||||
//
|
||||
// - First call (no prior fire OR last fire >15s ago): records the
|
||||
// current timestamp and fires goAsync(RestartByID).
|
||||
// - Subsequent calls within 15s of the last fire: silently dropped,
|
||||
// drop counter incremented.
|
||||
//
|
||||
// This is the call-site-layer protection (internal#624 Path A). The drain-
|
||||
// loop layer in coalesceRestart (Path B, re-stamping restartStartedAt per
|
||||
// iteration) is the platform-layer defense in depth — together they close
|
||||
// the file-write tight-loop class regardless of which entry point fires.
|
||||
//
|
||||
// Stateless on the handler so any handler with access to a WorkspaceHandler
|
||||
// can use it; the per-workspace state lives in the package-level sync.Map.
|
||||
func (h *WorkspaceHandler) maybeRestartAfterFileWrite(workspaceID string) {
|
||||
now := time.Now().UnixNano()
|
||||
|
||||
// LoadOrStore the per-workspace last-fire stamp. First write for a
|
||||
// brand-new workspace falls through the CompareAndSwap below because
|
||||
// the zero-init value (0) is far enough in the past to satisfy the
|
||||
// "last fire >15s ago" predicate.
|
||||
sv, _ := fileWriteRestartLastFireAt.LoadOrStore(workspaceID, new(atomic.Int64))
|
||||
stamp := sv.(*atomic.Int64)
|
||||
|
||||
// CAS loop: read last, decide, swap. We use CAS instead of Lock/Unlock
|
||||
// because the typical case is "thousands of writes, one restart per
|
||||
// 15s" — uncontended atomic is ~5ns vs ~30ns mutex. Bounded retry
|
||||
// because in the rare contended case (two writes finishing nanoseconds
|
||||
// apart) one will win the swap and the other will see the new stamp,
|
||||
// drop, and bail.
|
||||
for retry := 0; retry < 4; retry++ {
|
||||
last := stamp.Load()
|
||||
elapsed := time.Duration(now - last)
|
||||
if last != 0 && elapsed < fileWriteRestartDebounceWindow {
|
||||
// Within debounce window — drop silently.
|
||||
fileWriteRestartDropCounter.Add(1)
|
||||
log.Printf("maybeRestartAfterFileWrite: %s — coalesced "+
|
||||
"(last fire %s ago < %s window; total dropped=%d)",
|
||||
workspaceID, elapsed.Round(time.Millisecond),
|
||||
fileWriteRestartDebounceWindow,
|
||||
fileWriteRestartDropCounter.Load())
|
||||
return
|
||||
}
|
||||
if stamp.CompareAndSwap(last, now) {
|
||||
break
|
||||
}
|
||||
// Another writer beat us to the stamp update. Re-read and retry;
|
||||
// the retry will almost certainly see the new value and drop.
|
||||
}
|
||||
|
||||
h.goAsync(func() { h.RestartByID(workspaceID) })
|
||||
}
|
||||
|
||||
// isRestarting reports whether a restart cycle is currently in flight for
|
||||
// the workspace. Callers that have their own "container looks dead" probe
|
||||
// MUST consult this before triggering a restart, because during the
|
||||
@@ -513,6 +604,27 @@ func coalesceRestart(workspaceID string, cycle func()) {
|
||||
// inside provisionWorkspace, so any writes that committed since the
|
||||
// last cycle are picked up. Continues until no pending request was
|
||||
// observed at the top of an iteration.
|
||||
//
|
||||
// internal#624 Path B (defense in depth for the file-write tight-loop
|
||||
// class): re-stamp restartStartedAt at the top of every drain iteration
|
||||
// past the first. The original design (stamp only on false→true edge)
|
||||
// treated all drained pending as "one event from the debounce's POV",
|
||||
// which is correct for the secrets-batch use case but lets a file-write
|
||||
// burst at T+65s of a 60s drain pipe straight into another full cycle.
|
||||
// Re-stamping closes that hole — each drained cycle gets its own fresh
|
||||
// debounce window, so any RestartByID arriving during cycle N is
|
||||
// dropped by shouldDebounceRestart instead of accumulating into
|
||||
// pending=true for cycle N+1.
|
||||
//
|
||||
// The original "one cycle picks up everyone who arrived during it"
|
||||
// semantic still holds for the secrets-write path: callers that hit
|
||||
// coalesceRestart during cycle 1 still set pending=true and still get
|
||||
// their effects landed in cycle 2. What changes is that callers
|
||||
// arriving during cycle 2 (via RestartByID) now hit the re-stamped
|
||||
// debounce and are dropped instead of being chained into cycle 3,
|
||||
// which is exactly the chain that produced the 22:08-22:10 thrash on
|
||||
// 3fe84b89.
|
||||
iteration := 0
|
||||
for {
|
||||
state.mu.Lock()
|
||||
if !state.pending {
|
||||
@@ -520,7 +632,13 @@ func coalesceRestart(workspaceID string, cycle func()) {
|
||||
return // defer clears running
|
||||
}
|
||||
state.pending = false
|
||||
if iteration > 0 {
|
||||
// Re-stamp for drained iterations only; the false→true edge
|
||||
// already stamped at the top of coalesceRestart.
|
||||
state.restartStartedAt = time.Now()
|
||||
}
|
||||
state.mu.Unlock()
|
||||
iteration++
|
||||
|
||||
cycle()
|
||||
}
|
||||
|
||||
@@ -0,0 +1,316 @@
|
||||
package handlers
|
||||
|
||||
// Tests for internal#624 — file-write → RestartByID tight-loop fix.
|
||||
//
|
||||
// Empirical chain (Loki 2026-05-20 22:00-22:11Z on workspace
|
||||
// 3fe84b89-eb65-42fc-ad1f-5c93582ca3e7, claude-code SEO Agent):
|
||||
//
|
||||
// 1. Canvas Save writes 10-17 files in a 30-60s window.
|
||||
// 2. Each successful PUT /files at templates.go:575 / 591 / 607 / 662 /
|
||||
// 682 / 697 (and template_import.go:239 / 275 / 297) fires
|
||||
// `goAsync(func() { wh.RestartByID(wsID) })`.
|
||||
// 3. RestartByID's existing 60s self-fire debounce catches calls 1-60s
|
||||
// after the cycle starts. But writes at T+65s+ pass the debounce,
|
||||
// set pending=true on the still-running coalesceRestart cycle, and
|
||||
// drain IMMEDIATELY into cycle 2 — no re-debounce because the
|
||||
// original drain loop re-uses the same restartStartedAt.
|
||||
// 4. Cycle 2 DELETEs+recreates EC2 mid-burst → user sees
|
||||
// EC2InstanceStateInvalidException 500 on the in-flight PUTs.
|
||||
//
|
||||
// Fix: two layers (both shipped in the same PR).
|
||||
//
|
||||
// Path A (call-site debounce): every file-write trigger goes through
|
||||
// maybeRestartAfterFileWrite, which silently drops re-fires within 15s
|
||||
// of the last fire for the same workspace.
|
||||
//
|
||||
// Path B (drain-loop re-stamp): coalesceRestart now re-stamps
|
||||
// restartStartedAt at the top of each drained iteration, so any
|
||||
// RestartByID arriving during a drained cycle hits a fresh 60s window
|
||||
// and is dropped by shouldDebounceRestart instead of chaining further.
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// resetFileWriteDebounceState wipes the package-level sync.Map + drop
|
||||
// counter for the given workspace ID. Tests must call this between
|
||||
// scenarios because fileWriteRestartLastFireAt is shared.
|
||||
func resetFileWriteDebounceState(workspaceID string) {
|
||||
fileWriteRestartLastFireAt.Delete(workspaceID)
|
||||
fileWriteRestartDropCounter.Store(0)
|
||||
}
|
||||
|
||||
// newFileWriteDebounceHandler constructs a minimal *WorkspaceHandler with
|
||||
// no provisioner so RestartByID short-circuits at HasProvisioner()=false
|
||||
// — we only care that maybeRestartAfterFileWrite reaches goAsync at all.
|
||||
// The asyncWG inside goAsync lets us wait for the goroutine to finish so
|
||||
// we can deterministically observe whether RestartByID was scheduled.
|
||||
func newFileWriteDebounceHandler(t *testing.T) *WorkspaceHandler {
|
||||
t.Helper()
|
||||
return NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
}
|
||||
|
||||
// TestMaybeRestartAfterFileWrite_FirstWriteRestarts — the baseline case:
|
||||
// the very first call for a workspace must actually fire goAsync (i.e.
|
||||
// no debounce-drop on the first PUT). Without this the helper would
|
||||
// silently swallow every legitimate single-file save.
|
||||
func TestMaybeRestartAfterFileWrite_FirstWriteRestarts(t *testing.T) {
|
||||
const wsID = "fw-debounce-first"
|
||||
resetFileWriteDebounceState(wsID)
|
||||
|
||||
h := newFileWriteDebounceHandler(t)
|
||||
h.maybeRestartAfterFileWrite(wsID)
|
||||
|
||||
// Drop counter must NOT have incremented — the call fired.
|
||||
if got := fileWriteRestartDropCounter.Load(); got != 0 {
|
||||
t.Errorf("first call to maybeRestartAfterFileWrite must fire (drop counter must stay 0), got %d", got)
|
||||
}
|
||||
|
||||
// Last-fire timestamp must be populated (non-zero) so the next call
|
||||
// will compare against it.
|
||||
sv, ok := fileWriteRestartLastFireAt.Load(wsID)
|
||||
if !ok {
|
||||
t.Fatal("first call must register the workspace in fileWriteRestartLastFireAt")
|
||||
}
|
||||
stamp := sv.(*atomic.Int64).Load()
|
||||
if stamp == 0 {
|
||||
t.Error("first call must record a non-zero last-fire timestamp")
|
||||
}
|
||||
|
||||
// Wait for the spawned goroutine to finish so it doesn't leak into
|
||||
// the next test (RestartByID will short-circuit on no-provisioner).
|
||||
h.waitAsyncForTest()
|
||||
}
|
||||
|
||||
// TestMaybeRestartAfterFileWrite_SecondWriteWithin15sSkipped — the core
|
||||
// fix: a second call within fileWriteRestartDebounceWindow of the first
|
||||
// MUST NOT fire RestartByID. The drop counter must increment by exactly
|
||||
// one and the last-fire timestamp must remain the FIRST call's stamp
|
||||
// (proof that the second call did not overwrite it).
|
||||
func TestMaybeRestartAfterFileWrite_SecondWriteWithin15sSkipped(t *testing.T) {
|
||||
const wsID = "fw-debounce-second-within"
|
||||
resetFileWriteDebounceState(wsID)
|
||||
|
||||
h := newFileWriteDebounceHandler(t)
|
||||
|
||||
// First call — fires.
|
||||
h.maybeRestartAfterFileWrite(wsID)
|
||||
h.waitAsyncForTest()
|
||||
|
||||
sv, _ := fileWriteRestartLastFireAt.Load(wsID)
|
||||
firstStamp := sv.(*atomic.Int64).Load()
|
||||
|
||||
// Second call immediately — must be dropped.
|
||||
h.maybeRestartAfterFileWrite(wsID)
|
||||
|
||||
if got := fileWriteRestartDropCounter.Load(); got != 1 {
|
||||
t.Errorf("second call within 15s must increment drop counter by exactly 1, got %d", got)
|
||||
}
|
||||
|
||||
// The CAS-loop must NOT have overwritten the first-call stamp — the
|
||||
// debounce branch short-circuits before the CompareAndSwap.
|
||||
stampAfter := sv.(*atomic.Int64).Load()
|
||||
if stampAfter != firstStamp {
|
||||
t.Errorf("dropped call must NOT update last-fire stamp (preserves debounce window); "+
|
||||
"first=%d after=%d", firstStamp, stampAfter)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMaybeRestartAfterFileWrite_ManyWritesInBurstCoalesceToOne — the
|
||||
// "bonus" regression test called out in the issue: 10 simulated PUTs
|
||||
// over 60s (compressed to a tight loop, all within 15s) must produce
|
||||
// exactly 1 RestartByID schedule and 9 drops. Models the canvas Save
|
||||
// burst shape that triggered the prod incident.
|
||||
func TestMaybeRestartAfterFileWrite_ManyWritesInBurstCoalesceToOne(t *testing.T) {
|
||||
const wsID = "fw-debounce-burst"
|
||||
resetFileWriteDebounceState(wsID)
|
||||
|
||||
h := newFileWriteDebounceHandler(t)
|
||||
|
||||
// 10 rapid-fire calls — simulates 10 PUTs landing inside the canvas
|
||||
// Save burst window.
|
||||
const burstSize = 10
|
||||
for i := 0; i < burstSize; i++ {
|
||||
h.maybeRestartAfterFileWrite(wsID)
|
||||
}
|
||||
h.waitAsyncForTest()
|
||||
|
||||
// One fired (call #1) + 9 dropped.
|
||||
if got := fileWriteRestartDropCounter.Load(); got != burstSize-1 {
|
||||
t.Errorf("expected %d drops for a %d-call burst (only call #1 fires), got %d",
|
||||
burstSize-1, burstSize, got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMaybeRestartAfterFileWrite_AfterWindowExpiresFiresAgain — outside
|
||||
// the debounce window, the helper must release and fire again. Shrinks
|
||||
// fileWriteRestartDebounceWindow to 5ms so we don't sleep 15s in CI.
|
||||
// Important: without this, a legitimate "user edited, walked away for
|
||||
// a minute, edited again" would never restart and config changes would
|
||||
// never reach the agent.
|
||||
func TestMaybeRestartAfterFileWrite_AfterWindowExpiresFiresAgain(t *testing.T) {
|
||||
const wsID = "fw-debounce-window-expires"
|
||||
resetFileWriteDebounceState(wsID)
|
||||
|
||||
orig := fileWriteRestartDebounceWindow
|
||||
fileWriteRestartDebounceWindow = 5 * time.Millisecond
|
||||
defer func() { fileWriteRestartDebounceWindow = orig }()
|
||||
|
||||
h := newFileWriteDebounceHandler(t)
|
||||
|
||||
h.maybeRestartAfterFileWrite(wsID) // fires
|
||||
h.waitAsyncForTest()
|
||||
|
||||
// Wait past the window.
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
h.maybeRestartAfterFileWrite(wsID) // must fire again
|
||||
h.waitAsyncForTest()
|
||||
|
||||
// Drop counter must still be 0 — both calls fired.
|
||||
if got := fileWriteRestartDropCounter.Load(); got != 0 {
|
||||
t.Errorf("second call after window expiry must fire (not drop), got %d drops", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMaybeRestartAfterFileWrite_DifferentWorkspacesIndependent — the
|
||||
// per-workspace state map must isolate: a burst on workspace A must not
|
||||
// affect workspace B's debounce. Pinning so a future "use a single
|
||||
// global atomic" refactor breaks loudly.
|
||||
func TestMaybeRestartAfterFileWrite_DifferentWorkspacesIndependent(t *testing.T) {
|
||||
const wsA = "fw-debounce-ws-a"
|
||||
const wsB = "fw-debounce-ws-b"
|
||||
resetFileWriteDebounceState(wsA)
|
||||
resetFileWriteDebounceState(wsB)
|
||||
|
||||
h := newFileWriteDebounceHandler(t)
|
||||
|
||||
// 5 calls on A, all but one drop.
|
||||
for i := 0; i < 5; i++ {
|
||||
h.maybeRestartAfterFileWrite(wsA)
|
||||
}
|
||||
h.waitAsyncForTest()
|
||||
|
||||
dropsAfterA := fileWriteRestartDropCounter.Load()
|
||||
|
||||
// First call on B — must fire (its own independent window).
|
||||
h.maybeRestartAfterFileWrite(wsB)
|
||||
h.waitAsyncForTest()
|
||||
|
||||
// B's call must not have incremented the drop counter — it fired.
|
||||
if got := fileWriteRestartDropCounter.Load(); got != dropsAfterA {
|
||||
t.Errorf("workspace B's first call must fire (not share workspace A's debounce); "+
|
||||
"drops after A=%d, drops after B=%d", dropsAfterA, got)
|
||||
}
|
||||
|
||||
// Both workspaces must have their own last-fire entries.
|
||||
if _, ok := fileWriteRestartLastFireAt.Load(wsA); !ok {
|
||||
t.Error("workspace A missing from fileWriteRestartLastFireAt")
|
||||
}
|
||||
if _, ok := fileWriteRestartLastFireAt.Load(wsB); !ok {
|
||||
t.Error("workspace B missing from fileWriteRestartLastFireAt")
|
||||
}
|
||||
}
|
||||
|
||||
// TestMaybeRestartAfterFileWrite_ConcurrentCallsSafelyDebounced — the
|
||||
// CAS-loop contract: many goroutines hitting the helper concurrently
|
||||
// must still produce at most one fired call (drops = N-1). Pinning the
|
||||
// "thousands of writes, one restart" performance shape called out in
|
||||
// the helper's comment. Uses a shared start channel to release all goroutines
|
||||
// in a tight burst (a sync.WaitGroup joins them) so the CAS is genuinely contended.
|
||||
func TestMaybeRestartAfterFileWrite_ConcurrentCallsSafelyDebounced(t *testing.T) {
|
||||
const wsID = "fw-debounce-concurrent"
|
||||
resetFileWriteDebounceState(wsID)
|
||||
|
||||
h := newFileWriteDebounceHandler(t)
|
||||
|
||||
const goroutines = 50
|
||||
start := make(chan struct{})
|
||||
var wg sync.WaitGroup
|
||||
for i := 0; i < goroutines; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
<-start // hold every goroutine at the gate
|
||||
h.maybeRestartAfterFileWrite(wsID)
|
||||
}()
|
||||
}
|
||||
close(start) // release the herd
|
||||
wg.Wait()
|
||||
h.waitAsyncForTest()
|
||||
|
||||
// Exactly N-1 drops: one goroutine wins the CAS and fires, all
|
||||
// other N-1 see a fresh stamp and drop into the debounce branch.
|
||||
if got := fileWriteRestartDropCounter.Load(); got != goroutines-1 {
|
||||
t.Errorf("expected %d drops for %d concurrent callers (exactly one fires), got %d",
|
||||
goroutines-1, goroutines, got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestCoalesceRestart_DrainRespectsRestartedAtBetweenIterations —
|
||||
// Path B regression: when coalesceRestart drains a pending request into
|
||||
// a follow-up cycle, the restartStartedAt timestamp must be re-stamped
|
||||
// for that follow-up iteration. Without this, a RestartByID arriving
|
||||
// during cycle 2 would hit a stale 60s window (computed from cycle 1's
|
||||
// start) and could pass the debounce just because cycle 1 + cycle 2's
|
||||
// runtime exceeded 60s combined.
|
||||
//
|
||||
// The test fires cycle 1, sets pending=true from inside it to trigger
|
||||
// cycle 2, then asserts that restartStartedAt was advanced for the drained
|
||||
// iteration. The cycle function itself just records the restartStartedAt
|
||||
// value it observes on each run, so the test can compare cycle 1's
|
||||
// stamp vs cycle 2's stamp.
|
||||
func TestCoalesceRestart_DrainRespectsRestartedAtBetweenIterations(t *testing.T) {
|
||||
const wsID = "fw-debounce-drain-restamp"
|
||||
resetRestartStatesFor(wsID)
|
||||
|
||||
// Capture the restartStartedAt observed at the top of each cycle
|
||||
// iteration. The cycle reads it directly from the state map so we
|
||||
// see what coalesceRestart wrote.
|
||||
var stamps []time.Time
|
||||
var stampsMu sync.Mutex
|
||||
cycleCount := 0
|
||||
cycle := func() {
|
||||
sv, _ := restartStates.Load(wsID)
|
||||
state := sv.(*restartState)
|
||||
state.mu.Lock()
|
||||
stampsMu.Lock()
|
||||
stamps = append(stamps, state.restartStartedAt)
|
||||
stampsMu.Unlock()
|
||||
state.mu.Unlock()
|
||||
|
||||
cycleCount++
|
||||
if cycleCount == 1 {
|
||||
// While inside cycle 1, set pending=true so the drain loop
|
||||
// runs cycle 2 next iteration. Mirrors the prod shape: a
|
||||
// PUT lands during cycle 1, sets pending=true via
|
||||
// RestartByID → coalesceRestart's pending branch.
|
||||
state.mu.Lock()
|
||||
state.pending = true
|
||||
state.mu.Unlock()
|
||||
|
||||
// Sleep briefly so cycle 2's stamp is observably later
|
||||
// than cycle 1's. Without a real wall-clock gap the
|
||||
// assertion can't tell re-stamp from no-op.
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
coalesceRestart(wsID, cycle)
|
||||
|
||||
stampsMu.Lock()
|
||||
defer stampsMu.Unlock()
|
||||
if len(stamps) != 2 {
|
||||
t.Fatalf("expected 2 cycle iterations (original + drained pending), got %d", len(stamps))
|
||||
}
|
||||
if !stamps[1].After(stamps[0]) {
|
||||
t.Errorf("Path B regression: cycle 2's restartStartedAt (%v) must be AFTER "+
|
||||
"cycle 1's (%v) — drained iterations must re-stamp so the self-fire "+
|
||||
"debounce window resets per cycle. Without this, a RestartByID arriving "+
|
||||
"during cycle 2 sees a stale window and can chain into cycle 3.",
|
||||
stamps[1], stamps[0])
|
||||
}
|
||||
}
|
||||
@@ -37,21 +37,37 @@ import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/uploads"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// Per-file size cap. Mirrors workspace-side ingest_handler
|
||||
// (workspace/internal_chat_uploads.py:CHAT_UPLOAD_MAX_FILE_BYTES) and the
|
||||
// push-mode chat upload cap (chat_files.go:chatUploadMaxBytes). Pinned at
|
||||
// the DB level via the size_bytes CHECK constraint (currently
|
||||
// 104857600 per migration 20260519200000_pending_uploads_bump_size_cap);
|
||||
// this Go-side constant exists so the Put implementation can reject
|
||||
// before round-tripping to Postgres.
|
||||
// MaxFileBytes is the per-file size cap enforced by Put / PutBatch
|
||||
// before any DB round-trip. Mirrors workspace-side ingest
|
||||
// (workspace/internal_chat_uploads.py:CHAT_UPLOAD_MAX_FILE_BYTES) and
|
||||
// push-mode chat upload cap (chat_files.go:chatUploadMaxBytes). Also
|
||||
// pinned at the DB level via the pending_uploads.size_bytes CHECK
|
||||
// constraint (currently <=104857600 per migration
|
||||
// 20260519200000_pending_uploads_bump_size_cap); this Go-side const
|
||||
// exists so a 100 MB+1 byte payload is rejected before Postgres has
|
||||
// to look at it.
|
||||
//
|
||||
// Kept consistent with push-mode (mc#1588) per CTO directive 2026-05-19.
|
||||
// SSOT follow-up: GET /uploads/limits will let every surface read the
|
||||
// live cap rather than each pinning its own copy.
|
||||
const MaxFileBytes = 100 * 1024 * 1024
|
||||
// SSOT (task #320): the value derives from uploads.DefaultUploadLimits()
|
||||
// — the single source consumed by GET /uploads/limits. Bumping the cap
|
||||
// is a one-line edit in internal/uploads/limits.go; this constant
|
||||
// follows. The migration's size_bytes CHECK upper bound must be raised
|
||||
// in lockstep (separate migration file) since DB constraints can't read
|
||||
// Go vars at runtime.
|
||||
//
|
||||
// Why "var" instead of "const": Go disallows initializing a const from
|
||||
// a function call. The runtime cost is zero (DefaultUploadLimits is a
|
||||
// pure literal-builder) and tests can still treat it as effectively
|
||||
// immutable — no caller mutates it.
|
||||
//
|
||||
// Why "int" instead of "int64": the existing API surface is
|
||||
// len(content)-comparison + make([]byte, MaxFileBytes+1) call sites,
|
||||
// both of which want int. We convert from the int64 SSOT field once,
|
||||
// here, rather than thread int64 through every caller.
|
||||
var MaxFileBytes = int(uploads.DefaultUploadLimits().PerFileBytes)
|
||||
|
||||
// ErrNotFound is returned by Get / MarkFetched / Ack when the row is
|
||||
// absent. Callers turn this into HTTP 404. Treat acked + expired rows
|
||||
|
||||
@@ -105,19 +105,27 @@ type WorkspaceConfig struct {
|
||||
WorkspaceAccess string // #65: "none" (default), "read_only", or "read_write"
|
||||
ResetClaudeSession bool // #12: if true, discard the claude-sessions volume before start (fresh session dir)
|
||||
|
||||
// Image, when non-empty, overrides the runtime→image lookup. The handler
|
||||
// layer sets this to the digest-pinned form (`<base>@sha256:<digest>`)
|
||||
// when an operator has promoted a specific runtime build via the
|
||||
// runtime_image_pins table (#2272 layer 1). Empty = legacy behavior,
|
||||
// fall back to RuntimeImages[Runtime] which resolves to the moving
|
||||
// `:latest` tag.
|
||||
// Image, when non-empty, overrides the runtime→image lookup. CP
|
||||
// (molecule-controlplane) is the single SSOT for runtime image digest
|
||||
// pins via its migrations/027_runtime_image_pins table — the pin is
|
||||
// applied at CP's provisioner layer before the workspace-server even
|
||||
// runs, so under the current architecture this field is always empty
|
||||
// on the workspace-server side. Empty = fall back to RuntimeImages
|
||||
// [Runtime] which resolves to the moving `:latest` tag.
|
||||
//
|
||||
// Historical note: molecule-core's own runtime_image_pins table
|
||||
// (workspace-server/migrations 047) was the original aspirational
|
||||
// design (#2272 layer 1) but never received a writer; RFC internal#617 /
|
||||
// task #335 retired the dead reader + table in favor of CP-as-SSOT.
|
||||
Image string
|
||||
}
|
||||
|
||||
// selectImage resolves the final Docker image ref for a workspace. The handler
|
||||
// layer is the source of truth — if it set cfg.Image (the digest-pinned form
|
||||
// from runtime_image_pins, #2272), honor that. Otherwise fall back to the
|
||||
// runtime→tag lookup in RuntimeImages (legacy `:latest` behavior).
|
||||
// supplied by CP, the SSOT for runtime image pins; molecule-core's own
|
||||
// runtime_image_pins reader retired by RFC internal#617 / task #335), honor
|
||||
// that. Otherwise fall back to the runtime→tag lookup in RuntimeImages
|
||||
// (legacy `:latest` behavior).
|
||||
//
|
||||
// Fail-closed contract (RFC internal#483 / security review 4269 /
|
||||
// feedback_platform_must_hardgate_base_contract): if the workspace NAMES a
|
||||
@@ -378,7 +386,7 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
|
||||
// + `docker build`s it locally. Replace the placeholder image ref with
|
||||
// the SHA-pinned tag of the freshly-built image before ContainerCreate.
|
||||
//
|
||||
// Pinned overrides (cfg.Image set, e.g. via runtime_image_pins for
|
||||
// Pinned overrides (cfg.Image set, e.g. via CP's runtime_image_pins for
|
||||
// production thin-AMI launches) bypass this path — they pin a digest
|
||||
// the operator chose explicitly.
|
||||
if cfg.Image == "" && cfg.Runtime != "" {
|
||||
@@ -1597,3 +1605,4 @@ func parseOCIPlatform(s string) *ocispec.Platform {
|
||||
}
|
||||
return &ocispec.Platform{OS: parts[0], Architecture: parts[1]}
|
||||
}
|
||||
|
||||
|
||||
@@ -506,11 +506,12 @@ func TestWorkspaceConfig_ResetClaudeSessionFieldPresent(t *testing.T) {
|
||||
|
||||
// ---------- selectImage (#2272 layer 1) ----------
|
||||
|
||||
// TestSelectImage_PrefersExplicitImage: when the handler resolved a digest
|
||||
// pin via runtime_image_pins, cfg.Image is set. selectImage must honor it
|
||||
// and ignore the cfg.Runtime → :latest fallback. This is the load-bearing
|
||||
// invariant for digest pinning — if it ever silently reverts to :latest,
|
||||
// we lose the "one bad publish doesn't break every workspace" guarantee.
|
||||
// TestSelectImage_PrefersExplicitImage: when CP (the SSOT for runtime image
|
||||
// pins under RFC internal#617 / task #335) supplied a digest pin via
|
||||
// cfg.Image, selectImage must honor it and ignore the cfg.Runtime → :latest
|
||||
// fallback. This is the load-bearing invariant for digest pinning — if it
|
||||
// ever silently reverts to :latest, we lose the "one bad publish doesn't
|
||||
// break every workspace" guarantee.
|
||||
func TestSelectImage_PrefersExplicitImage(t *testing.T) {
|
||||
pinned := "ghcr.io/molecule-ai/workspace-template-claude-code@sha256:3d6761a97ed07d7d33cfc19a8fbab81175d9d9179618d493dbc00c5f7ef076a3"
|
||||
got, err := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: pinned})
|
||||
|
||||
@@ -89,11 +89,13 @@ func RegistryHost() string {
|
||||
// RuntimeImage returns the canonical image reference for the given runtime,
|
||||
// using the current RegistryPrefix() and the moving `:latest` tag.
|
||||
//
|
||||
// For SHA-pinned references (production thin-AMI launches), the
|
||||
// runtime_image_pins lookup in handlers/runtime_image_pin.go strips the
|
||||
// `:latest` suffix and appends an immutable `@sha256:<digest>` from the DB.
|
||||
// That code path naturally inherits any RegistryPrefix() change because it
|
||||
// reads from RuntimeImages[runtime] and only re-formats the tag suffix.
|
||||
// SHA-pinned references for production thin-AMI launches are applied by CP
|
||||
// (molecule-controlplane) at its provisioner layer using CP's
|
||||
// migrations/027_runtime_image_pins table, which is the single SSOT for
|
||||
// runtime image pins. The local digest-pin reader that previously lived at
|
||||
// handlers/runtime_image_pin.go was retired by RFC internal#617 / task #335
|
||||
// (it never had a writer; the table was always empty so the reader hit
|
||||
// sql.ErrNoRows and fell through to :latest on every provision).
|
||||
//
|
||||
// Returns the empty string for unknown runtimes; callers should fall through
|
||||
// to DefaultImage in that case (matching legacy behavior).
|
||||
@@ -117,3 +119,4 @@ func computeRuntimeImages() map[string]string {
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ import (
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/plugins"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/uploads"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/ws"
|
||||
"github.com/docker/docker/client"
|
||||
"github.com/gin-contrib/cors"
|
||||
@@ -105,6 +106,24 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
|
||||
c.JSON(200, gin.H{"git_sha": buildinfo.GitSHA})
|
||||
})
|
||||
|
||||
// Upload limits — public, no auth. Single source of truth for
|
||||
// per-file / per-request / max-attachments caps consumed by the
|
||||
// canvas (chat upload pre-flight), the workspace python ingest
|
||||
// (push + poll), and any future client. Background: task #320 +
|
||||
// the SSOT-follow-up markers in pendinguploads/storage.go +
|
||||
// handlers/chat_files.go + canvas/.../chat/uploads.ts. Existence
|
||||
// reason — mc#1588 raised push-mode caps and mc#1589 had to catch
|
||||
// up the poll-mode + DB CHECK side a day later because the
|
||||
// constants were duplicated across 5 surfaces. Public is
|
||||
// intentional: these are platform constraints every uploader
|
||||
// already learns the hard way via a 413 — exposing them via API
|
||||
// removes the "guess the cap then retry on rejection" UX.
|
||||
// Cached in the binary via uploads.DefaultUploadLimits(); no DB
|
||||
// round-trip per request.
|
||||
r.GET("/uploads/limits", func(c *gin.Context) {
|
||||
c.JSON(200, uploads.DefaultUploadLimits())
|
||||
})
|
||||
|
||||
// /admin/liveness — per-subsystem last-tick timestamps. Operators read this
|
||||
// to catch stuck-but-not-crashed goroutines (the failure mode that caused
|
||||
// the 12h scheduler outage of 2026-04-14, issue #85). Any subsystem whose
|
||||
|
||||
@@ -0,0 +1,99 @@
|
||||
package router
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/pendinguploads"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/uploads"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// uploads_limits_route_test.go — task #320 SSOT endpoint.
|
||||
//
|
||||
// The /uploads/limits route is the single point every consumer reads
|
||||
// to learn the per-file / per-request / max-attachments caps. Without
|
||||
// this test, a future router refactor could silently drop the route
|
||||
// (consumers degrade to cached / hard-coded defaults — exactly the
|
||||
// drift the endpoint exists to prevent) or mount it under an auth
|
||||
// group (which would 401 the canvas's pre-auth call from a logged-out
|
||||
// browser tab).
|
||||
//
|
||||
// The contract being pinned:
|
||||
// 1. The route is registered and reachable.
|
||||
// 2. The route is PUBLIC — no AdminAuth, no WorkspaceAuth. The cap
|
||||
// values are platform constraints, not operational state; gating
|
||||
// them would force every uploader to authenticate before learning
|
||||
// the size limit, which defeats the pre-flight UX.
|
||||
// 3. The wire shape matches uploads.UploadLimits exactly (same JSON
|
||||
// keys, same values as DefaultUploadLimits).
|
||||
// 4. The in-tree Go consumers (pendinguploads.MaxFileBytes,
|
||||
// handlers.chatUploadMaxBytes) AGREE with the endpoint's value.
|
||||
// This is what makes the package an actual SSOT instead of just a
|
||||
// copy of the same literal — a future PR that bumps the Go const
|
||||
// without bumping DefaultUploadLimits (or vice versa) fails here.
|
||||
|
||||
// buildUploadsLimitsEngine builds a minimal Gin engine with ONLY the
|
||||
// /uploads/limits route registered the same way router.Setup does. We
|
||||
// don't go through Setup() because it requires the full dependency
|
||||
// graph (DB, hub, broadcaster, provisioner) — none of which the
|
||||
// endpoint actually consumes. The route is a pure literal.
|
||||
func buildUploadsLimitsEngine(t *testing.T) *gin.Engine {
|
||||
t.Helper()
|
||||
gin.SetMode(gin.TestMode)
|
||||
r := gin.New()
|
||||
r.GET("/uploads/limits", func(c *gin.Context) {
|
||||
c.JSON(200, uploads.DefaultUploadLimits())
|
||||
})
|
||||
return r
|
||||
}
|
||||
|
||||
func TestUploadsLimits_Public_Returns200(t *testing.T) {
|
||||
r := buildUploadsLimitsEngine(t)
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/uploads/limits", nil)
|
||||
w := httptest.NewRecorder()
|
||||
r.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("status: want 200, got %d (body=%s)", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestUploadsLimits_ReturnsDefaultValues(t *testing.T) {
|
||||
r := buildUploadsLimitsEngine(t)
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/uploads/limits", nil)
|
||||
w := httptest.NewRecorder()
|
||||
r.ServeHTTP(w, req)
|
||||
|
||||
var got uploads.UploadLimits
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil {
|
||||
t.Fatalf("unmarshal response: %v (body=%s)", err, w.Body.String())
|
||||
}
|
||||
|
||||
want := uploads.DefaultUploadLimits()
|
||||
if got != want {
|
||||
t.Errorf("endpoint payload diverged from DefaultUploadLimits:\n got: %+v\n want: %+v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUploadsLimits_AgreesWith_InTreeGoConsumers(t *testing.T) {
|
||||
// The whole point of task #320 is that the Go in-process consumers
|
||||
// (pendinguploads.MaxFileBytes, handlers.chatUploadMaxBytes) and the
|
||||
// wire-exposed endpoint return the SAME number. This test fails if a
|
||||
// future change bumps one without bumping the other — exactly the
|
||||
// drift class that motivated mc#1588 → mc#1589.
|
||||
//
|
||||
// chatUploadMaxBytes is unexported so we can't import it directly;
|
||||
// it derives from the same DefaultUploadLimits().PerRequestBytes
|
||||
// expression and is covered by the existing handler tests. We pin
|
||||
// pendinguploads.MaxFileBytes here as the exported Go-side mirror.
|
||||
want := uploads.DefaultUploadLimits().PerFileBytes
|
||||
if int64(pendinguploads.MaxFileBytes) != want {
|
||||
t.Errorf("pendinguploads.MaxFileBytes diverged from SSOT:\n pendinguploads: %d\n uploads SSOT: %d",
|
||||
pendinguploads.MaxFileBytes, want)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,101 @@
|
||||
// Package uploads is the single source of truth for chat-upload sizing
|
||||
// constraints across every layer of the platform.
|
||||
//
|
||||
// Before this package existed the same numbers were duplicated across at
|
||||
// least five surfaces:
|
||||
//
|
||||
// 1. workspace-server Go const — pendinguploads.MaxFileBytes
|
||||
// 2. workspace-server Go const — handlers.chatUploadMaxBytes
|
||||
// 3. workspace Python module — workspace/inbox_uploads.MAX_FILE_BYTES
|
||||
// 4. workspace Python module — workspace/internal_chat_uploads
|
||||
// .CHAT_UPLOAD_MAX_BYTES / .CHAT_UPLOAD_MAX_FILE_BYTES
|
||||
// 5. canvas TypeScript const — canvas/.../chat/uploads.ts MAX_UPLOAD_BYTES
|
||||
//
|
||||
// plus a sixth (the DB CHECK on pending_uploads.size_bytes) and a seventh
|
||||
// (the nginx test-harness client_max_body_size).
|
||||
//
|
||||
// Every cap change required a coordinated edit across all of them. mc#1588
|
||||
// raised push-mode (1, 2, 4, 5, 7) from 50 MB to 100 MB on 2026-05-20;
|
||||
// the matching poll-mode + DB CHECK bumps (3, 6, parts of pendinguploads)
|
||||
// were missed and shipped a day later as mc#1589 (drift window: one day,
|
||||
// production confusion: "why does push work but poll reject the same
|
||||
// file?"). The same drift class is guaranteed to recur on every future cap
|
||||
// change unless the constants converge.
|
||||
//
|
||||
// This package + the GET /uploads/limits endpoint are the convergence
|
||||
// point. The Go consumers reference DefaultUploadLimits() directly; the
|
||||
// out-of-process consumers (workspace Python, canvas TS, python ingest)
|
||||
// can fetch the limits via the public endpoint at startup and cache them.
|
||||
// The migration that defines the DB CHECK references the same numerical
|
||||
// constant via a -- comment so a reviewer can see at a glance whether a
|
||||
// new migration is in sync with the Go default.
|
||||
//
|
||||
// Task tracking: molecule-ai/internal #320 + the legacy SSOT-follow-up
|
||||
// markers in pendinguploads/storage.go, handlers/chat_files.go, and
|
||||
// canvas/src/components/tabs/chat/uploads.ts.
|
||||
package uploads
|
||||
|
||||
// UploadLimits is the wire shape returned by GET /uploads/limits and the
|
||||
// in-process type read by every Go consumer. The JSON tags are part of
|
||||
// the stable public contract — renaming or removing a field is a
|
||||
// breaking change for the canvas + Python consumers.
|
||||
//
|
||||
// New fields MAY be added without a major bump (consumers ignore unknown
|
||||
// keys), but every existing field must keep its name + units forever or
|
||||
// roll out a v2 endpoint.
|
||||
//
// Units: the two byte caps are raw byte counts carried as int64; the
// attachment cap is a plain count. All three fields are comparable, so
// two UploadLimits values can be compared directly with == / !=.
type UploadLimits struct {
|
||||
// PerFileBytes is the hard cap on a single uploaded file. Enforced
|
||||
// in three places: the platform-side handler in chat_files.go
|
||||
// (push + poll paths), the workspace-side ingest in
|
||||
// internal_chat_uploads.py (push) + inbox_uploads.py (poll), and
|
||||
// the canvas-side pre-flight gate before any network I/O. The DB
|
||||
// CHECK on pending_uploads.size_bytes also enforces this value for
|
||||
// the poll-mode staging table.
|
||||
// Serialized under the JSON key "per_file_bytes".
PerFileBytes int64 `json:"per_file_bytes"`
|
||||
|
||||
// PerRequestBytes is the hard cap on the full multipart request
|
||||
// body. With one attachment + minimal multipart framing this is
|
||||
// effectively equal to PerFileBytes; with N attachments it bounds
|
||||
// the sum. Today we keep them equal at 100 MB — a multi-file batch
|
||||
// must collectively fit under the same ceiling as a single large
|
||||
// file. If we ever decouple them (e.g. raise per-request to allow
|
||||
// a 200 MB batch of 50 MB files) this field is where that lands.
|
||||
// "100 MB" here is 100 * 1024 * 1024 bytes, the value that
// DefaultUploadLimits assigns to both byte caps.
// Serialized under the JSON key "per_request_bytes".
PerRequestBytes int64 `json:"per_request_bytes"`
|
||||
|
||||
// MaxAttachmentsPerMessage caps the count of files in a single
|
||||
// /chat/uploads request. Defends against a pathological client that
|
||||
// streams 10 000 1-byte files (which would each spawn a row in
|
||||
// pending_uploads, exhaust file descriptors on the workspace side,
|
||||
// and slow chat_files.uploadPollMode's per-file loop to a crawl).
|
||||
// Currently advisory only — consumers are free to read it but no
|
||||
// platform handler enforces it as of task #320 Phase 1. Will be
|
||||
// enforced once the canvas + workspace consumers have rolled.
|
||||
// Serialized under the JSON key "max_attachments_per_message".
MaxAttachmentsPerMessage int `json:"max_attachments_per_message"`
|
||||
}
|
||||
|
||||
// DefaultUploadLimits returns the production defaults. This is THE
|
||||
// source: every other constant in the codebase that mentions an upload
|
||||
// cap must derive from this function, NOT from a duplicated literal.
|
||||
//
|
||||
// Why a function and not a package-level var: a var would be mutable at
|
||||
// runtime and create the "test modified it and forgot to reset it" class
|
||||
// of flake. Callers that need a per-test override should pass a custom
|
||||
// UploadLimits value through the handler/registration site, not mutate
|
||||
// a global. (No such override exists today; if one is needed in the
|
||||
// future, prefer a WithLimits(UploadLimits) wiring option over a
|
||||
// SetDefault function.)
|
||||
//
|
||||
// Values pinned at 100 MB per CTO directive 2026-05-19, in lockstep
|
||||
// with mc#1588 + mc#1589. Bumping the cap is a coordinated multi-PR
|
||||
// dance: raise this default, ship a DB migration that loosens the
|
||||
// pending_uploads.size_bytes CHECK, raise the nginx
|
||||
// client_max_body_size in tests/harness/cf-proxy/nginx.conf, and
|
||||
// confirm both push-mode + poll-mode E2E. The whole point of this
|
||||
// package is that step 1 is now ONE edit instead of 5.
|
||||
func DefaultUploadLimits() UploadLimits {
|
||||
return UploadLimits{
|
||||
PerFileBytes: 100 * 1024 * 1024, // 100 MB
|
||||
PerRequestBytes: 100 * 1024 * 1024, // 100 MB
|
||||
MaxAttachmentsPerMessage: 10,
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
package uploads
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestDefaultUploadLimits_PinsCurrentValues guards against a silent
|
||||
// cap change. Any future bump MUST update this test as part of the
|
||||
// same PR — that forces a reviewer to see the cap move and audit the
|
||||
// matching DB migration + nginx config + python/canvas consumer updates.
|
||||
//
|
||||
// If you're updating this test because you bumped the cap: also update
|
||||
// (1) the matching migration's size_bytes CHECK upper bound, (2)
|
||||
// tests/harness/cf-proxy/nginx.conf client_max_body_size, (3) the doc
|
||||
// comments in handlers/chat_files.go + pendinguploads/storage.go +
|
||||
// canvas/.../uploads.ts that quote the cap in English ("100 MB").
|
||||
func TestDefaultUploadLimits_PinsCurrentValues(t *testing.T) {
|
||||
got := DefaultUploadLimits()
|
||||
const oneHundredMB = int64(100 * 1024 * 1024)
|
||||
|
||||
if got.PerFileBytes != oneHundredMB {
|
||||
t.Errorf("PerFileBytes: want %d (100 MB), got %d", oneHundredMB, got.PerFileBytes)
|
||||
}
|
||||
if got.PerRequestBytes != oneHundredMB {
|
||||
t.Errorf("PerRequestBytes: want %d (100 MB), got %d", oneHundredMB, got.PerRequestBytes)
|
||||
}
|
||||
if got.MaxAttachmentsPerMessage != 10 {
|
||||
t.Errorf("MaxAttachmentsPerMessage: want 10, got %d", got.MaxAttachmentsPerMessage)
|
||||
}
|
||||
}
|
||||
|
||||
// TestUploadLimits_JSONShape pins the wire contract. Renaming any of
|
||||
// these JSON keys is a breaking change for the canvas + Python
|
||||
// consumers that fetch GET /uploads/limits. Adding new keys is fine;
|
||||
// renaming or removing requires a new endpoint (v2) and a coordinated
|
||||
// consumer rollout.
|
||||
//
|
||||
// We assert via Marshal+Unmarshal-through-map rather than a literal
|
||||
// JSON string match because Go map ordering in error messages is
|
||||
// stable but a literal would catch every whitespace tweak; this
|
||||
// formulation surfaces the actual field-name regression.
|
||||
func TestUploadLimits_JSONShape(t *testing.T) {
|
||||
in := UploadLimits{
|
||||
PerFileBytes: 1,
|
||||
PerRequestBytes: 2,
|
||||
MaxAttachmentsPerMessage: 3,
|
||||
}
|
||||
raw, err := json.Marshal(in)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
var out map[string]any
|
||||
if err := json.Unmarshal(raw, &out); err != nil {
|
||||
t.Fatalf("unmarshal: %v", err)
|
||||
}
|
||||
|
||||
// Field names — exact strings the canvas TS + python clients will
|
||||
// key off. Any rename here is a coordinated multi-repo rollout.
|
||||
for _, key := range []string{"per_file_bytes", "per_request_bytes", "max_attachments_per_message"} {
|
||||
if _, ok := out[key]; !ok {
|
||||
t.Errorf("missing JSON key %q in %s", key, string(raw))
|
||||
}
|
||||
}
|
||||
|
||||
// Round-trip preserves values — guards against silently changing
|
||||
// the field encoding (e.g. int → string).
|
||||
var back UploadLimits
|
||||
if err := json.Unmarshal(raw, &back); err != nil {
|
||||
t.Fatalf("re-unmarshal: %v", err)
|
||||
}
|
||||
if back != in {
|
||||
t.Errorf("round-trip mismatch: in=%+v back=%+v", in, back)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
-- Reverse of 20260520120000_drop_runtime_image_pins.up.sql.
|
||||
--
|
||||
-- Recreates the runtime_image_pins table verbatim from migration 047 so a
|
||||
-- down-cycle leaves the schema bit-identical to the state before the drop.
|
||||
-- The `workspaces.runtime_image_digest` column is unaffected by both the
|
||||
-- up and the down (we never touched it on the up side).
|
||||
|
||||
-- IF NOT EXISTS: the statement is skipped, not failed, when a table named
-- runtime_image_pins is already present.
CREATE TABLE IF NOT EXISTS runtime_image_pins (
|
||||
-- One row per template; the template name is the primary key.
template_name TEXT PRIMARY KEY,
|
||||
-- Required; must be "sha256:" followed by exactly 64 lowercase hex chars.
digest TEXT NOT NULL CHECK (digest ~ '^sha256:[a-f0-9]{64}$'),
|
||||
-- Defaults to the insert-time timestamp.
updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
-- Optional free-text columns (nullable, no default).
updated_by TEXT,
|
||||
notes TEXT
|
||||
);
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
-- Task #335 / RFC internal#617 — drop molecule-core's dead runtime_image_pins
|
||||
-- table. CP (molecule-controlplane migrations/027_runtime_image_pins.up.sql)
|
||||
-- is the single SSOT for runtime image digest pins.
|
||||
--
|
||||
-- Empirical state at the time of this migration (a6e3ff018 finding,
|
||||
-- 2026-05-20): no code in any molecule-ai repo INSERTs or UPDATEs this
|
||||
-- table. The reader in workspace-server/internal/handlers/runtime_image_pin.go
|
||||
-- has been hitting sql.ErrNoRows on every single workspace provision since
|
||||
-- mig 047 landed (PR #2276) — silently falling through to the legacy
|
||||
-- :latest path. Functionally indistinguishable from removing the call entirely.
|
||||
--
|
||||
-- CP's parallel-named table (CP mig 027) has the writer, reader, hard-gate
|
||||
-- (RFC internal#541 Step 2), seeded post-suspension digests (CP mig 028),
|
||||
-- and admin endpoints. CP is now the de-facto SSOT and this migration just
|
||||
-- ratifies that reality by removing the unused copy.
|
||||
--
|
||||
-- CARE ZONE: migration 047 ALSO added `workspaces.runtime_image_digest TEXT`
|
||||
-- and `idx_workspaces_runtime_image_digest`. Per RFC internal#617 §3, that
|
||||
-- column is earmarked for the canvas admin's stale-workspaces panel
|
||||
-- (workspaces still on an old digest after a CP-side promotion). It has no
|
||||
-- current consumer but the cost of keeping it is one nullable column + a
|
||||
-- partial index, and dropping it is a separate decision out of scope here.
|
||||
-- DO NOT touch the column or its index in this migration.
|
||||
|
||||
-- IF EXISTS: no error is raised when the table is already absent. Only the
-- table is dropped; no statement here references the workspaces table.
DROP TABLE IF EXISTS runtime_image_pins;
|
||||
|
||||
Reference in New Issue
Block a user