Compare commits


No commits in common. "main" and "fix/canvas-agent-comms-show-task-text" have entirely different histories.

306 changed files with 2742 additions and 28854 deletions

View File

@@ -1,591 +0,0 @@
#!/usr/bin/env python3
"""ci-required-drift — RFC internal#219 §4 + §6.
Detects drift between three sources of "what counts as a required check"
for this repo, and files (or updates) a `[ci-drift]` Gitea issue when any
pair diverges.
Sources:
A. `.gitea/workflows/ci.yml` jobs (the CI source: the actual job set)
B. `status_check_contexts` in branch_protections (the merge gate)
C. `REQUIRED_CHECKS` env in audit-force-merge.yml (the audit env)
Three failure classes:
F1 Job in (A) is not under the sentinel's `needs:` — sentinel
doesn't gate it, so a red job on that name can sneak through.
Ignores jobs whose `if:` references `github.event_name` (those
run only on specific events and may be `skipped` legitimately).
F2 Context in (B) corresponds to no emitter, i.e. there's no job
in ci.yml whose runtime status-name maps to that context.
A stale required-check name is silent: protection demands a
green it never receives, but Gitea treats absent-as-pending,
not absent-as-red. The gate degrades to advisory.
F3 (B) and (C) are not set-equal. If the audit env is wider than the
protection, the audit flags non-force-merges as force; if narrower,
real force-merges are missed.
Idempotency:
Searches OPEN issues by exact title prefix
`[ci-drift] {repo}/{branch}: ` and either edits the existing one
(if any) or POSTs a new one. Never spawns duplicates.
Behavior-based AST gate per `feedback_behavior_based_ast_gates`:
- Job set comes from PyYAML parse of jobs:* keys
- Sentinel needs from PyYAML parse of jobs[sentinel].needs (a list)
- Audit env from PyYAML parse, NOT grep, so reformatting the YAML
(block-scalar `|` vs flow-style list) does not break the gate
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import urllib.error
import urllib.parse
import urllib.request
from typing import Any
import yaml # PyYAML 6.0.2 — installed by the workflow before this runs.
# --------------------------------------------------------------------------
# Environment
# --------------------------------------------------------------------------
def env(key: str, *, required: bool = True, default: str | None = None) -> str:
val = os.environ.get(key, default)
if required and not val:
sys.stderr.write(f"::error::missing required env var: {key}\n")
sys.exit(2)
return val or ""
GITEA_TOKEN = env("GITEA_TOKEN", required=False)
GITEA_HOST = env("GITEA_HOST", required=False)
REPO = env("REPO", required=False)
BRANCHES = env("BRANCHES", required=False).split()
SENTINEL_JOB = env("SENTINEL_JOB", required=False)
AUDIT_WORKFLOW_PATH = env("AUDIT_WORKFLOW_PATH", required=False)
CI_WORKFLOW_PATH = env("CI_WORKFLOW_PATH", required=False)
DRIFT_LABEL = env("DRIFT_LABEL", required=False)
OWNER, NAME = (REPO.split("/", 1) + [""])[:2] if REPO else ("", "")
API = f"https://{GITEA_HOST}/api/v1" if GITEA_HOST else ""
def _require_runtime_env() -> None:
"""Enforce env contract — called from `main()` only. Tests import
individual functions without setting the full env contract."""
for key in (
"GITEA_TOKEN",
"GITEA_HOST",
"REPO",
"BRANCHES",
"SENTINEL_JOB",
"AUDIT_WORKFLOW_PATH",
"CI_WORKFLOW_PATH",
"DRIFT_LABEL",
):
if not os.environ.get(key):
sys.stderr.write(f"::error::missing required env var: {key}\n")
sys.exit(2)
# --------------------------------------------------------------------------
# Tiny HTTP helper (no requests dependency)
# --------------------------------------------------------------------------
class ApiError(RuntimeError):
"""Raised when a Gitea API call cannot be trusted to have succeeded.
Covers non-2xx HTTP status AND 2xx with an unparseable JSON body on
endpoints that are documented to return JSON (search/read). Callers
that swallow this and proceed would risk e.g. creating duplicate
`[ci-drift]` issues when a transient 500 hides an existing match.
The cron retries hourly: one fail-loud cycle is fine; silent
duplicate creation is not (per Five-Axis review on PR #112).
"""
def api(
method: str,
path: str,
*,
body: dict | None = None,
query: dict[str, str] | None = None,
expect_json: bool = True,
) -> tuple[int, Any]:
"""Tiny HTTP helper around urllib.
Raises ApiError on any non-2xx response. Callers that want
best-effort semantics (e.g. label-apply) must `try/except ApiError`
explicitly; making the failure-soft path opt-in rather than the
default closes the duplicate-issue regression class.
For 2xx responses with a JSON body that fails to parse, raises
ApiError when `expect_json=True` (the default for read-shaped
paths). On endpoints that legitimately return non-JSON success
bodies (e.g. some Gitea create echoes; see
`feedback_gitea_create_api_unparseable_response`), callers may pass
`expect_json=False` to accept a `_raw` fallthrough, but they MUST
then verify success via a follow-up GET, not by trusting the body.
"""
url = f"{API}{path}"
if query:
url = f"{url}?{urllib.parse.urlencode(query)}"
data = None
headers = {
"Authorization": f"token {GITEA_TOKEN}",
"Accept": "application/json",
}
if body is not None:
data = json.dumps(body).encode("utf-8")
headers["Content-Type"] = "application/json"
req = urllib.request.Request(url, method=method, data=data, headers=headers)
try:
with urllib.request.urlopen(req, timeout=30) as resp:
raw = resp.read()
status = resp.status
except urllib.error.HTTPError as e:
raw = e.read()
status = e.code
if not (200 <= status < 300):
snippet = raw[:500].decode("utf-8", errors="replace") if raw else ""
raise ApiError(
f"{method} {path} → HTTP {status}: {snippet}"
)
if not raw:
return status, None
try:
return status, json.loads(raw)
except json.JSONDecodeError as e:
if expect_json:
raise ApiError(
f"{method} {path} → HTTP {status} but body is not JSON: {e}"
) from e
# Opt-in raw fallthrough for endpoints with known echo-quirks.
return status, {"_raw": raw.decode("utf-8", errors="replace")}
# --------------------------------------------------------------------------
# YAML loaders — STRICT (reject GitHub-Actions-only syntax)
# --------------------------------------------------------------------------
def load_yaml(path: str) -> dict:
"""Load + parse a workflow YAML. Hard-fail if the file is missing
or doesn't parse — drift-detect cannot make decisions without
knowing the actual job set."""
if not os.path.exists(path):
sys.stderr.write(f"::error::file not found: {path}\n")
sys.exit(3)
with open(path, encoding="utf-8") as f:
try:
doc = yaml.safe_load(f)
except yaml.YAMLError as e:
sys.stderr.write(f"::error::YAML parse error in {path}: {e}\n")
sys.exit(3)
if not isinstance(doc, dict):
sys.stderr.write(f"::error::{path} is not a YAML mapping\n")
sys.exit(3)
return doc
def ci_jobs_all(ci_doc: dict) -> set[str]:
"""Every job key in ci.yml minus the sentinel itself. Used for F1b
(sentinel.needs typo check) needs that name a non-existent job
is a typo regardless of event-gating."""
jobs = ci_doc.get("jobs")
if not isinstance(jobs, dict):
sys.stderr.write("::error::ci.yml has no jobs: mapping\n")
sys.exit(3)
return {k for k in jobs if k != SENTINEL_JOB}
def ci_job_names(ci_doc: dict) -> set[str]:
"""Set of job keys in ci.yml MINUS the sentinel itself MINUS jobs
whose `if:` gates on `github.event_name` (those are event-scoped
and can legitimately be `skipped` for a given trigger; if we
required them under the sentinel `needs:`, every PR-only job
would be `skipped` on push and the sentinel would interpret
`skipped != success` as failure). RFC §4 spec.
Used for F1 (jobs missing from sentinel needs). NOT used for F1b
(typos in needs) see `ci_jobs_all` for that."""
jobs = ci_doc.get("jobs")
if not isinstance(jobs, dict):
sys.stderr.write("::error::ci.yml has no jobs: mapping\n")
sys.exit(3)
names: set[str] = set()
for k, v in jobs.items():
if k == SENTINEL_JOB:
continue
if isinstance(v, dict):
gate = v.get("if")
if isinstance(gate, str) and "github.event_name" in gate:
continue
names.add(k)
return names
def sentinel_needs(ci_doc: dict) -> set[str]:
sentinel = ci_doc.get("jobs", {}).get(SENTINEL_JOB)
if not isinstance(sentinel, dict):
sys.stderr.write(
f"::error::sentinel job '{SENTINEL_JOB}' not found in {CI_WORKFLOW_PATH}\n"
)
sys.exit(3)
needs = sentinel.get("needs", [])
if isinstance(needs, str):
needs = [needs]
if not isinstance(needs, list):
sys.stderr.write("::error::sentinel `needs:` is neither list nor string\n")
sys.exit(3)
return set(needs)
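# Both YAML spellings below normalize to {"lint"} via the str->list shim
# in sentinel_needs() (sketch):
#     needs: lint        # YAML scalar
#     needs: [lint]      # YAML flow list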
def required_checks_env(audit_doc: dict) -> set[str]:
"""Pull the REQUIRED_CHECKS env value from audit-force-merge.yml.
Walks the YAML AST per `feedback_behavior_based_ast_gates`: we do
NOT grep for `REQUIRED_CHECKS:`, which breaks under reformatting,
multi-job workflows, or a future move of the env to a different
step. Instead, look inside every job's every step's `env:` map."""
found: list[str] = []
jobs = audit_doc.get("jobs", {})
if not isinstance(jobs, dict):
sys.stderr.write(f"::warning::{AUDIT_WORKFLOW_PATH} has no jobs: mapping\n")
return set()
for job in jobs.values():
if not isinstance(job, dict):
continue
for step in job.get("steps", []) or []:
if not isinstance(step, dict):
continue
step_env = step.get("env") or {}
if isinstance(step_env, dict) and "REQUIRED_CHECKS" in step_env:
v = step_env["REQUIRED_CHECKS"]
if isinstance(v, str):
found.append(v)
if not found:
sys.stderr.write(
f"::error::REQUIRED_CHECKS env not found in any step of {AUDIT_WORKFLOW_PATH}\n"
)
sys.exit(3)
if len(found) > 1:
# Defensive: refuse to guess which one is canonical.
sys.stderr.write(
f"::error::REQUIRED_CHECKS env present in {len(found)} steps; ambiguous\n"
)
sys.exit(3)
raw = found[0]
# YAML block-scalars (`|`) leave a trailing newline + blanks; trim
# consistently with audit-force-merge.sh's parser so both sides
# produce identical sets.
return {line.strip() for line in raw.splitlines() if line.strip()}
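# Sketch of the block-scalar trim above (values illustrative). This YAML:
#
#     env:
#       REQUIRED_CHECKS: |
#         ci / lint (pull_request)
#         ci / test (pull_request)
#
# parses to one string with a trailing newline; splitlines() + strip()
# yields {"ci / lint (pull_request)", "ci / test (pull_request)"} regardless
# of trailing blanks, matching audit-force-merge.sh's parser.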
# --------------------------------------------------------------------------
# Mapping: ci.yml job-key → protection context name
# --------------------------------------------------------------------------
def expected_context(job_key: str, workflow_name: str = "ci") -> str:
"""Gitea Actions reports status-check contexts as
"{workflow.name} / {job.name or job.key} ({event})".
For ci.yml the event is `pull_request` on PRs (that's what
`status_check_contexts` records). Job.name defaults to job.key
when no `name:` is set. CP's ci.yml does NOT set per-job `name:`,
so the key equals the human name."""
return f"{workflow_name} / {job_key} (pull_request)"
# --------------------------------------------------------------------------
# Drift detection
# --------------------------------------------------------------------------
def detect_drift(branch: str) -> tuple[list[str], dict]:
"""Returns (findings, debug). Empty findings == no drift."""
findings: list[str] = []
ci_doc = load_yaml(CI_WORKFLOW_PATH)
audit_doc = load_yaml(AUDIT_WORKFLOW_PATH)
jobs = ci_job_names(ci_doc)
jobs_all = ci_jobs_all(ci_doc)
needs = sentinel_needs(ci_doc)
env_set = required_checks_env(audit_doc)
# Protection
# api() raises ApiError on non-2xx; let it propagate so a transient
# 500 fails the run loudly rather than producing a "no drift" lie.
_, protection = api("GET", f"/repos/{OWNER}/{NAME}/branch_protections/{branch}")
if not isinstance(protection, dict):
sys.stderr.write(
f"::error::protection response for {branch} not a JSON object\n"
)
sys.exit(4)
contexts = set(protection.get("status_check_contexts") or [])
# ----- F1: job exists in CI but not under sentinel.needs -----
missing_from_needs = sorted(jobs - needs)
if missing_from_needs:
findings.append(
"F1 — jobs in ci.yml NOT under sentinel `needs:` (sentinel doesn't gate them):\n"
+ "\n".join(f" - {n}" for n in missing_from_needs)
)
# ----- F1b: needs lists a job that doesn't exist (typo) -----
# Compare against jobs_all (incl. event-gated jobs); a typo is a
# typo regardless of `if:` gating.
stale_needs = sorted(needs - jobs_all)
if stale_needs:
findings.append(
"F1b — sentinel `needs:` lists jobs NOT present in ci.yml (typo or removed job):\n"
+ "\n".join(f" - {n}" for n in stale_needs)
)
# ----- F2: protection context has no emitting job -----
# Compute the contexts the CI YAML actually produces. The sentinel
# is in (B) intentionally (`ci / all-required (pull_request)`); we
# whitelist it explicitly.
emitted_contexts = {expected_context(j) for j in jobs} | {expected_context(SENTINEL_JOB)}
# Contexts NOT produced by ci.yml may still come from other
# workflows in the repo (Secret scan etc). We can't enumerate
# every workflow's emissions cheaply; instead, flag only contexts
# whose prefix is `ci / ` (this workflow's emissions) and which
# don't appear in `emitted_contexts`. This narrows F2 to the
# failure class the RFC actually targets without producing noise
# from cross-workflow emitters.
stale_protection = sorted(
c for c in contexts if c.startswith("ci / ") and c not in emitted_contexts
)
if stale_protection:
findings.append(
"F2 — protection `status_check_contexts` entries with `ci / ` prefix that NO "
"job in ci.yml emits (stale name → silent advisory gate):\n"
+ "\n".join(f" - {c}" for c in stale_protection)
)
# ----- F3: audit env vs protection contexts (set-equal) -----
only_in_env = sorted(env_set - contexts)
only_in_protection = sorted(contexts - env_set)
if only_in_env:
findings.append(
"F3a — audit-force-merge.yml `REQUIRED_CHECKS` env has contexts NOT in "
f"branch_protections/{branch}.status_check_contexts (audit would flag "
"non-force-merges as force):\n"
+ "\n".join(f" - {c}" for c in only_in_env)
)
if only_in_protection:
findings.append(
"F3b — branch_protections/{br}.status_check_contexts has contexts NOT in "
"audit-force-merge.yml `REQUIRED_CHECKS` env (real force-merges would be "
"missed):\n".format(br=branch)
+ "\n".join(f" - {c}" for c in only_in_protection)
)
debug = {
"branch": branch,
"ci_jobs": sorted(jobs),
"sentinel_needs": sorted(needs),
"protection_contexts": sorted(contexts),
"audit_env_checks": sorted(env_set),
"expected_contexts": sorted(emitted_contexts),
}
return findings, debug
# --------------------------------------------------------------------------
# Issue file/update
# --------------------------------------------------------------------------
def title_for(branch: str) -> str:
# Idempotency key — keep stable, never include timestamp/SHA.
return f"[ci-drift] {REPO}/{branch}: required-checks divergence detected"
def find_open_issue(title: str) -> dict | None:
"""Return the existing open `[ci-drift]` issue for `title`, or None.
`None` means "search succeeded, no match" NOT "search failed".
Per Five-Axis review on PR #112: returning None on a transient API
error caused the caller to POST a duplicate issue. Now api() raises
ApiError on any non-2xx; we let it propagate. The cron retries
hourly; failing one cycle loudly is strictly better than silently
duplicating.
Gitea issue search returns at most 50 results per page; one page is
enough as long as `[ci-drift]` issues are a tiny minority. (See
follow-up issue for Link-header pagination.)
"""
_, results = api(
"GET",
f"/repos/{OWNER}/{NAME}/issues",
query={"state": "open", "type": "issues", "limit": "50"},
)
if not isinstance(results, list):
raise ApiError(
f"issue search returned non-list body (got {type(results).__name__})"
)
for issue in results:
if issue.get("title") == title:
return issue
return None
def render_body(branch: str, findings: list[str], debug: dict) -> str:
body = [
f"# Drift detected on `{REPO}/{branch}`",
"",
"Auto-filed by `.gitea/workflows/ci-required-drift.yml` "
"(RFC [internal#219](https://git.moleculesai.app/molecule-ai/internal/issues/219) §4 + §6).",
"",
"## Findings",
"",
]
body.extend(findings)
body.extend(
[
"",
"## Resolution",
"",
"- **F1 / F1b**: add the missing job to `all-required.needs:` "
"in `.gitea/workflows/ci.yml`, or remove the stale entry.",
"- **F2**: rename the protection context to match an emitter, "
"or remove it from `status_check_contexts` "
"(PATCH `/api/v1/repos/{owner}/{repo}/branch_protections/{branch}`).",
"- **F3a / F3b**: bring `REQUIRED_CHECKS` env in "
"`.gitea/workflows/audit-force-merge.yml` into set-equality with "
"`status_check_contexts` (single PR, both files).",
"",
"## Debug",
"",
"```json",
json.dumps(debug, indent=2, sort_keys=True),
"```",
"",
"_This issue is idempotent: drift-detect runs hourly at `:17` "
"and edits this body in place. Close the issue once the drift "
"is fixed; the next hourly run will reopen if drift returns._",
]
)
return "\n".join(body)
def file_or_update(
branch: str,
findings: list[str],
debug: dict,
*,
dry_run: bool = False,
) -> None:
"""File a new `[ci-drift]` issue, or PATCH the existing one in place.
`dry_run=True` skips every side-effecting Gitea call (issue
search, POST, PATCH, label apply) and prints the would-be issue
title + body to stdout. Useful for local testing and for
debugging drift output without polluting the issue tracker.
"""
title = title_for(branch)
body = render_body(branch, findings, debug)
if dry_run:
print(f"::notice::[dry-run] would file/update drift issue for {branch}")
print(f"::group::[dry-run] title")
print(title)
print(f"::endgroup::")
print(f"::group::[dry-run] body")
print(body)
print(f"::endgroup::")
return
existing = find_open_issue(title)
if existing:
num = existing["number"]
api(
"PATCH",
f"/repos/{OWNER}/{NAME}/issues/{num}",
body={"body": body},
)
print(f"::notice::Updated existing drift issue #{num} for {branch}")
return
_, created = api(
"POST",
f"/repos/{OWNER}/{NAME}/issues",
body={"title": title, "body": body, "labels": []},
)
if not isinstance(created, dict):
sys.stderr.write("::error::POST issue response not a JSON object\n")
sys.exit(5)
new_num = created.get("number")
print(f"::warning::Filed new drift issue #{new_num} for {branch}")
# Apply the drift label (Gitea's add-labels endpoint accepts label IDs;
# look up id by name once). Best-effort: failure to label is logged
# but does not fail the audit run — the issue itself IS the alarm.
try:
_, labels = api("GET", f"/repos/{OWNER}/{NAME}/labels")
except ApiError as e:
sys.stderr.write(f"::warning::could not list labels: {e}\n")
return
label_id = None
if isinstance(labels, list):
for lbl in labels:
if lbl.get("name") == DRIFT_LABEL:
label_id = lbl.get("id")
break
if label_id is not None and new_num:
try:
api(
"POST",
f"/repos/{OWNER}/{NAME}/issues/{new_num}/labels",
body={"labels": [label_id]},
)
except ApiError as e:
sys.stderr.write(
f"::warning::could not apply label '{DRIFT_LABEL}' to #{new_num}: {e}\n"
)
else:
sys.stderr.write(f"::warning::label '{DRIFT_LABEL}' not found on repo\n")
# --------------------------------------------------------------------------
# Main
# --------------------------------------------------------------------------
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
p = argparse.ArgumentParser(
prog="ci-required-drift",
description="Detect drift between ci.yml, branch_protections, "
"and audit-force-merge.yml REQUIRED_CHECKS env.",
)
p.add_argument(
"--dry-run",
action="store_true",
help="Detect + print findings to stdout; do NOT file or PATCH "
"the `[ci-drift]` issue. Useful for local testing and for "
"previewing output before turning the workflow loose.",
)
return p.parse_args(argv)
def main(argv: list[str] | None = None) -> int:
args = _parse_args(argv)
_require_runtime_env()
for branch in BRANCHES:
findings, debug = detect_drift(branch)
if findings:
print(f"::warning::Drift detected on {branch}:")
for f in findings:
print(f)
file_or_update(branch, findings, debug, dry_run=args.dry_run)
else:
print(f"::notice::No drift on {branch}.")
print(json.dumps(debug, indent=2, sort_keys=True))
# Exit 0 even on drift — the issue IS the alarm, not a red workflow.
# A red workflow here would page on a CI rename until the issue is
# opened, doubling the noise. The issue itself is the actionable
# surface. (`api()` raising ApiError is the only path that exits
# non-zero, by design: a transient Gitea outage should fail loudly.)
return 0
if __name__ == "__main__":
sys.exit(main())

View File

@@ -1,589 +0,0 @@
#!/usr/bin/env python3
"""main-red-watchdog — Option C of the "main NEVER goes red" directive.
Tracking: molecule-core#420.
What it does (one cron tick):
1. GET /api/v1/repos/{owner}/{repo}/branches/{watch_branch}
current HEAD SHA on the watched branch.
2. GET /api/v1/repos/{owner}/{repo}/commits/{SHA}/status
combined status + per-context statuses.
3. If combined state is `failure` (or any individual status is
`failure`): open or PATCH an idempotent
`[main-red] {repo}: {SHA[:10]}` issue. Body lists each failed
status context with `target_url` + `description`.
4. If combined state is `success`: close any open `[main-red]
{repo}: ...` issue on a previous SHA with a
"main returned to green at SHA {current_SHA}" comment.
5. Emit one Loki-shaped JSON line via `logger -t main-red-watchdog`
so `reference_obs_stack_phase1`'s Vector → Loki path ingests an
alert event (queryable in Grafana as
`{tenant="operator-host"} |~ "main-red-watchdog"`).
What it does NOT do:
- Auto-revert anything. Option B is explicitly rejected per
`feedback_no_such_thing_as_flakes` + `feedback_fix_root_not_symptom`.
- Page on its own failures. If api() raises ApiError (transient
Gitea outage), the workflow run fails LOUDLY by re-raising; exactly
the contract `feedback_api_helper_must_raise_not_return_dict`
enforces. Silent fallthrough would re-introduce the duplicate-issue
regression class.
- Exit non-zero on RED. The issue IS the alarm; failing the watchdog
on red would double-page (red workflow + open issue) and create
silent-loop risk if the watchdog itself flakes.
Idempotency strategy:
Title is keyed on `{SHA[:10]}` (commit-scoped), NOT just `main`.
Rationale:
- A fix-forward changes HEAD, so the next cron tick sees a new SHA;
auto-close logic closes the prior `[main-red] OLD_SHA` issue and
(if the new HEAD is also red, e.g. a different test fails) files
a fresh `[main-red] NEW_SHA`. Lineage is preserved.
- A revert that happens to land back on a previously-red SHA
(rare) would refer to a CLOSED issue; the watchdog never reopens.
That's a deliberate trade-off — the operator will see the latest
open issue's `closed` event in the activity feed.
This module is import-safe: tests import individual functions without
invoking main(), so module-level reads use env-with-default and the
runtime contract enforcement lives in `_require_runtime_env()`.
Run locally (dry-run, no API mutation):
GITEA_TOKEN=... GITEA_HOST=git.moleculesai.app REPO=owner/repo \\
WATCH_BRANCH=main RED_LABEL=tier:high \\
python3 .gitea/scripts/main-red-watchdog.py --dry-run
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import subprocess
import sys
import urllib.error
import urllib.parse
import urllib.request
from typing import Any
# --------------------------------------------------------------------------
# Environment
# --------------------------------------------------------------------------
def _env(key: str, *, default: str = "") -> str:
"""Read an env var with a default. Module-import-safe — tests can
import this script without setting the full env contract."""
return os.environ.get(key, default)
GITEA_TOKEN = _env("GITEA_TOKEN")
GITEA_HOST = _env("GITEA_HOST")
REPO = _env("REPO")
WATCH_BRANCH = _env("WATCH_BRANCH", default="main")
RED_LABEL = _env("RED_LABEL", default="tier:high")
OWNER, NAME = (REPO.split("/", 1) + [""])[:2] if REPO else ("", "")
API = f"https://{GITEA_HOST}/api/v1" if GITEA_HOST else ""
# Title prefix — kept short and stable so the idempotency search can
# match by exact title without parsing.
TITLE_PREFIX = "[main-red]"
def _require_runtime_env() -> None:
"""Enforce env contract — called from `main()` only.
Tests import individual functions without setting the full env
contract. Mirrors the CP `ci-required-drift.py` pattern so the
runtime guard is a single chokepoint.
"""
for key in ("GITEA_TOKEN", "GITEA_HOST", "REPO", "WATCH_BRANCH", "RED_LABEL"):
if not os.environ.get(key):
sys.stderr.write(f"::error::missing required env var: {key}\n")
sys.exit(2)
# --------------------------------------------------------------------------
# Tiny HTTP helper — raises on non-2xx + on JSON-decode-of-expected-JSON.
# --------------------------------------------------------------------------
class ApiError(RuntimeError):
"""Raised when a Gitea API call cannot be trusted to have succeeded.
Covers non-2xx HTTP status AND 2xx with an unparseable JSON body on
endpoints documented to return JSON. Callers that swallow this and
proceed risk e.g. creating duplicate `[main-red]` issues when a
transient 500 hides an existing match. Per
`feedback_api_helper_must_raise_not_return_dict`: soft-failure is
opt-in via `expect_json=False`, never the default.
"""
def api(
method: str,
path: str,
*,
body: dict | None = None,
query: dict[str, str] | None = None,
expect_json: bool = True,
) -> tuple[int, Any]:
"""Tiny HTTP helper around urllib.
Raises ApiError on any non-2xx response, and on JSON-decode failure
when `expect_json=True` (the default for read-shaped paths). Mirrors
the CP ci-required-drift.py contract exactly so behaviour is
cross-checkable.
"""
url = f"{API}{path}"
if query:
url = f"{url}?{urllib.parse.urlencode(query)}"
data = None
headers = {
"Authorization": f"token {GITEA_TOKEN}",
"Accept": "application/json",
}
if body is not None:
data = json.dumps(body).encode("utf-8")
headers["Content-Type"] = "application/json"
req = urllib.request.Request(url, method=method, data=data, headers=headers)
try:
with urllib.request.urlopen(req, timeout=30) as resp:
raw = resp.read()
status = resp.status
except urllib.error.HTTPError as e:
raw = e.read()
status = e.code
if not (200 <= status < 300):
snippet = raw[:500].decode("utf-8", errors="replace") if raw else ""
raise ApiError(f"{method} {path} → HTTP {status}: {snippet}")
if not raw:
return status, None
try:
return status, json.loads(raw)
except json.JSONDecodeError as e:
if expect_json:
raise ApiError(
f"{method} {path} → HTTP {status} but body is not JSON: {e}"
) from e
# Opt-in raw fallthrough for endpoints with known echo-quirks
# (`feedback_gitea_create_api_unparseable_response`). Caller
# MUST verify success via a follow-up GET, not by trusting body.
return status, {"_raw": raw.decode("utf-8", errors="replace")}
# --------------------------------------------------------------------------
# Gitea reads
# --------------------------------------------------------------------------
def get_head_sha(branch: str) -> str:
"""HEAD SHA of `branch`. Raises ApiError on non-2xx."""
_, body = api("GET", f"/repos/{OWNER}/{NAME}/branches/{branch}")
if not isinstance(body, dict):
raise ApiError(f"branch {branch} response not a JSON object")
commit = body.get("commit")
if not isinstance(commit, dict):
raise ApiError(f"branch {branch} response missing `commit` object")
sha = commit.get("id") or commit.get("sha")
if not isinstance(sha, str) or len(sha) < 7:
raise ApiError(f"branch {branch} response has no usable commit SHA")
return sha
def get_combined_status(sha: str) -> dict:
"""Combined commit status for `sha`. Gitea returns:
{
"state": "success" | "failure" | "pending" | "error",
"statuses": [
{"context": "...", "state": "success|failure|pending|error",
"target_url": "...", "description": "..."},
...
],
...
}
Raises ApiError on non-2xx.
"""
_, body = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
if not isinstance(body, dict):
raise ApiError(f"status for {sha} response not a JSON object")
return body
def is_red(status: dict) -> tuple[bool, list[dict]]:
"""Return (is_red, failed_statuses).
A commit is "red" if combined state is `failure` OR any individual
status entry is in {`failure`, `error`}. `pending` and `success`
do not trip the watchdog: pending means CI is still running, and
that's the normal state immediately after a merge.
`failed_statuses` is the list of per-context entries whose own
`state` is in the red set; useful for the issue body.
"""
combined = status.get("state")
statuses = status.get("statuses") or []
red_states = {"failure", "error"}
failed = [
s for s in statuses
if isinstance(s, dict) and s.get("state") in red_states
]
return (combined in red_states or bool(failed), failed)
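# Sketch (values illustrative): a combined "failure" with one red context
# trips the watchdog; an all-pending status does not.
#
#     is_red({"state": "failure",
#             "statuses": [{"context": "ci / test (pull_request)",
#                           "state": "failure"}]})
#     # -> (True, [{"context": "ci / test (pull_request)",
#     #             "state": "failure"}])
#
#     is_red({"state": "pending", "statuses": []})
#     # -> (False, [])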
# --------------------------------------------------------------------------
# Issue file / update / close
# --------------------------------------------------------------------------
def title_for(sha: str) -> str:
"""Idempotency key — `[main-red] {repo}: {SHA[:10]}`.
Commit-scoped. A fix-forward to a new SHA produces a new title; the
prior issue auto-closes via `close_open_red_issues_for_other_shas`.
"""
return f"{TITLE_PREFIX} {REPO}: {sha[:10]}"
def list_open_red_issues() -> list[dict]:
"""All open issues whose title starts with `[main-red] {repo}: `.
Per Five-Axis review on CP#112 (`feedback_api_helper_must_raise_not_return_dict`):
api() raises on non-2xx; we let it propagate. Returning [] on a
transient 500 would cause auto-close to skip the cleanup AND the
file-or-update path to POST a duplicate exactly the regression
class the helper-raises contract closes.
Gitea issue search returns at most 50/page; we only need open
`[main-red]` issues, which are by design at most one at a time per repo,
so a single page is enough.
"""
_, results = api(
"GET",
f"/repos/{OWNER}/{NAME}/issues",
query={"state": "open", "type": "issues", "limit": "50"},
)
if not isinstance(results, list):
raise ApiError(
f"issue search returned non-list body (got {type(results).__name__})"
)
prefix = f"{TITLE_PREFIX} {REPO}: "
return [i for i in results if isinstance(i, dict)
and isinstance(i.get("title"), str)
and i["title"].startswith(prefix)]
def find_open_issue_for_sha(sha: str) -> dict | None:
"""Return the existing open `[main-red] {repo}: {SHA[:10]}` issue,
or None if no such issue is open.
`None` means "search succeeded, no match" NOT "search failed".
api() raises ApiError on any non-2xx; the caller can let that
propagate so a transient outage fails loudly instead of silently
duplicating.
"""
target = title_for(sha)
for issue in list_open_red_issues():
if issue.get("title") == target:
return issue
return None
def render_body(sha: str, failed: list[dict], debug: dict) -> str:
"""Issue body. Markdown. Mirrors CP#112's render_body shape."""
lines = [
f"# Main is RED on `{REPO}` at `{sha[:10]}`",
"",
f"Commit: <https://{GITEA_HOST}/{REPO}/commit/{sha}>",
"",
"Auto-filed by `.gitea/workflows/main-red-watchdog.yml` (Option C "
"of the [main-never-red directive]"
f"(https://{GITEA_HOST}/molecule-ai/molecule-core/issues/420)). "
"Per `feedback_no_such_thing_as_flakes` + "
"`feedback_fix_root_not_symptom`: investigate the root cause; do "
"NOT revert as a reflex. The watchdog itself never reverts.",
"",
"## Failed status contexts",
"",
]
if not failed:
lines.append(
"_(Combined state reported `failure`/`error` but no per-context "
"entries were in a red state. This usually means a CI emitter "
"set combined-status directly without a per-context status. "
"Check the most recent workflow run for `main` and trace from "
"there.)_"
)
else:
for s in failed:
ctx = s.get("context", "(no context)")
state = s.get("state", "(no state)")
url = s.get("target_url") or ""
desc = (s.get("description") or "").strip()
entry = f"- **{ctx}** — `{state}`"
if url:
entry += f" → [logs]({url})"
if desc:
entry += f"\n - {desc}"
lines.append(entry)
lines.extend([
"",
"## Resolution path",
"",
"1. Read the failed logs (links above).",
"2. If reproducible locally, fix forward in a PR targeting `main`.",
"3. If the failure is a real flake — STOP. Per "
"`feedback_no_such_thing_as_flakes`, intermittent failures are "
"real bugs. Investigate to root cause; do not mark as flake.",
"4. If the failure is blocking unrelated work for >1 hour, file a "
"follow-up issue and assign someone. Do NOT revert without a "
"human GO per `feedback_prod_apply_needs_hongming_chat_go` "
"(branch protection is a prod surface).",
"",
"## Debug",
"",
"```json",
json.dumps(debug, indent=2, sort_keys=True),
"```",
"",
"_This issue is idempotent: the watchdog runs hourly at `:05` "
"and edits this body in place. When `main` returns to green, the "
"watchdog will close this issue automatically with a "
"\"main returned to green\" comment._",
])
return "\n".join(lines)
def emit_loki_event(event_type: str, sha: str, failed_contexts: list[str]) -> None:
"""Emit a JSON line to syslog tag `main-red-watchdog` for
`reference_obs_stack_phase1` (Vector → Loki).
Best-effort: if `logger` isn't on PATH (e.g. local dev macOS without
util-linux logger), the syslog call is skipped; the stdout JSON line
below is still emitted. The Gitea Actions Ubuntu runner has
util-linux preinstalled.
Loki labels: the workflow runs on the Ubuntu runner where Vector is
NOT configured (Vector lives on the operator host + tenants per
`reference_obs_stack_phase1`). The Loki line is still emitted as
stdout JSON so the workflow log itself is parseable; treat the
syslog call as belt-and-braces for the cases where this script is
invoked from a host that DOES have Vector (e.g. operator-host cron
fallback in a follow-up PR).
"""
payload = {
"event_type": event_type,
"repo": REPO,
"sha": sha,
"failed_contexts": failed_contexts,
}
line = json.dumps(payload, sort_keys=True)
# Always print to stdout so the workflow log captures it (machine-
# readable; `gitea run logs` + Loki ingestion via the operator-host
# journald → Vector → Loki path will see this from runners that
# forward stdout). Loki query:
# {source="gitea-actions"} |~ "main_red_detected"
print(f"main-red-watchdog event: {line}")
# Best-effort syslog tag so a future "run from operator-host cron"
# path picks it up directly via the existing Vector pipeline.
if shutil.which("logger"):
try:
subprocess.run(
["logger", "-t", "main-red-watchdog", line],
check=False,
timeout=5,
)
except (OSError, subprocess.SubprocessError) as e:
sys.stderr.write(f"::warning::logger call failed: {e}\n")
def file_or_update_red(
sha: str,
failed: list[dict],
debug: dict,
*,
dry_run: bool = False,
) -> None:
"""Open a new `[main-red] {repo}: {SHA[:10]}` issue, or PATCH the
existing one's body. Idempotent by title."""
title = title_for(sha)
body = render_body(sha, failed, debug)
if dry_run:
print(f"::notice::[dry-run] would file/update main-red issue for {sha[:10]}")
print("::group::[dry-run] title")
print(title)
print("::endgroup::")
print("::group::[dry-run] body")
print(body)
print("::endgroup::")
return
existing = find_open_issue_for_sha(sha)
if existing:
num = existing["number"]
api("PATCH", f"/repos/{OWNER}/{NAME}/issues/{num}", body={"body": body})
print(f"::notice::Updated existing main-red issue #{num} for {sha[:10]}")
return
_, created = api(
"POST",
f"/repos/{OWNER}/{NAME}/issues",
body={"title": title, "body": body, "labels": []},
)
if not isinstance(created, dict):
raise ApiError("POST issue response not a JSON object")
new_num = created.get("number")
print(f"::warning::Filed new main-red issue #{new_num} for {sha[:10]}")
# Apply RED_LABEL by id. Gitea's add-labels endpoint takes IDs, not
# names (`feedback_gitea_label_delete_by_id` — same rule for add).
# Best-effort: label failure is logged but does not fail the run.
try:
_, labels = api("GET", f"/repos/{OWNER}/{NAME}/labels")
except ApiError as e:
sys.stderr.write(f"::warning::could not list labels: {e}\n")
return
label_id = None
if isinstance(labels, list):
for lbl in labels:
if isinstance(lbl, dict) and lbl.get("name") == RED_LABEL:
label_id = lbl.get("id")
break
if label_id is not None and new_num:
try:
api(
"POST",
f"/repos/{OWNER}/{NAME}/issues/{new_num}/labels",
body={"labels": [label_id]},
)
except ApiError as e:
sys.stderr.write(
f"::warning::could not apply label '{RED_LABEL}' to #{new_num}: {e}\n"
)
else:
sys.stderr.write(f"::warning::label '{RED_LABEL}' not found on repo\n")
def close_open_red_issues_for_other_shas(
current_sha: str,
*,
dry_run: bool = False,
) -> int:
"""When main is green at current_sha, close any open `[main-red]`
issues whose title references a different SHA. Returns the number
of issues closed.
Lineage note: we only close issues whose title prefix matches; if
a human renamed the issue or added a suffix this won't touch it.
That's intentional — manual editorial state takes precedence.
"""
target_title = title_for(current_sha)
open_red = list_open_red_issues()
closed = 0
for issue in open_red:
if issue.get("title") == target_title:
# Same SHA: an open red issue for a SHA that is now green
# shouldn't normally exist. Skip it defensively.
continue
num = issue.get("number")
if not isinstance(num, int):
continue
comment = (
f"`main` returned to green at SHA `{current_sha}` "
f"(<https://{GITEA_HOST}/{REPO}/commit/{current_sha}>). "
"Closing automatically. If the underlying root cause is "
"not yet understood, reopen this issue and file a "
"postmortem — green-by-flake is still a bug per "
"`feedback_no_such_thing_as_flakes`."
)
if dry_run:
print(f"::notice::[dry-run] would close issue #{num} ({issue.get('title')})")
closed += 1
continue
# Comment first, then close. Order matters: a closed issue can
# still receive comments, but the activity-feed ordering reads
# better with the explanation arriving just before the close.
api(
"POST",
f"/repos/{OWNER}/{NAME}/issues/{num}/comments",
body={"body": comment},
)
api(
"PATCH",
f"/repos/{OWNER}/{NAME}/issues/{num}",
body={"state": "closed"},
)
print(f"::notice::Closed main-red issue #{num} (green at {current_sha[:10]})")
closed += 1
return closed
# --------------------------------------------------------------------------
# Main
# --------------------------------------------------------------------------
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
p = argparse.ArgumentParser(
prog="main-red-watchdog",
description="Detect post-merge CI red on the watched branch and "
"file an idempotent issue. Option C of the main-never-red directive.",
)
p.add_argument(
"--dry-run",
action="store_true",
help="Detect + print the would-be issue title/body to stdout; do "
"NOT POST/PATCH/close any issues. Useful for local testing.",
)
return p.parse_args(argv)
def run_once(*, dry_run: bool = False) -> int:
"""One watchdog tick. Returns 0 on green or red-issue-filed; lets
ApiError propagate on transient outage (workflow run fails loudly,
which is correct per the helper-raises contract)."""
sha = get_head_sha(WATCH_BRANCH)
status = get_combined_status(sha)
red, failed = is_red(status)
debug = {
"branch": WATCH_BRANCH,
"sha": sha,
"combined_state": status.get("state"),
"failed_contexts": [s.get("context") for s in failed],
"all_contexts": [
{"context": s.get("context"), "state": s.get("state")}
for s in (status.get("statuses") or [])
if isinstance(s, dict)
],
}
if red:
failed_ctxs = [s.get("context") for s in failed if s.get("context")]
emit_loki_event("main_red_detected", sha, failed_ctxs)
print(f"::warning::main is RED at {sha[:10]} on {WATCH_BRANCH}: "
f"{len(failed)} failed context(s)")
file_or_update_red(sha, failed, debug, dry_run=dry_run)
else:
# Green (or pending — pending is treated as not-red so we don't
# spam during the post-merge CI window). Close any stale issues
# from earlier SHAs only when we're actually green; pending
# means CI hasn't finished and the prior issue might still be
# accurate.
if status.get("state") == "success":
closed = close_open_red_issues_for_other_shas(sha, dry_run=dry_run)
if closed:
emit_loki_event(
"main_returned_to_green", sha,
[],
)
print(f"::notice::main is GREEN at {sha[:10]} on {WATCH_BRANCH} "
f"(closed {closed} stale issue(s))")
else:
print(f"::notice::main is PENDING at {sha[:10]} on {WATCH_BRANCH} "
f"(combined state={status.get('state')!r}; no action)")
return 0
def main(argv: list[str] | None = None) -> int:
args = _parse_args(argv)
_require_runtime_env()
return run_once(dry_run=args.dry_run)
if __name__ == "__main__":
sys.exit(main())

View File

@@ -1,26 +1,10 @@
#!/usr/bin/env bash
# sop-tier-check — verify a Gitea PR satisfies the §SOP-6 approval gate.
#
# Reads the PR's tier label, walks approving reviewers, and checks team
# membership against the tier's approval expression. Passes only when
# ALL clauses in the expression are satisfied by the set of approving
# reviewers (AND-composition; internal#189).
#
# Expression syntax:
# "team-a" — OR-set: any ONE of the comma-separated teams
# "team-a AND team-b" — AND: BOTH must each have ≥1 approver
# "(a,b,c)" — OR-set wrapped in parens; same as "a,b,c"
#
# Example: "qa AND security AND (managers,ceo)" means:
# ≥1 approver in team "qa" AND
# ≥1 approver in team "security" AND
# ≥1 approver in team "managers" OR "ceo"
#
# Per the spec (internal#189), the hard gate here pairs with the
# advisory gate of sop-conformance LLM-judge (internal#188): each
# required-team click must reflect real verification (visible in review
# body or A2A messages), not rubber-stamp APPROVE. Both gates together
# close the "teammate clicks APPROVE without verifying" gap.
# Reads the PR's tier label, walks approving reviewers, and checks each
# approver's Gitea team membership against the tier's eligible-team set.
# Marks pass only when at least one non-author approver is in an eligible
# team.
#
# Invoked from `.gitea/workflows/sop-tier-check.yml`. The workflow sets
# the env vars below; this script does no IO outside of stdout/stderr +
@@ -35,48 +19,17 @@
# PR_AUTHOR — login (from github.event.pull_request.user.login)
#
# Optional:
# SOP_DEBUG=1 — print per-API-call diagnostic lines. Default: off.
# SOP_LEGACY_CHECK=1 — revert to OR-gate (≥1 approver from any eligible
# team). Grace window for PRs in-flight when the
# new AND-composition was deployed. Expires 2026-05-17
# (7-day burn-in window; internal#189 Phase 1).
# Set by workflow for PRs merged before the deploy.
# SOP_DEBUG=1 — print per-API-call diagnostic lines (HTTP codes,
# raw response bodies). Default: off.
#
# Stale-status caveat: Gitea Actions does not always re-fire workflows
# on `labeled` / `pull_request_review:submitted` events. If the
# sop-tier-check status is stale (e.g. red after labels/approvals were
# added), push an empty commit to the PR branch to force a synchronize
# event, OR re-request reviews. Tracked: internal#46.
set -euo pipefail
# Ensure jq is available. Runners may not have it pre-installed, and the
# workflow-level jq install can fail on runners with network restrictions
# (GitHub releases not reachable from some runner networks — infra#241
# follow-up). This fallback is idempotent — no-op when jq is already on PATH.
# SOP_FAIL_OPEN=1 makes this always exit 0 so CI never blocks on jq absence.
if ! command -v jq >/dev/null 2>&1; then
echo "::notice::jq not found on PATH — attempting install..."
_jq_installed="no"
# apt-get first (primary) — Ubuntu package mirrors are reliably reachable.
if apt-get update -qq && apt-get install -y -qq jq 2>/dev/null; then
echo "::notice::jq installed via apt-get: $(jq --version)"
_jq_installed="yes"
# GitHub binary as secondary fallback — may fail on restricted networks.
elif timeout 120 curl -sSL \
"https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-amd64" \
-o /usr/local/bin/jq \
&& chmod +x /usr/local/bin/jq; then
echo "::notice::jq binary downloaded: $(/usr/local/bin/jq --version)"
_jq_installed="yes"
fi
if ! command -v jq >/dev/null 2>&1; then
echo "::error::jq installation failed — apt-get and GitHub binary both failed."
echo "::error::sop-tier-check requires jq for all JSON API parsing."
# SOP_FAIL_OPEN=1 is set in the workflow step's env — makes script always
# exit 0 so CI never blocks. The SOP-6 tier review gate remains enforced.
if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
exit 0
fi
exit 1
fi
fi
debug() {
if [ "${SOP_DEBUG:-}" = "1" ]; then
echo " [debug] $*" >&2
@@ -124,58 +77,16 @@ if [ -z "$TIER" ]; then
fi
debug "tier=$TIER"
# 2. Tier → required team expression (AND-composition; internal#189)
#
# Expression syntax:
# clause-a AND clause-b AND ... — ALL clauses must pass
# team-a,team-b,team-c — OR-set: ≥1 approver in ANY of these teams
# (team-a,team-b) — same as team-a,team-b (parens optional)
#
# This map is the single source of truth. Update it when the team structure
# or policy changes. Teams referenced here but absent in Gitea are treated
# as unachievable (would always fail) — operators notice the clear error
# and create the missing team.
#
# Current Gitea teams: ceo, engineers, managers
# Future teams (create before removing "???" fallback): qa, security, security-audit
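#
# Worked example (illustrative): tier:medium's expression
#   "managers AND engineers AND qa???,security???"
# word-splits on spaces into three clauses; each clause splits on commas
# into an OR-set. The gate passes only when every clause has >=1 approving
# reviewer in at least one of its teams: one managers approval AND one
# engineers approval AND (one qa OR one security approval, once those
# teams exist).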
declare -A TIER_EXPR=(
# tier:low — same as previous OR gate: any engineer, manager, or ceo.
["tier:low"]="engineers,managers,ceo"
# 2. Tier → eligible teams
case "$TIER" in
tier:low) ELIGIBLE="engineers managers ceo" ;;
tier:medium) ELIGIBLE="managers ceo" ;;
tier:high) ELIGIBLE="ceo" ;;
esac
debug "eligible_teams=$ELIGIBLE"
# tier:medium — AND of (managers) AND (engineers) AND (qa???,security???)
# The qa+security clause requires both teams to exist; when not yet
# created, the PR author is responsible for adding them before requesting
# approval on a tier:medium PR. Ops: create qa + security Gitea teams
# and update this map to remove the "???" markers (internal#189 follow-up).
["tier:medium"]="managers AND engineers AND qa???,security???"
# tier:high — ceo only. The AND-composition adds no value for a
# single-team gate, but the framework is wired for consistency.
["tier:high"]="ceo"
)
EXPR="${TIER_EXPR[$TIER]-}"
if [ -z "$EXPR" ]; then
echo "::error::No expression defined for tier $TIER in TIER_EXPR map."
exit 1
fi
debug "expression=$EXPR"
# 3. Legacy OR-gate override (7-day burn-in grace window; internal#189 Phase 1)
if [ "${SOP_LEGACY_CHECK:-}" = "1" ]; then
LEGACY_ELIGIBLE=""
case "$TIER" in
tier:low) LEGACY_ELIGIBLE="engineers managers ceo" ;;
tier:medium) LEGACY_ELIGIBLE="managers ceo" ;;
tier:high) LEGACY_ELIGIBLE="ceo" ;;
esac
echo "::notice::SOP_LEGACY_CHECK=1 — using OR-gate ({$LEGACY_ELIGIBLE}) for this PR."
ELIGIBLE="$LEGACY_ELIGIBLE"
fi
# 4. Resolve all team names → IDs
# /orgs/{org}/teams/{slug}/... endpoints don't exist on Gitea 1.22;
# we use /teams/{id}.
# Resolve team-name → team-id once. /orgs/{org}/teams/{slug}/... endpoints
# don't exist on Gitea 1.22; we have to use /teams/{id}.
ORG_TEAMS_FILE=$(mktemp)
trap 'rm -f "$ORG_TEAMS_FILE"' EXIT
HTTP_CODE=$(curl -sS -o "$ORG_TEAMS_FILE" -w '%{http_code}' -H "$AUTH" \
@@ -186,194 +97,53 @@ if [ "${SOP_DEBUG:-}" = "1" ]; then
head -c 300 "$ORG_TEAMS_FILE" >&2; echo >&2
fi
if [ "$HTTP_CODE" != "200" ]; then
echo "::error::GET /orgs/${OWNER}/teams returned HTTP $HTTP_CODE — token likely lacks read:org scope."
echo "::error::GET /orgs/${OWNER}/teams returned HTTP $HTTP_CODE — token likely lacks read:org scope. Add a SOP_TIER_CHECK_TOKEN secret with read:organization scope at the org level."
exit 1
fi
# Collect every team name that appears in the expression.
# Bash word-splitting on $EXPR splits on spaces, so "AND" appears as a
# token. We skip it explicitly.
declare -A TEAM_ID
_all_teams=""
for _raw_clause in $EXPR; do
# Strip parens and split on comma.
_clause=${_raw_clause//[()]/}
for _t in $(echo "$_clause" | tr ',' '\n'); do
_t=$(echo "$_t" | tr -d '[:space:]')
[ -z "$_t" ] && continue
# Skip AND / OR operator tokens (bash word-split produced them from
# spaces in the expression string).
[ "$_t" = "AND" ] || [ "$_t" = "OR" ] && continue
# Skip if already in set.
case " $_all_teams " in
*" $_t "*) ;; # already present
*) _all_teams="${_all_teams} $_t " ;;
esac
done
done
for _t in $_all_teams; do
_t=$(echo "$_t" | tr -d ' ')
[ -z "$_t" ] && continue
_id=$(jq -r --arg t "$_t" '.[] | select(.name==$t) | .id' <"$ORG_TEAMS_FILE" | head -1)
if [ -z "$_id" ] || [ "$_id" = "null" ]; then
# "??" suffix marks teams that don't exist yet (tier:medium qa/security).
# Treat as permanently failing clause; clear error message guides ops.
if [[ "$_t" == *"???" ]]; then
debug "team \"$_t\" not found (expected — pending team creation per internal#189)"
continue
fi
_visible=$(jq -r '.[]?.name? // empty' <"$ORG_TEAMS_FILE" 2>/dev/null | tr '\n' ' ')
echo "::error::Team \"$_t\" referenced in tier $TIER expression but not found in org $OWNER. Teams visible: $_visible"
for T in $ELIGIBLE; do
ID=$(jq -r --arg t "$T" '.[] | select(.name==$t) | .id' <"$ORG_TEAMS_FILE" | head -1)
if [ -z "$ID" ] || [ "$ID" = "null" ]; then
VISIBLE=$(jq -r '.[]?.name? // empty' <"$ORG_TEAMS_FILE" 2>/dev/null | tr '\n' ' ')
echo "::error::Team \"$T\" not found in org $OWNER. Teams visible: $VISIBLE"
exit 1
fi
TEAM_ID[$_t]="$_id"
debug "team-id: $_t$_id"
TEAM_ID[$T]="$ID"
debug "team-id: $T$ID"
done
# 5. Read approving reviewers
# 3. Read approving reviewers
REVIEWS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}/reviews")
APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]')
if [ -z "$APPROVERS" ]; then
echo "::error::No approving reviews on this PR. Set SOP_DEBUG=1 and re-run for diagnostics."
echo "::error::No approving reviews. Tier $TIER requires approval from {$ELIGIBLE} (non-author)."
exit 1
fi
debug "approvers: $(echo "$APPROVERS" | tr '\n' ' ')"
# 6. For each approver: skip self-review; probe team membership by id.
# Build $APPROVER_TEAMS[<user>]=space-surrounded team names (e.g. " managers ").
# Pre/post spaces ensure case patterns *${_t}* match even when the name
# is the first or last entry (bash case *word* needs delimiters on both sides).
#
# FALLBACK: if ALL team probes return 403 (token lacks read:org scope),
# fall back to /orgs/{org}/members/{user}. This returns 204 for any org
# member — a superset of team membership. Accepting it as a fallback means
# the gate passes when the token is scoped to repo+user only (core-bot PAT).
# This is safe because: (a) org membership is a prerequisite for every
# eligible team; (b) the AND-composition of internal#189 still requires
# multiple independent approvers; (c) any token with read:repository can
# see the approving reviews, so bypass requires a colluding approver.
declare -A APPROVER_TEAMS
# 4. For each approver: check non-author + team membership (by id)
OK=""
for U in $APPROVERS; do
[ "$U" = "$PR_AUTHOR" ] && debug "skip self-review by $U" && continue
_any_team_success="no"
for T in "${!TEAM_ID[@]}"; do
if [ "$U" = "$PR_AUTHOR" ]; then
debug "skip self-review by $U"
continue
fi
for T in $ELIGIBLE; do
ID="${TEAM_ID[$T]}"
CODE=$(curl -sS -o /dev/null -w '%{http_code}' -H "$AUTH" \
"${API}/teams/${ID}/members/${U}")
debug "probe: $U in team $T (id=$ID) → HTTP $CODE"
if [ "$CODE" = "200" ] || [ "$CODE" = "204" ]; then
APPROVER_TEAMS[$U]="${APPROVER_TEAMS[$U]:- } ${APPROVER_TEAMS[$U]:+ }$T "
debug "$U qualifies for team $T"
_any_team_success="yes"
echo "::notice::approver $U is in team $T (eligible for $TIER)"
OK="yes"
break
fi
done
# Fallback: if every team probe returned 403, try org membership.
# "??" teams were never resolved to IDs so they never entered the loop.
# If the user is an org member, credit them as being in each queried team
# (engineers, managers, ceo are all org-level). This is safe because org
# membership is a prerequisite for all three, and bypass requires a colluding
# approver (same risk as before the AND-composition).
if [ "$_any_team_success" = "no" ]; then
ORG_CODE=$(curl -sS -o /dev/null -w '%{http_code}' -H "$AUTH" \
"${API}/orgs/${OWNER}/members/${U}")
debug "probe: $U in org $OWNER (fallback) → HTTP $ORG_CODE"
if [ "$ORG_CODE" = "204" ]; then
for T in "${!TEAM_ID[@]}"; do
APPROVER_TEAMS[$U]="${APPROVER_TEAMS[$U]:- } ${APPROVER_TEAMS[$U]:+ }$T "
done
debug "$U credited as org member for all queried teams (fallback — token may lack read:org)"
fi
fi
[ -n "$OK" ] && break
done
# 7. Evaluate the tier expression.
#
# legacy OR-gate: use the simplified loop from before internal#189.
if [ -n "${LEGACY_ELIGIBLE:-}" ]; then
OK=""
for _u in "${!APPROVER_TEAMS[@]}"; do
for _t2 in $LEGACY_ELIGIBLE; do
case "${APPROVER_TEAMS[$_u]}" in
*${_t2}*)
echo "::notice::approver $_u is in team $_t2 (eligible for $TIER)"
OK="yes"
break
;;
esac
done
[ -n "$OK" ] && break
done
if [ -z "$OK" ]; then
echo "::error::Tier $TIER requires approval from a non-author member of {$LEGACY_ELIGIBLE}. Set SOP_DEBUG=1 to see per-probe HTTP codes."
exit 1
fi
echo "::notice::sop-tier-check passed: $TIER (legacy OR-gate)"
exit 0
fi
# AND-gate: evaluate the expression clause by clause.
# _passed_clauses and _failed_clauses accumulate for the status description.
_passed_clauses=""
_failed_clauses=""
for _raw_clause in $EXPR; do
# Normalise: strip parens, replace commas with spaces so bash word-split
# can iterate the OR-set members. The previous form
# _clause=$(echo ... | tr ',' '\n' | tr -d '[:space:]' | grep -v '^$')
# collapsed every member into one concatenated token because
# `tr -d '[:space:]'` strips the very newlines that just separated them
# ("engineers,managers,ceo" -> "engineersmanagersceo"), so the OR-clause
# only ever evaluated as a single nonsense team name and never matched
# APPROVER_TEAMS. Fixed in #229: leave the comma-separated members as
# space-separated tokens for `for _t in $_clause`.
_no_parens=${_raw_clause//[()]/}
_clause=${_no_parens//,/ }
_clause_passed="no"
_clause_names=""
for _t in $_clause; do
# Append (don't overwrite) team name to the human-readable accumulator.
# The previous form `_clause_names="${_clause_names:+, }${_t}"`
# rewrote the variable on every iteration, so the FAIL message only
# ever showed the LAST team. Fixed: prepend prior value before the
# comma-separator, then append the new team name.
_clause_names="${_clause_names}${_clause_names:+, }${_t}"
# Skip teams not yet in Gitea (qa??? / security??? placeholders).
[[ "$_t" == *"???" ]] && debug "clause \"$_t\": skipped (team pending creation)" && continue
[ -z "${TEAM_ID[$_t]:-}" ] && debug "clause \"$_t\": no ID resolved, skipping" && continue
for _u in "${!APPROVER_TEAMS[@]}"; do
# Note: APPROVER_TEAMS values are space-surrounded (e.g. " managers ").
# Pattern *${_t}* matches team name anywhere in the space-padded string.
case "${APPROVER_TEAMS[$_u]}" in
*${_t}*)
_clause_passed="yes"
debug "clause \"$_t\": satisfied by $_u"
break
;;
esac
done
done
# Label for display: strip "???" from pending teams.
_label=$(echo "$_raw_clause" | tr -d '()' | tr ',' '/' | tr -d '[:space:]' | sed 's/???//g')
if [ "$_clause_passed" = "yes" ]; then
# Append (don't overwrite) — same accumulator bug as _clause_names above.
_passed_clauses="${_passed_clauses}${_passed_clauses:+, }$_label"
echo "::notice::clause [$_label]: PASS — satisfied by approving reviewer(s)"
else
_failed_clauses="${_failed_clauses}${_failed_clauses:+, }$_label"
echo "::error::clause [$_label]: FAIL — no approving reviewer belongs to any of these teams (${_clause_names}). Set SOP_DEBUG=1 to see per-team probe results."
fi
done
if [ -n "$_failed_clauses" ]; then
echo ""
echo "::error::sop-tier-check FAILED for $TIER."
echo " Passed :${_passed_clauses}"
echo " Missing:${_failed_clauses}"
echo " All clauses must be satisfied. Each missing team needs an APPROVED review from one of its members."
if [ -z "$OK" ]; then
echo "::error::Tier $TIER requires approval from a non-author member of {$ELIGIBLE}. Got approvers: $APPROVERS — none of them satisfied team membership. Set SOP_DEBUG=1 to see per-probe HTTP codes."
exit 1
fi
echo "::notice::sop-tier-check PASSED: $TIER — all required clauses satisfied [${_passed_clauses}]"
echo "::notice::sop-tier-check passed: $TIER, approver in {$ELIGIBLE}"

View File

@@ -1,172 +0,0 @@
#!/usr/bin/env bash
# sop-tier-refire — re-evaluate sop-tier-check and POST status to PR head SHA.
#
# Invoked from `.gitea/workflows/sop-tier-refire.yml` when a repo
# MEMBER/OWNER/COLLABORATOR comments `/refire-tier-check` on a PR.
#
# Behavior:
#
# 1. Resolve PR head SHA + author from PR_NUMBER.
# 2. Rate-limit: if the sop-tier-check context has been POSTed in the
# last 30 seconds, skip (prevents comment-spam status thrash).
# 3. Invoke `.gitea/scripts/sop-tier-check.sh` with the same env the
# canonical workflow provides. This is DRY: we re-use the exact AND-
# composition gate logic, not a watered-down approving-count check.
# 4. POST the resulting status (success on exit 0, failure on non-zero)
# to `/repos/.../statuses/{HEAD_SHA}` with context
# "sop-tier-check / tier-check (pull_request)" — the same context name
# branch protection requires.
#
# Required env (set by sop-tier-refire.yml):
# GITEA_TOKEN — org-level SOP_TIER_CHECK_TOKEN (read:org/user/issue/repo)
# GITEA_HOST — e.g. git.moleculesai.app
# REPO — owner/name
# PR_NUMBER — PR number from issue_comment payload
# COMMENT_AUTHOR — login of the commenter (logged for audit)
#
# Optional:
# SOP_DEBUG=1 — verbose per-API-call diagnostics
# SOP_REFIRE_RATE_LIMIT_SEC — override the 30s rate-limit (default 30)
# SOP_REFIRE_DISABLE_RATE_LIMIT=1 — for tests; skips the rate-limit check
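#
# Example invocation (illustrative values; normally the workflow sets these):
#   GITEA_TOKEN=... GITEA_HOST=git.moleculesai.app REPO=owner/name \
#   PR_NUMBER=42 COMMENT_AUTHOR=alice \
#   bash .gitea/scripts/sop-tier-refire.sh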
set -euo pipefail
debug() {
if [ "${SOP_DEBUG:-}" = "1" ]; then
echo " [debug] $*" >&2
fi
}
: "${GITEA_TOKEN:?GITEA_TOKEN required}"
: "${GITEA_HOST:?GITEA_HOST required}"
: "${REPO:?REPO required (owner/name)}"
: "${PR_NUMBER:?PR_NUMBER required}"
: "${COMMENT_AUTHOR:=unknown}"
OWNER="${REPO%%/*}"
NAME="${REPO##*/}"
API="https://${GITEA_HOST}/api/v1"
AUTH="Authorization: token ${GITEA_TOKEN}"
CONTEXT="sop-tier-check / tier-check (pull_request)"
RATE_LIMIT_SEC="${SOP_REFIRE_RATE_LIMIT_SEC:-30}"
echo "::notice::sop-tier-refire start: repo=$OWNER/$NAME pr=$PR_NUMBER commenter=$COMMENT_AUTHOR"
# 1. Fetch PR details — need head.sha and user.login.
PR_FILE=$(mktemp)
trap 'rm -f "$PR_FILE"' EXIT
PR_HTTP=$(curl -sS -o "$PR_FILE" -w '%{http_code}' -H "$AUTH" \
"${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}")
if [ "$PR_HTTP" != "200" ]; then
echo "::error::GET /pulls/$PR_NUMBER returned HTTP $PR_HTTP (body $(head -c 200 "$PR_FILE"))"
exit 1
fi
HEAD_SHA=$(jq -r '.head.sha' <"$PR_FILE")
PR_AUTHOR=$(jq -r '.user.login' <"$PR_FILE")
PR_STATE=$(jq -r '.state' <"$PR_FILE")
if [ -z "$HEAD_SHA" ] || [ "$HEAD_SHA" = "null" ]; then
echo "::error::Could not resolve head.sha from PR #$PR_NUMBER response"
exit 1
fi
debug "head_sha=$HEAD_SHA pr_author=$PR_AUTHOR state=$PR_STATE"
if [ "$PR_STATE" != "open" ]; then
echo "::notice::PR #$PR_NUMBER state is $PR_STATE; refire is a no-op on closed PRs."
exit 0
fi
# 2. Rate-limit: skip if our context was updated in the last $RATE_LIMIT_SEC.
# Gitea statuses endpoint returns latest first; we check the most recent
# entry for our context name.
if [ "${SOP_REFIRE_DISABLE_RATE_LIMIT:-}" != "1" ]; then
STATUSES_FILE=$(mktemp)
trap 'rm -f "$PR_FILE" "$STATUSES_FILE"' EXIT
ST_HTTP=$(curl -sS -o "$STATUSES_FILE" -w '%{http_code}' -H "$AUTH" \
"${API}/repos/${OWNER}/${NAME}/statuses/${HEAD_SHA}?limit=50&sort=newest")
debug "statuses-list HTTP=$ST_HTTP"
if [ "$ST_HTTP" = "200" ]; then
LAST_UPDATED=$(jq -r --arg c "$CONTEXT" \
'[.[] | select(.context == $c)] | first | .updated_at // ""' \
<"$STATUSES_FILE")
if [ -n "$LAST_UPDATED" ] && [ "$LAST_UPDATED" != "null" ]; then
# Parse RFC3339 → epoch. Use python -c for portability (date(1) -d
# differs between BSD/GNU; the Gitea runner is Ubuntu so GNU date
# works, but we keep python for future container variance).
LAST_EPOCH=$(python3 -c "import sys,datetime;print(int(datetime.datetime.fromisoformat(sys.argv[1].replace('Z','+00:00')).timestamp()))" "$LAST_UPDATED" 2>/dev/null || echo "0")
NOW_EPOCH=$(date -u +%s)
AGE=$((NOW_EPOCH - LAST_EPOCH))
debug "last status update: $LAST_UPDATED ($AGE seconds ago)"
if [ "$AGE" -lt "$RATE_LIMIT_SEC" ] && [ "$AGE" -ge 0 ]; then
echo "::notice::sop-tier-refire rate-limited — last status update was ${AGE}s ago (<${RATE_LIMIT_SEC}s window). Try again shortly."
exit 0
fi
fi
fi
fi
# 3. Invoke sop-tier-check.sh with the env it expects. Capture exit code.
# The canonical script reads tier label, walks approving reviewers, and
# evaluates the AND-composition expression — we want the SAME gate, not
# a different gate.
#
# SOP_REFIRE_TIER_CHECK_SCRIPT env var lets tests substitute a mock —
# sop-tier-check.sh uses bash 4+ associative arrays which trigger a known
# bash 3.2 parser bug (`tier: unbound variable` from declare -A with
# `set -u`). Linux Gitea runners ship bash 4/5 so production is fine;
# the override exists so the bash 3.2 dev box can still exercise the
# refire glue logic end-to-end.
SCRIPT="${SOP_REFIRE_TIER_CHECK_SCRIPT:-$(dirname "$0")/sop-tier-check.sh}"
if [ ! -f "$SCRIPT" ]; then
echo "::error::sop-tier-check.sh not found at $SCRIPT — refire requires the canonical script"
exit 1
fi
# Re-invoke. Pipe stdout/stderr through so the runner log shows the
# tier-check decision inline.
set +e
GITEA_TOKEN="$GITEA_TOKEN" \
GITEA_HOST="$GITEA_HOST" \
REPO="$REPO" \
PR_NUMBER="$PR_NUMBER" \
PR_AUTHOR="$PR_AUTHOR" \
SOP_DEBUG="${SOP_DEBUG:-0}" \
SOP_LEGACY_CHECK="${SOP_LEGACY_CHECK:-0}" \
bash "$SCRIPT"
TIER_EXIT=$?
set -e
debug "sop-tier-check.sh exit=$TIER_EXIT"
# 4. POST the resulting status.
if [ "$TIER_EXIT" -eq 0 ]; then
STATE="success"
DESCRIPTION="Refired via /refire-tier-check by $COMMENT_AUTHOR"
else
STATE="failure"
DESCRIPTION="Refired via /refire-tier-check; tier-check failed (see workflow log)"
fi
# Ideally target_url would point at the runner log so a curious reviewer
# could follow it back, but SERVER_URL + RUN_ID + JOB_ID isn't trivially
# constructible from the bash env on Gitea 1.22.6, so we point at the PR
# itself.
TARGET_URL="https://${GITEA_HOST}/${OWNER}/${NAME}/pulls/${PR_NUMBER}"
POST_BODY=$(jq -nc \
--arg state "$STATE" \
--arg context "$CONTEXT" \
--arg description "$DESCRIPTION" \
--arg target_url "$TARGET_URL" \
'{state:$state, context:$context, description:$description, target_url:$target_url}')
POST_FILE=$(mktemp)
trap 'rm -f "$PR_FILE" "${STATUSES_FILE:-}" "$POST_FILE"' EXIT
POST_HTTP=$(curl -sS -o "$POST_FILE" -w '%{http_code}' \
-X POST -H "$AUTH" -H "Content-Type: application/json" \
-d "$POST_BODY" \
"${API}/repos/${OWNER}/${NAME}/statuses/${HEAD_SHA}")
if [ "$POST_HTTP" != "200" ] && [ "$POST_HTTP" != "201" ]; then
echo "::error::POST /statuses/$HEAD_SHA returned HTTP $POST_HTTP (body $(head -c 200 "$POST_FILE"))"
exit 1
fi
echo "::notice::sop-tier-refire posted state=$STATE for context=\"$CONTEXT\" on sha=$HEAD_SHA"
exit "$TIER_EXIT"

View File

@ -1,28 +0,0 @@
#!/usr/bin/env bash
# Mock sop-tier-check.sh for sop-tier-refire tests.
#
# Exits 0 ("PASS") if $MOCK_TIER_RESULT == "pass", else exits 1.
# This lets the refire tests cover the success + failure status-POST
# paths without invoking the real sop-tier-check.sh (which uses bash 4+
# associative arrays — known parser bug on macOS bash 3.2 dev box).
set -euo pipefail
case "${MOCK_TIER_RESULT:-pass}" in
pass)
echo "::notice::mock tier-check: PASS"
exit 0
;;
fail_no_label)
echo "::error::mock tier-check: no tier label"
exit 1
;;
fail_no_approvals)
echo "::error::mock tier-check: no approving reviews"
exit 1
;;
*)
echo "::error::mock tier-check: unknown MOCK_TIER_RESULT=${MOCK_TIER_RESULT:-}"
exit 2
;;
esac

View File

@ -1,208 +0,0 @@
#!/usr/bin/env python3
"""Stub Gitea API for sop-tier-refire test scenarios.
Reads $FIXTURE_STATE_DIR/scenario to decide what to return for each
endpoint the sop-tier-refire.sh + sop-tier-check.sh scripts call.
Captures every POST to /statuses/{sha} into posted_statuses.jsonl so
the test can assert what the script tried to write.
Scenarios:
T1_success tier:low + APPROVED by engineer tier-check passes
T2_no_tier_label no tier label tier-check exits 1 before POST
T3_no_approvals tier:low but zero approving reviews exits 1
T4_closed PR state=closed refire is a no-op
T5_rate_limited last status update 5 seconds ago skip
Usage:
FIXTURE_STATE_DIR=/tmp/x python3 _refire_fixture.py 8080
"""
import datetime
import http.server
import json
import os
import re
import sys
import urllib.parse
STATE_DIR = os.environ["FIXTURE_STATE_DIR"]
def scenario() -> str:
p = os.path.join(STATE_DIR, "scenario")
if not os.path.isfile(p):
return "T1_success"
with open(p) as f:
return f.read().strip()
def now_iso() -> str:
return datetime.datetime.now(datetime.timezone.utc).isoformat()
def append_post(body: dict) -> None:
with open(os.path.join(STATE_DIR, "posted_statuses.jsonl"), "a") as f:
f.write(json.dumps(body) + "\n")
def pr_payload() -> dict:
sc = scenario()
state = "closed" if sc == "T4_closed" else "open"
return {
"number": 999,
"state": state,
"head": {"sha": "deadbeef0000111122223333444455556666"},
"user": {"login": "feature-author"},
}
def labels_payload() -> list:
sc = scenario()
if sc == "T2_no_tier_label":
return [{"name": "bug"}]
# All other scenarios use tier:low
return [{"name": "tier:low"}, {"name": "ci"}]
def reviews_payload() -> list:
sc = scenario()
if sc == "T3_no_approvals":
return []
# All other scenarios have one APPROVED review by an engineer
return [
{
"state": "APPROVED",
"user": {"login": "reviewer-engineer"},
}
]
def teams_payload() -> list:
# Mirror the real molecule-ai org teams referenced in TIER_EXPR
return [
{"id": 5, "name": "ceo"},
{"id": 2, "name": "engineers"},
{"id": 6, "name": "managers"},
]
def statuses_payload() -> list:
sc = scenario()
if sc == "T5_rate_limited":
recent = (
datetime.datetime.now(datetime.timezone.utc)
- datetime.timedelta(seconds=5)
).isoformat()
return [
{
"context": "sop-tier-check / tier-check (pull_request)",
"state": "failure",
"updated_at": recent,
}
]
return []
def user_payload() -> dict:
# Mirrors the WHOAMI probe in sop-tier-check.sh
return {"login": "sop-tier-bot-fixture"}
class Handler(http.server.BaseHTTPRequestHandler):
# Quiet — keep stdout for explicit logs only.
def log_message(self, *args, **kwargs): # noqa: D401
pass
def _json(self, code: int, body) -> None:
payload = json.dumps(body).encode()
self.send_response(code)
self.send_header("Content-Type", "application/json")
self.send_header("Content-Length", str(len(payload)))
self.end_headers()
self.wfile.write(payload)
def _empty(self, code: int) -> None:
self.send_response(code)
self.send_header("Content-Length", "0")
self.end_headers()
def do_GET(self): # noqa: N802
u = urllib.parse.urlparse(self.path)
path = u.path
if path == "/_ping":
return self._json(200, {"ok": True})
if path == "/api/v1/user":
return self._json(200, user_payload())
# /api/v1/repos/{owner}/{name}/pulls/{n}
m = re.match(r"^/api/v1/repos/[^/]+/[^/]+/pulls/(\d+)$", path)
if m:
return self._json(200, pr_payload())
# /api/v1/repos/{owner}/{name}/issues/{n}/labels
if re.match(r"^/api/v1/repos/[^/]+/[^/]+/issues/\d+/labels$", path):
return self._json(200, labels_payload())
# /api/v1/repos/{owner}/{name}/pulls/{n}/reviews
if re.match(r"^/api/v1/repos/[^/]+/[^/]+/pulls/\d+/reviews$", path):
return self._json(200, reviews_payload())
# /api/v1/orgs/{owner}/teams
if re.match(r"^/api/v1/orgs/[^/]+/teams$", path):
return self._json(200, teams_payload())
# /api/v1/teams/{id}/members/{login} → 204 if user is an engineer
m = re.match(r"^/api/v1/teams/(\d+)/members/([^/]+)$", path)
if m:
team_id, login = m.group(1), m.group(2)
# In our fixture reviewer-engineer ∈ engineers (id=2)
if team_id == "2" and login == "reviewer-engineer":
return self._empty(204)
return self._empty(404)
# /api/v1/orgs/{owner}/members/{login} — fallback path used when
# team-member probes all 403. We don't need it for these tests.
if re.match(r"^/api/v1/orgs/[^/]+/members/[^/]+$", path):
return self._empty(404)
# /api/v1/repos/{owner}/{name}/statuses/{sha}
if re.match(r"^/api/v1/repos/[^/]+/[^/]+/statuses/[^/]+$", path):
return self._json(200, statuses_payload())
return self._json(404, {"path": path, "msg": "fixture: no route"})
def do_POST(self): # noqa: N802
u = urllib.parse.urlparse(self.path)
path = u.path
length = int(self.headers.get("Content-Length") or 0)
raw = self.rfile.read(length) if length else b""
try:
body = json.loads(raw) if raw else {}
except Exception:
body = {"_raw": raw.decode(errors="replace")}
if re.match(r"^/api/v1/repos/[^/]+/[^/]+/statuses/[^/]+$", path):
append_post(body)
# Echo back something status-shaped — script only checks HTTP code.
return self._json(
201,
{
"context": body.get("context"),
"state": body.get("state"),
"created_at": now_iso(),
},
)
return self._json(404, {"path": path, "msg": "fixture: no route"})
def main():
port = int(sys.argv[1])
srv = http.server.ThreadingHTTPServer(("127.0.0.1", port), Handler)
srv.serve_forever()
if __name__ == "__main__":
main()
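A sketch of driving the fixture by hand, assuming it is launched from the directory that contains it (the port, state path, and probed repo/SHA below are placeholders):

```bash
# Hedged sketch: select a scenario, start the stub, probe one route.
export FIXTURE_STATE_DIR=/tmp/refire-state
mkdir -p "$FIXTURE_STATE_DIR"
echo "T5_rate_limited" > "$FIXTURE_STATE_DIR/scenario"
python3 _refire_fixture.py 8080 &
sleep 0.5
# T5 seeds a ~5s-old status entry for the tier-check context:
curl -s "http://127.0.0.1:8080/api/v1/repos/any/repo/statuses/deadbeef" \
  | jq -r '.[0].updated_at'
kill %1
```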

View File

@ -1,101 +0,0 @@
#!/usr/bin/env bash
# Regression test for #229 — sop-tier-check tier:low OR-clause splitter.
#
# Bug (PR #225 → still broken after PR #231):
# Line ~289 of sop-tier-check.sh used:
# _clause=$(echo "$_raw_clause" | tr -d '()' | tr ',' '\n' | tr -d '[:space:]' | grep -v '^$')
# `tr -d '[:space:]'` strips the newlines that `tr ',' '\n'` just
# inserted, collapsing "engineers,managers,ceo" into a single token
# "engineersmanagersceo". The for-loop then iterates ONCE on a name
# that matches no team, so every tier:low PR fails:
# ::error::clause [engineers/managers/ceo]: FAIL — no approving
# reviewer belongs to any of these teamsengineersmanagersceo
# (note also: missing separators in the error string is bug #2 —
# `_clause_names` used "${var:+, }$x" which OVERWRITES per iteration).
#
# Fix shape (this PR):
# _no_parens=${_raw_clause//[()]/}
# _clause=${_no_parens//,/ } # comma -> space, bash word-split iterates
# _clause_names="${_clause_names}${_clause_names:+, }${_t}" # APPEND, not overwrite
#
# This test extracts the splitter logic and asserts it produces the right
# token list for each of the three tier expressions live in the script.
set -euo pipefail
PASS=0
FAIL=0
assert_eq() {
local label="$1"
local expected="$2"
local got="$3"
if [ "$expected" = "$got" ]; then
echo " PASS $label"
PASS=$((PASS + 1))
else
echo " FAIL $label"
echo " expected: <$expected>"
echo " got: <$got>"
FAIL=$((FAIL + 1))
fi
}
# ----- Splitter under test (mirrors the fixed sop-tier-check.sh block) -----
split_clause() {
local raw="$1"
local no_parens=${raw//[()]/}
local clause=${no_parens//,/ }
local out=""
for _t in $clause; do
out="${out}${out:+|}$_t"
done
echo "$out"
}
echo "test: tier:low OR-clause splits to 3 tokens"
assert_eq "tier:low" "engineers|managers|ceo" "$(split_clause "engineers,managers,ceo")"
echo "test: tier:medium AND-expression — bash word-split on \$EXPR yields 5 tokens"
EXPR="managers AND engineers AND qa???,security???"
out=""
for _raw in $EXPR; do
out="${out}${out:+ ; }$(split_clause "$_raw")"
done
assert_eq "tier:medium" "managers ; AND ; engineers ; AND ; qa???|security???" "$out"
echo "test: tier:high single-team OR-clause"
assert_eq "tier:high" "ceo" "$(split_clause "ceo")"
echo "test: paren-wrapped OR-set unwraps + splits"
assert_eq "paren OR" "managers|ceo" "$(split_clause "(managers,ceo)")"
# ----- _clause_names accumulator (was overwriting per iteration) -----
acc=""
for t in engineers managers ceo; do
acc="${acc}${acc:+, }${t}"
done
assert_eq "_clause_names append" "engineers, managers, ceo" "$acc"
# ----- _failed_clauses / _passed_clauses accumulator across raw clauses -----
acc=""
for c in clauseA clauseB clauseC; do
acc="${acc}${acc:+, }${c}"
done
assert_eq "_failed_clauses append" "clauseA, clauseB, clauseC" "$acc"
# ----- End-to-end OR-gate: simulate APPROVER_TEAMS[core-lead]=' managers ' -----
# The script's case pattern is *${_t}* with a space-padded value.
APPROVER_TEAMS_VAL=" managers "
matched=""
for _t in $(split_clause "engineers,managers,ceo" | tr '|' ' '); do
case "$APPROVER_TEAMS_VAL" in
*${_t}*) matched="$_t"; break ;;
esac
done
assert_eq "OR-gate matches managers" "managers" "$matched"
echo
echo "------"
echo "PASS=$PASS FAIL=$FAIL"
[ "$FAIL" -eq 0 ]

View File

@ -1,297 +0,0 @@
#!/usr/bin/env bash
# Tests for sop-tier-refire.{yml,sh} — internal#292.
#
# Behavior matrix:
#
# T1: PR open + APPROVED via tier:low → script invokes sop-tier-check
# and POSTs status=success.
# T2: PR open + missing tier label → sop-tier-check exits non-zero;
# refire POSTs status=failure (description mentions failure).
# T3: PR open + tier:low but NO approving reviews → sop-tier-check
# exits non-zero; refire POSTs status=failure.
# T4: PR CLOSED → refire exits 0 with no status POST (no-op on closed).
# T5: Rate-limit — recent status update within 30s → refire skips,
# no new POST.
# T6 (yaml-lint): workflow `if:` expression contains author_association
# gate + slash-command-trigger gate + PR-not-issue gate.
# T7 (yaml-lint): workflow file is parseable YAML.
#
# Tests T1-T5 run the real script against a local-fixture HTTP server
# (python http.server with a stub handler — `tests/_refire_fixture.py`)
# so the script's Gitea API calls hit the fixture, not the real Gitea.
#
# Tests T6/T7 are pure YAML checks against the workflow file.
#
# Hostile-self-review (per feedback_assert_exact_not_substring):
# this test MUST FAIL if the workflow or script is absent. Verified by
# running the test before the files exist (covered in the PR body).
set -euo pipefail
THIS_DIR="$(cd "$(dirname "$0")" && pwd)"
SCRIPT_DIR="$(cd "$THIS_DIR/.." && pwd)"
WORKFLOW_DIR="$(cd "$THIS_DIR/../../workflows" && pwd)"
WORKFLOW="$WORKFLOW_DIR/sop-tier-refire.yml"
SCRIPT="$SCRIPT_DIR/sop-tier-refire.sh"
PASS=0
FAIL=0
FAILED_TESTS=""
assert_eq() {
local label="$1"
local expected="$2"
local got="$3"
if [ "$expected" = "$got" ]; then
echo " PASS $label"
PASS=$((PASS + 1))
else
echo " FAIL $label"
echo " expected: <$expected>"
echo " got: <$got>"
FAIL=$((FAIL + 1))
FAILED_TESTS="${FAILED_TESTS} ${label}"
fi
}
assert_contains() {
local label="$1"
local needle="$2"
local haystack="$3"
if printf '%s' "$haystack" | grep -qF "$needle"; then
echo " PASS $label"
PASS=$((PASS + 1))
else
echo " FAIL $label"
echo " needle: <$needle>"
echo " haystack: <$(printf '%s' "$haystack" | head -c 400)>"
FAIL=$((FAIL + 1))
FAILED_TESTS="${FAILED_TESTS} ${label}"
fi
}
assert_file_exists() {
local label="$1"
local path="$2"
if [ -f "$path" ]; then
echo " PASS $label"
PASS=$((PASS + 1))
else
echo " FAIL $label (not found: $path)"
FAIL=$((FAIL + 1))
FAILED_TESTS="${FAILED_TESTS} ${label}"
fi
}
# Existence (foundation — every other test depends on these)
echo
echo "== existence =="
assert_file_exists "workflow file exists" "$WORKFLOW"
assert_file_exists "script file exists" "$SCRIPT"
if [ "$FAIL" -gt 0 ]; then
echo
echo "------"
echo "PASS=$PASS FAIL=$FAIL (existence)"
echo "Cannot proceed without these files."
exit 1
fi
# T6 / T7 — workflow YAML structure
echo
echo "== T6/T7 workflow yaml =="
# YAML parseability
PARSE_OUT=$(python3 -c 'import sys,yaml;yaml.safe_load(open(sys.argv[1]).read());print("ok")' "$WORKFLOW" 2>&1 || true)
assert_eq "T7 workflow parses as YAML" "ok" "$PARSE_OUT"
# Three required gates in the `if:` expression
WORKFLOW_CONTENT=$(cat "$WORKFLOW")
assert_contains "T6a workflow if: contains author_association gate" \
"github.event.comment.author_association" "$WORKFLOW_CONTENT"
assert_contains "T6b workflow if: gates on MEMBER/OWNER/COLLABORATOR" \
'["MEMBER","OWNER","COLLABORATOR"]' "$WORKFLOW_CONTENT"
assert_contains "T6c workflow if: contains slash-command trigger" \
"/refire-tier-check" "$WORKFLOW_CONTENT"
assert_contains "T6d workflow if: gates on PR-not-issue" \
"github.event.issue.pull_request" "$WORKFLOW_CONTENT"
assert_contains "T6e workflow listens on issue_comment" \
"issue_comment" "$WORKFLOW_CONTENT"
assert_contains "T6f workflow requests statuses:write permission" \
"statuses: write" "$WORKFLOW_CONTENT"
# Does NOT check out PR HEAD (security)
if grep -q 'ref: \${{ github.event.pull_request.head' "$WORKFLOW"; then
echo " FAIL T6g workflow MUST NOT check out PR head (security)"
FAIL=$((FAIL + 1))
FAILED_TESTS="${FAILED_TESTS} T6g"
else
echo " PASS T6g workflow does not check out PR head"
PASS=$((PASS + 1))
fi
# T1-T5 — script behavior against a local Gitea-fixture
echo
echo "== T1-T5 script behavior (vs local fixture) =="
# Spin up the fixture HTTP server.
FIXTURE_DIR=$(mktemp -d)
trap 'rm -rf "$FIXTURE_DIR"; [ -n "${FIX_PID:-}" ] && kill "$FIX_PID" 2>/dev/null || true' EXIT
FIXTURE_PY="$THIS_DIR/_refire_fixture.py"
if [ ! -f "$FIXTURE_PY" ]; then
echo "::error::fixture server $FIXTURE_PY missing"
exit 1
fi
FIX_LOG="$FIXTURE_DIR/fixture.log"
FIX_STATE_DIR="$FIXTURE_DIR/state"
mkdir -p "$FIX_STATE_DIR"
# Find an unused port.
FIX_PORT=$(python3 -c 'import socket;s=socket.socket();s.bind(("127.0.0.1",0));print(s.getsockname()[1]);s.close()')
FIXTURE_STATE_DIR="$FIX_STATE_DIR" python3 "$FIXTURE_PY" "$FIX_PORT" \
>"$FIX_LOG" 2>&1 &
FIX_PID=$!
# Wait for fixture readiness.
for _ in $(seq 1 50); do
if curl -fsS "http://127.0.0.1:${FIX_PORT}/_ping" >/dev/null 2>&1; then
break
fi
sleep 0.1
done
if ! curl -fsS "http://127.0.0.1:${FIX_PORT}/_ping" >/dev/null 2>&1; then
echo "::error::fixture server failed to start. Log:"
cat "$FIX_LOG"
exit 1
fi
# Helper: set fixture state for a scenario, then run the script.
# tier_result is one of: pass | fail_no_label | fail_no_approvals.
# The refire script's tier-check invocation is mocked because the real
# sop-tier-check.sh uses bash 4+ associative arrays — incompatible with
# the macOS bash 3.2 dev shell. Linux Gitea runners use bash 4/5 so
# production runs the real script. The mock exercises the success +
# failure branches of refire's status-POST glue.
run_scenario() {
local scenario="$1"
local tier_result="${2:-pass}"
echo "$scenario" >"$FIX_STATE_DIR/scenario"
: >"$FIX_STATE_DIR/posted_statuses.jsonl" # clear status log
local out
set +e
out=$(
PATH="$FIXTURE_DIR/bin:$PATH" \
GITEA_TOKEN="fixture-token" \
GITEA_HOST="fixture.local" \
REPO="molecule-ai/molecule-core" \
PR_NUMBER="999" \
COMMENT_AUTHOR="test-runner" \
SOP_REFIRE_DISABLE_RATE_LIMIT="1" \
SOP_REFIRE_TIER_CHECK_SCRIPT="$THIS_DIR/_mock_tier_check.sh" \
MOCK_TIER_RESULT="$tier_result" \
FIXTURE_PORT="$FIX_PORT" \
bash "$SCRIPT" 2>&1
)
local rc=$?
set -e
echo "$out" >"$FIX_STATE_DIR/last_run.log"
echo "$rc" >"$FIX_STATE_DIR/last_rc"
}
# Install a curl shim that rewrites https://fixture.local → http://127.0.0.1:$PORT
# Use bash prefix-strip (${var#prefix}) — it sidesteps the `/` delimiter
# confusion of ${var/pattern/replacement}.
mkdir -p "$FIXTURE_DIR/bin"
cat >"$FIXTURE_DIR/bin/curl" <<SHIM
#!/usr/bin/env bash
# Test shim: rewrite https://fixture.local/* -> http://127.0.0.1:${FIX_PORT}/*
# The fixture doesn't authenticate; -H Authorization passes through harmlessly.
new_args=()
for a in "\$@"; do
if [[ "\$a" == https://fixture.local/* ]]; then
rest="\${a#https://fixture.local}"
a="http://127.0.0.1:${FIX_PORT}\${rest}"
fi
new_args+=("\$a")
done
exec /usr/bin/curl "\${new_args[@]}"
SHIM
chmod +x "$FIXTURE_DIR/bin/curl"
# T1: tier:low + 1 APPROVED + author is in engineers team → success
run_scenario "T1_success" "pass"
RC=$(cat "$FIX_STATE_DIR/last_rc")
POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
assert_eq "T1 exit code 0 (success)" "0" "$RC"
assert_contains "T1 POSTed state=success" '"state": "success"' "$POSTED"
assert_contains "T1 POST context is sop-tier-check / tier-check" \
'"context": "sop-tier-check / tier-check (pull_request)"' "$POSTED"
assert_contains "T1 description names commenter" "test-runner" "$POSTED"
# T2: missing tier label → tier-check fails → failure status POSTed
run_scenario "T2_no_tier_label" "fail_no_label"
RC=$(cat "$FIX_STATE_DIR/last_rc")
POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
# tier-check.sh exits 1; refire script forwards that exit, so RC != 0
if [ "$RC" -ne 0 ]; then
echo " PASS T2 exit code non-zero (got $RC)"
PASS=$((PASS + 1))
else
echo " FAIL T2 exit code should be non-zero, got 0"
FAIL=$((FAIL + 1))
FAILED_TESTS="${FAILED_TESTS} T2_rc"
fi
assert_contains "T2 POSTed state=failure" '"state": "failure"' "$POSTED"
# T3: tier:low present but ZERO approving reviews → failure
run_scenario "T3_no_approvals" "fail_no_approvals"
RC=$(cat "$FIX_STATE_DIR/last_rc")
POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
if [ "$RC" -ne 0 ]; then
echo " PASS T3 exit code non-zero (got $RC)"
PASS=$((PASS + 1))
else
echo " FAIL T3 exit code should be non-zero, got 0"
FAIL=$((FAIL + 1))
FAILED_TESTS="${FAILED_TESTS} T3_rc"
fi
assert_contains "T3 POSTed state=failure" '"state": "failure"' "$POSTED"
# T4: closed PR — refire is a no-op (no POST, exit 0)
run_scenario "T4_closed" "pass"
RC=$(cat "$FIX_STATE_DIR/last_rc")
POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
assert_eq "T4 closed PR exits 0" "0" "$RC"
assert_eq "T4 closed PR posts no status" "" "$POSTED"
# T5: rate-limit. The scenario seeds a statuses entry updated seconds
# ago, and we keep the rate-limit active by NOT passing
# SOP_REFIRE_DISABLE_RATE_LIMIT, so the script must skip without POSTing.
echo "T5_rate_limited" >"$FIX_STATE_DIR/scenario"
: >"$FIX_STATE_DIR/posted_statuses.jsonl"
set +e
T5_OUT=$(
PATH="$FIXTURE_DIR/bin:$PATH" \
GITEA_TOKEN="fixture-token" \
GITEA_HOST="fixture.local" \
REPO="molecule-ai/molecule-core" \
PR_NUMBER="999" \
COMMENT_AUTHOR="test-runner" \
FIXTURE_PORT="$FIX_PORT" \
bash "$SCRIPT" 2>&1
)
T5_RC=$?
set -e
POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
assert_eq "T5 rate-limited exits 0" "0" "$T5_RC"
assert_contains "T5 rate-limited log says skipped" "rate-limited" "$T5_OUT"
assert_eq "T5 rate-limited posts no status" "" "$POSTED"
echo
echo "------"
echo "PASS=$PASS FAIL=$FAIL"
if [ "$FAIL" -gt 0 ]; then
echo "Failed:$FAILED_TESTS"
fi
[ "$FAIL" -eq 0 ]

View File

@ -1,88 +1,58 @@
# audit-force-merge — emit `incident.force_merge` to runner stdout when
# a PR is merged with required-status checks not all green. Vector picks
# the JSON line off docker_logs and ships to Loki on
# molecule-canonical-obs (per `reference_obs_stack_phase1`); query as:
#
# {host="operator"} |= "event_type" |= "incident.force_merge" | json
#
# Companion to `.gitea/scripts/audit-force-merge.sh` (script-extract
# pattern, same as sop-tier-check). The audit observes BOTH UI-merged
# and REST-merged PRs uniformly per `feedback_gh_cli_merge_lies_use_rest`.
#
# Closes the §SOP-6 audit gap for the molecule-core repo (RFC:
# internal#219 §6). The doc says force-merges write to
# `structure_events`, but that table lives in the platform DB, not
# Gitea-side; Loki is the practical equivalent for Gitea Actions
# events. When the credential / observability stack converges later,
# this can sync into structure_events from Loki via a backfill job —
# the structured JSON shape is forward-compatible.
#
# Mirrors the same-named workflow in molecule-controlplane; design
# rationale lives in the RFC, not here, to keep the workflow file
# scannable.
name: audit-force-merge
# pull_request_target loads from the base branch — same security model
# as sop-tier-check. Without this, a PR author could rewrite the
# workflow on their own PR and skip the audit emission for their own
# force-merge. The base-branch checkout below ALSO uses `base.sha`, not
# `base.ref`, so a fast-moving base can't slip a different audit script
# in under us. See `.gitea/workflows/sop-tier-check.yml` for the full
# rationale.
on:
pull_request_target:
types: [closed]
# `pull-requests: read` + `contents: read` covers everything the script
# needs (fetch PR + commit statuses). `issues:` deliberately omitted —
# audit fires-and-forgets to stdout, never opens issues.
permissions:
contents: read
pull-requests: read
jobs:
audit:
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: read
# Skip when PR is closed without merge — saves a runner.
if: github.event.pull_request.merged == true
steps:
- name: Check out base branch (for the script)
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
# base.sha pinning, NOT base.ref — see header rationale.
ref: ${{ github.event.pull_request.base.sha }}
- name: Detect force-merge + emit audit event
env:
# Same org-level secret the sop-tier-check workflow uses;
# falls back to the auto-injected GITHUB_TOKEN if the
# org-level SOP_TIER_CHECK_TOKEN isn't set on a transitional
# repo.
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
GITEA_HOST: git.moleculesai.app
REPO: ${{ github.repository }}
PR_NUMBER: ${{ github.event.pull_request.number }}
# Required-status-check contexts to evaluate at merge time.
# Newline-separated. MUST mirror branch protection's
# status_check_contexts for protected branches (settings →
# branches → protected branch → required checks; currently
# `main`; `staging` protection forthcoming per RFC internal#219
# Phase 4).
#
# Initialized 2026-05-11 from the current molecule-core `main`
# branch protection:
#
# GET /api/v1/repos/molecule-ai/molecule-core/
# branch_protections/main
# → status_check_contexts = [
# "Secret scan / Scan diff for credential-shaped strings (pull_request)",
# "sop-tier-check / tier-check (pull_request)"
# ]
#
# Declared here rather than fetched from /branch_protections
# because that endpoint requires admin write — sop-tier-bot
# is read-only by design (least-privilege per
# `feedback_least_privilege_via_workflow_env` / internal#257).
# Drift between this env and the real protection list is
# auto-detected by `ci-required-drift.yml` (RFC §4 + §6),
# which opens a `[ci-drift]` issue within one hour.
#
# When the protection set changes (e.g. Phase 4 adds the
# `ci / all-required (pull_request)` sentinel), update BOTH
# branch protection AND this env in the SAME PR; drift-detect
# will otherwise file an issue for you.
REQUIRED_CHECKS: |
Secret scan / Scan diff for credential-shaped strings (pull_request)
sop-tier-check / tier-check (pull_request)
run: bash .gitea/scripts/audit-force-merge.sh
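To spot-check the REQUIRED_CHECKS env against live protection by hand, a sketch (assumes a token privileged enough to read branch protections, which the read-only bot above deliberately is not):

```bash
curl -sS -H "Authorization: token $ADMIN_TOKEN" \
  "https://git.moleculesai.app/api/v1/repos/molecule-ai/molecule-core/branch_protections/main" \
  | jq -r '.status_check_contexts[]'
```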

View File

@ -1,148 +0,0 @@
name: Block internal-flavored paths
# Ported from .github/workflows/block-internal-paths.yml on 2026-05-11 per
# RFC internal#219 §1 sweep.
#
# Differences from the GitHub version:
# - Dropped `merge_group: { types: [checks_requested] }` (Gitea has no
# merge queue; no `gh-readonly-queue/...` refs).
# - Workflow-level env.GITHUB_SERVER_URL set per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on the job (RFC §1 contract — surface
# defects without blocking; follow-up PR flips after triage).
#
# Hard CI gate. Internal content (positioning, competitive briefs, sales
# playbooks, PMM/press drip, draft campaigns) lives in molecule-ai/internal —
# this public monorepo must never re-acquire those paths. CEO directive
# 2026-04-23 after a fleet-wide audit found 79 internal files leaked here.
#
# Failure mode without this gate: agents (PMM, Research, DevRel, Sales) drop
# briefs into the easiest path their cwd resolves to (root /research,
# /marketing, /docs/marketing) and gitignore alone won't catch a `git add -f`
# or a stale gitignore line. This workflow is the mechanical backstop.
on:
pull_request:
types: [opened, synchronize, reopened]
push:
branches: [main, staging]
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
check:
name: Block forbidden paths
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
# the PR. Follow-up PR flips this off after surfaced defects are
# triaged.
continue-on-error: true
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 2 # need previous commit to diff against on push events
# For pull_request events the diff base is github.event.pull_request.base.sha,
# which may be many commits behind HEAD and therefore absent from the
# shallow clone above. Fetch it explicitly (depth=1 keeps it fast).
- name: Fetch PR base SHA (pull_request events only)
if: github.event_name == 'pull_request'
run: git fetch --depth=1 origin ${{ github.event.pull_request.base.sha }}
- name: Refuse if forbidden paths appear
env:
# Plumb event-specific SHAs through env so the script doesn't
# need conditional `${{ ... }}` interpolation per event type.
# github.event.before/after only exist on push events;
# pull_request has pull_request.base.sha / pull_request.head.sha.
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
PUSH_BEFORE: ${{ github.event.before }}
PUSH_AFTER: ${{ github.event.after }}
run: |
# Paths that must NEVER live in the public monorepo. Add to this
# list narrowly — broader patterns belong in .gitignore so day-to-day
# docs work isn't accidentally blocked.
FORBIDDEN_PATTERNS=(
"^research/"
"^marketing/"
"^docs/marketing/"
"^comment-[0-9]+\.json$"
"^test-pmm.*\.(txt|md)$"
"^tick-reflections.*\.(txt|md)$"
".*-temp\.(md|txt)$"
)
# Determine the diff base. Each event type stores its SHAs in
# a different place — see the env block above.
case "${{ github.event_name }}" in
pull_request)
BASE="$PR_BASE_SHA"
HEAD="$PR_HEAD_SHA"
;;
*)
BASE="$PUSH_BEFORE"
HEAD="$PUSH_AFTER"
;;
esac
# On push events with shallow clones, BASE may be present in
# the event payload but absent from the local object DB
# (fetch-depth=2 doesn't always reach the previous commit
# across true merges). Try fetching it on demand. If the
# fetch fails — e.g. the SHA was force-overwritten — we fall
# through to the empty-BASE branch below, which scans the
# entire tree as if every file were new. Correct, just slow.
if [ -n "$BASE" ] && ! echo "$BASE" | grep -qE '^0+$'; then
if ! git cat-file -e "$BASE" 2>/dev/null; then
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
fi
fi
# Files added or modified in this change.
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$' || ! git cat-file -e "$BASE" 2>/dev/null; then
# New branch / no previous SHA / BASE unreachable — check
# the entire tree as if every file were new. Slower but
# correct on first push or post-fetch-failure recovery.
CHANGED=$(git ls-tree -r --name-only HEAD)
else
CHANGED=$(git diff --name-only --diff-filter=AM "$BASE" "$HEAD")
fi
if [ -z "$CHANGED" ]; then
echo "No changed files to inspect."
exit 0
fi
OFFENDING=""
for path in $CHANGED; do
for pattern in "${FORBIDDEN_PATTERNS[@]}"; do
if echo "$path" | grep -qE "$pattern"; then
OFFENDING="${OFFENDING}${path} (matched: ${pattern})\n"
break
fi
done
done
if [ -n "$OFFENDING" ]; then
echo "::error::Forbidden internal-flavored paths detected:"
printf "$OFFENDING"
echo ""
echo "These paths belong in molecule-ai/internal, not this public repo."
echo "See docs/internal-content-policy.md for canonical locations."
echo ""
echo "If your file is genuinely public-facing (e.g. a blog post"
echo "ready to ship), use one of these alternatives instead:"
echo " - Public-bound blog posts: docs/blog/<slug>.md"
echo " - Public-bound tutorials: docs/tutorials/<slug>.md"
echo " - Public devrel content: docs/devrel/<slug>.md"
echo ""
echo "If you legitimately need to add a new top-level path that"
echo "happens to match a forbidden pattern, edit"
echo ".gitea/workflows/block-internal-paths.yml and update the"
echo "FORBIDDEN_PATTERNS list with reviewer signoff."
exit 1
fi
echo "OK No forbidden paths in this change."

View File

@ -1,58 +0,0 @@
name: cascade-list-drift-gate
# Ported from .github/workflows/cascade-list-drift-gate.yml on 2026-05-11
# per RFC internal#219 §1 sweep.
#
# Differences from the GitHub version:
# - on.paths reference .gitea/workflows/publish-runtime.yml (the active
# Gitea workflow file) instead of .github/workflows/publish-runtime.yml
# (which Category A of this sweep deletes).
# - Explicit `WORKFLOW=` arg passed to the drift script so it audits the
# .gitea/ workflow (the script's default is still .github/... which
# will not exist post-Cat-A).
# - Workflow-level env.GITHUB_SERVER_URL set per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on the job (RFC §1 contract — surface
# defects without blocking; follow-up PR flips after triage).
#
# Structural gate: TEMPLATES list in publish-runtime.yml must match
# manifest.json's workspace_templates exactly. Closes the recurrence
# path of PR #2556 (the data fix) and is the first concrete deliverable
# of RFC #388 PR-3.
#
# Triggers narrowly to keep CI quiet: only on PRs that actually change
# one of the two files. The path-filtered split + always-emit-result
# pattern (memory: "Required check names need a job that always runs")
# is unnecessary here because the workflow IS the check name and PR
# branch protection should require it directly. Future-proof: if this
# becomes a required check, add a no-op aggregator with always() so the
# name still emits when paths don't match.
on:
pull_request:
branches: [staging, main]
paths:
- manifest.json
- .gitea/workflows/publish-runtime.yml
- scripts/check-cascade-list-vs-manifest.sh
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
permissions:
contents: read
jobs:
check:
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
# the PR. Follow-up PR flips this off after surfaced defects are
# triaged.
continue-on-error: true
steps:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
- name: Check cascade list matches manifest
# Pass the .gitea/ workflow path explicitly — the script's
# default still points at .github/... which Category A of this
# sweep removes.
run: bash scripts/check-cascade-list-vs-manifest.sh manifest.json .gitea/workflows/publish-runtime.yml

View File

@ -1,74 +0,0 @@
name: Check migration collisions
# Ported from .github/workflows/check-migration-collisions.yml on 2026-05-11
# per RFC internal#219 §1 sweep.
#
# Differences from the GitHub version:
# - on.paths includes .gitea/workflows/check-migration-collisions.yml
# (this file) instead of the .github/ one.
# - Workflow-level env.GITHUB_SERVER_URL pinned to https://git.moleculesai.app
# so scripts/ops/check_migration_collisions.py can derive the Gitea API
# base (the script already supports this; see _gitea_api_url()).
# - `continue-on-error: true` on the job (RFC §1 contract).
#
# Hard gate (#2341): fails a PR that adds a migration prefix already
# claimed by the base branch or another open PR. Caught manually 2026-04-30
# during PR #2276 rebase: 044_runtime_image_pins collided with
# 044_platform_inbound_secret from RFC #2312. This workflow makes that
# check automatic.
#
# Trigger model: pull_request only — there's no value running this on
# pushes to staging or main (those are post-merge; the gate must fire
# pre-merge to be useful). Path filter scopes to PRs that actually touch
# migrations.
on:
pull_request:
types: [opened, synchronize, reopened]
paths:
- 'workspace-server/migrations/**'
- 'scripts/ops/check_migration_collisions.py'
- '.gitea/workflows/check-migration-collisions.yml'
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
permissions:
contents: read
# API needs read access to other PRs to detect cross-PR collisions
pull-requests: read
jobs:
check:
name: Migration version collision check
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
# the PR. Follow-up PR flips this off after surfaced defects are
# triaged.
continue-on-error: true
timeout-minutes: 5
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
# Need history to diff against base ref
fetch-depth: 0
- name: Detect collisions
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
BASE_REF: origin/${{ github.event.pull_request.base.ref }}
HEAD_REF: ${{ github.event.pull_request.head.sha }}
GITHUB_REPOSITORY: ${{ github.repository }}
# Auto-injected; Gitea aliases this for in-repo API access.
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Ensure the named base ref exists locally. checkout@v4 with
# fetch-depth=0 pulls full history, but the explicit fetch is
# cheap insurance against form-of-ref differences across runs.
#
# IMPORTANT: do NOT pass --depth=1 here. The script below uses
# `git diff origin/<base>...<head>` (three-dot, merge-base form),
# which fails with "fatal: no merge base" if the base ref is
# shallow.
git fetch origin "${{ github.event.pull_request.base.ref }}" || true
python3 scripts/ops/check_migration_collisions.py
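The three-dot diff the script depends on behaves differently from the two-dot form; a minimal illustration (branch names assumed):

```bash
# Three-dot: diffs from the merge base, i.e. only what this branch introduced.
git diff --name-only origin/main...HEAD
# Two-dot: tree-to-tree; also reflects commits main gained since branching.
git diff --name-only origin/main HEAD
# With a shallow base ref the three-dot form aborts: "fatal: no merge base".
```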

View File

@ -1,107 +0,0 @@
# ci-required-drift — hourly sentinel for drift between the canonical
# "what counts as required" sources of truth in this repo:
#
# 1. `.gitea/workflows/ci.yml` jobs (CI source)
# 2. `branch_protections/{main,staging}.status_check_contexts`
# (protection)
# 3. `.gitea/workflows/audit-force-merge.yml` REQUIRED_CHECKS env
# (audit env)
#
# RFC: internal#219 §4 (jobs ↔ protection) + §6 (audit env ↔ protection).
# Ported verbatim-then-adapted from molecule-controlplane PR#112
# (SHA 0adf2098) per RFC internal#219 Phase 2b+c — replicate repo-by-repo.
#
# When any pair diverges, a `[ci-drift]` issue is opened or updated
# (idempotent by title) and labelled `tier:high`. This is the
# auto-detection that closes the regression class identified in
# RFC §1 finding 3 (protection only listed 2 of 6 real jobs for
# ~weeks, undetected) and §6 (audit env drifts silently from
# protection).
#
# Diff logic lives in `.gitea/scripts/ci-required-drift.py`. The
# Python file does YAML AST parsing + `needs:` graph walking per
# `feedback_behavior_based_ast_gates` — NOT grep-by-name. That way
# job renames or matrix-expansion-induced churn produce honest signal.
#
# IMPORTANT — TRANSITIONAL STATE: molecule-core's ci.yml does NOT yet
# contain the `all-required` sentinel job (RFC §4 Phase 4 adds it).
# Until Phase 4 lands the detector will hard-fail with exit 3 on the
# missing sentinel. That's intentional: a red workflow on an hourly
# cron is louder than a silent issue and forces Phase 4 to land soon.
name: ci-required-drift
# IMPORTANT — Gitea 1.22.6 parser quirk per
# `feedback_gitea_workflow_dispatch_inputs_unsupported`: do NOT add an
# `inputs:` block here, even though stock GitHub Actions allows it.
# Gitea 1.22.6 flattens `workflow_dispatch.inputs.X` into a sibling of
# the `on:` event keys and rejects the entire workflow as
# "unknown on type". The whole file then registers for ZERO events
# (no schedule, no dispatch). When Gitea ≥ 1.23 lands fleet-wide,
# this constraint can be revisited.
on:
schedule:
# Hourly at :17 — offset from :00 to spread load away from the
# peak when N cron workflows fire on the hour-boundary, per
# RFC §4 cadence ("off-zero").
- cron: '17 * * * *'
workflow_dispatch:
# Read protection + read CI YAML + write issue. No write on contents.
permissions:
contents: read
issues: write
# Serialise — two simultaneous drift runs would duel on the issue
# create/update path. The audit is idempotent, but parallel POSTs
# can produce duplicate comments before the title-search dedup wins.
concurrency:
group: ci-required-drift
cancel-in-progress: false
jobs:
drift:
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- name: Check out repo (we read the YAML files locally)
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Set up Python (PyYAML for AST parsing)
# Avoid a system-pip install on the runner; setup-python pins
# a hermetic interpreter + cache. PyYAML is small enough that
# the install is sub-2s — no need to cache wheels.
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
- name: Install PyYAML
run: python -m pip install --quiet 'PyYAML==6.0.2'
- name: Run drift detector
env:
# GITEA_TOKEN reads protection + writes issues. molecule-core
# uses `SOP_TIER_CHECK_TOKEN` as the org-level secret name for
# read-only Gitea API access from CI (set by audit-force-merge
# and sop-tier-check too). Falls back to the auto-injected
# GITHUB_TOKEN if the org-level secret isn't set
# (transitional repos).
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
GITEA_HOST: git.moleculesai.app
REPO: ${{ github.repository }}
# Branches whose protection we compare against. molecule-core
# currently has main protected; staging protection is
# forthcoming. Keep this list in sync if a new long-lived
# branch gets protected (e.g. release/* if introduced later).
BRANCHES: 'main staging'
# The sentinel job's name inside ci.yml. If the aggregator
# is ever renamed, update this too (the drift detector
# currently treats `all-required` as the source of "what
# the sentinel claims to require").
SENTINEL_JOB: 'all-required'
# Path to the audit workflow whose REQUIRED_CHECKS env we
# cross-check against protection (RFC §6).
AUDIT_WORKFLOW_PATH: '.gitea/workflows/audit-force-merge.yml'
# Path to the CI workflow with the sentinel + the jobs.
CI_WORKFLOW_PATH: '.gitea/workflows/ci.yml'
# Issue label applied on file/update. `tier:high` exists in
# the molecule-core label set (verified 2026-05-11, label id 9).
DRIFT_LABEL: 'tier:high'
run: python3 .gitea/scripts/ci-required-drift.py
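A rough sketch of the §4 (F1-style) comparison the detector performs, assuming PyYAML is installed and the job/sentinel names match the env above; the real detector also handles event-gated jobs and the protection/audit cross-checks:

```bash
python3 - <<'PY'
# Sketch only: which ci.yml jobs does the sentinel NOT gate?
import yaml
wf = yaml.safe_load(open(".gitea/workflows/ci.yml"))
jobs = set(wf.get("jobs") or {})
sentinel = (wf.get("jobs") or {}).get("all-required") or {}
needs = set(sentinel.get("needs") or [])
print("jobs not gated by the sentinel:", sorted(jobs - needs - {"all-required"}))
PY
```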

View File

@ -1,453 +0,0 @@
# Ported from .github/workflows/ci.yml on 2026-05-11 per RFC internal#219 §1.
# continue-on-error: true on every job; follow-up PR will flip required after
# surfaced bugs are fixed (per RFC §1 — "surface broken workflows without
# blocking"). The four-surface migration audit
# (feedback_gitea_actions_migration_audit_pattern) was performed against this
# port:
#
# 1. YAML — dropped `merge_group` trigger (no Gitea merge queue); no
# `workflow_dispatch.inputs` to drop (Gitea 1.22.6 rejects those —
# feedback_gitea_workflow_dispatch_inputs_unsupported); no `environment:`
# blocks; kept `runs-on: ubuntu-latest` (Gitea runner pool advertises
# this label per agent_labels in action_runner table). Workflow-level
# env.GITHUB_SERVER_URL set as belt-and-suspenders against runner
# defaults (feedback_act_runner_github_server_url).
#
# 2. Cache — `actions/upload-artifact@v3.2.2` was already pinned to v3 for
# Gitea act_runner v0.6 compatibility (a comment in the original called
# this out). v4+ is incompatible with Gitea 1.22.x. No `actions/cache`
# usage to audit. `actions/setup-python@v6` `cache: pip` is left in
# place — works against Gitea's built-in cache server when runner.cache
# is configured (currently is, /opt/molecule/runners/config.yaml).
#
# 3. Token — workflow uses no custom dispatch tokens. The auto-injected
# `GITHUB_TOKEN` (which Gitea aliases to a runner-scoped token) is
# sufficient for `actions/checkout` against this same repo.
#
# 4. Docs — no docs/scripts reference github.com URLs that need swapping.
# The canvas-deploy-reminder step writes a `ghcr.io/...` image
# reference into the step summary text — that's documentation prose
# pointing at the ECR-mirrored canvas image and stays unchanged for
# this port (a separate cleanup if ghcr→ECR sweep is in scope).
#
# Cross-links:
# - RFC: internal#219 (CI/CD hard-gate hardening)
# - Reference port style: molecule-controlplane/.gitea/workflows/ci.yml
# - Bugs that may surface immediately and are tracked separately:
# internal#214 (Go-side vanity-import / go.sum drift, if any)
# - Phase 4 (this PR's follow-up): flip `continue-on-error: false` once
# surfaced defects are fixed, then add `all-required` aggregator
# sentinel (RFC §2) and PATCH branch protection (Phase 4 scope).
name: CI
on:
push:
branches: [main, staging]
pull_request:
branches: [main, staging]
# `merge_group` (GitHub merge-queue trigger) dropped — Gitea has no merge
# queue. The .github/ original retains it; this Gitea-side copy drops it.
# Cancel in-progress CI runs when a new commit arrives on the same ref.
# Stale runs queue up otherwise. PR refs and main/staging refs each get
# their own group because github.ref differs.
concurrency:
group: ci-${{ github.ref }}
cancel-in-progress: true
env:
# Belt-and-suspenders against the runner-default trap
# (feedback_act_runner_github_server_url). Runners are configured with
# this env via /opt/molecule/runners/config.yaml runner.envs, but pinning
# at the workflow level protects against a runner regenerated without
# the config file (feedback_act_runner_needs_config_file_env).
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
# Detect which paths changed so downstream jobs can skip when only
# docs/markdown files were modified.
changes:
name: Detect changes
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
# the PR. Follow-up PR flips this off after the surfaced defects
# (if any) are triaged.
continue-on-error: true
outputs:
platform: ${{ steps.check.outputs.platform }}
canvas: ${{ steps.check.outputs.canvas }}
python: ${{ steps.check.outputs.python }}
scripts: ${{ steps.check.outputs.scripts }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
- id: check
run: |
# For PR events: diff against the base branch (not HEAD~1 of the branch,
# which may be unrelated after force-pushes). When a push updates a PR,
# both pull_request and push events fire — prefer the PR base so that
# the diff is always computed against the actual merge base, not the
# previous SHA on the branch which may be on a different history line.
BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
# GITHUB_BASE_REF is set for PR events (the base branch name).
# For pull_request events we use the stored base.sha; for push events
# (or when base.sha is unavailable) fall back to github.event.before.
if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
BASE="${{ github.event.pull_request.base.sha }}"
fi
# Fallback: if BASE is empty or all zeros (new branch), run everything
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
echo "platform=true" >> "$GITHUB_OUTPUT"
echo "canvas=true" >> "$GITHUB_OUTPUT"
echo "python=true" >> "$GITHUB_OUTPUT"
echo "scripts=true" >> "$GITHUB_OUTPUT"
exit 0
fi
# Both .github/workflows/ci.yml AND .gitea/workflows/ci.yml count
# as "this workflow changed" — either edit should force-run every
# downstream job. The Gitea port follows the same shape as the
# GitHub original so behavior matches when triggered on either
# platform.
DIFF=$(git diff --name-only "$BASE" HEAD 2>/dev/null || echo ".gitea/workflows/ci.yml")
echo "platform=$(echo "$DIFF" | grep -qE '^workspace-server/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
echo "canvas=$(echo "$DIFF" | grep -qE '^canvas/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
echo "python=$(echo "$DIFF" | grep -qE '^workspace/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
echo "scripts=$(echo "$DIFF" | grep -qE '^tests/e2e/|^scripts/|^infra/scripts/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
# Platform (Go) — Go build/vet/test/lint + coverage gates. The always-run
# + per-step gating shape preserves the GitHub-side required-check name
# contract (so when this Gitea port becomes a required check in Phase 4,
# the name match works on PRs that don't touch workspace-server/).
platform-build:
name: Platform (Go)
needs: changes
runs-on: ubuntu-latest
continue-on-error: true
defaults:
run:
working-directory: workspace-server
steps:
- if: needs.changes.outputs.platform != 'true'
working-directory: .
run: echo "No platform/** changes — skipping real build steps; this job always runs to satisfy the required-check name on branch protection."
- if: needs.changes.outputs.platform == 'true'
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- if: needs.changes.outputs.platform == 'true'
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
with:
go-version: 'stable'
- if: needs.changes.outputs.platform == 'true'
run: go mod download
- if: needs.changes.outputs.platform == 'true'
run: go build ./cmd/server
# CLI (molecli) moved to standalone repo: git.moleculesai.app/molecule-ai/molecule-cli
- if: needs.changes.outputs.platform == 'true'
run: go vet ./... || true
- if: needs.changes.outputs.platform == 'true'
name: Run golangci-lint
run: golangci-lint run --timeout 3m ./... || true
- if: needs.changes.outputs.platform == 'true'
name: Run tests with race detection and coverage
run: go test -race -coverprofile=coverage.out ./...
- if: needs.changes.outputs.platform == 'true'
name: Per-file coverage report
# Advisory — lists every source file with its coverage so reviewers
# can see at-a-glance where gaps are. Sorted ascending so the worst
# offenders float to the top. Does NOT fail the build; the hard
# gate is the threshold check below. (#1823)
run: |
echo "=== Per-file coverage (worst first) ==="
go tool cover -func=coverage.out \
| grep -v '^total:' \
| awk '{file=$1; sub(/:[0-9][0-9.]*:.*/, "", file); pct=$NF; gsub(/%/,"",pct); s[file]+=pct; c[file]++}
END {for (f in s) printf "%6.1f%% %s\n", s[f]/c[f], f}' \
| sort -n
- if: needs.changes.outputs.platform == 'true'
name: Check coverage thresholds
# Enforces two gates from #1823 Layer 1:
# 1. Total floor (25% — ratchet plan in COVERAGE_FLOOR.md).
# 2. Per-file floor — non-test .go files in security-critical
# paths with coverage <10% fail the build, UNLESS the file
# path is listed in .coverage-allowlist.txt (acknowledged
# historical debt with a tracking issue + expiry).
run: |
set -e
TOTAL_FLOOR=25
# Security-critical paths where a 0%-coverage file is a real risk.
CRITICAL_PATHS=(
"internal/handlers/tokens"
"internal/handlers/workspace_provision"
"internal/handlers/a2a_proxy"
"internal/handlers/registry"
"internal/handlers/secrets"
"internal/middleware/wsauth"
"internal/crypto"
)
TOTAL=$(go tool cover -func=coverage.out | grep '^total:' | awk '{print $3}' | sed 's/%//')
echo "Total coverage: ${TOTAL}%"
if awk "BEGIN{exit !($TOTAL < $TOTAL_FLOOR)}"; then
echo "::error::Total coverage ${TOTAL}% is below the ${TOTAL_FLOOR}% floor. See COVERAGE_FLOOR.md for ratchet plan."
exit 1
fi
# Aggregate per-file coverage → /tmp/perfile.txt: "<fullpath> <pct>"
go tool cover -func=coverage.out \
| grep -v '^total:' \
| awk '{file=$1; sub(/:[0-9][0-9.]*:.*/, "", file); pct=$NF; gsub(/%/,"",pct); s[file]+=pct; c[file]++}
END {for (f in s) printf "%s %.1f\n", f, s[f]/c[f]}' \
> /tmp/perfile.txt
# Build allowlist — paths relative to workspace-server, one per line.
# Lines starting with # are comments.
ALLOWLIST=""
if [ -f ../.coverage-allowlist.txt ]; then
ALLOWLIST=$(grep -vE '^(#|[[:space:]]*$)' ../.coverage-allowlist.txt || true)
fi
FAILED=0
WARNED=0
for path in "${CRITICAL_PATHS[@]}"; do
while read -r file pct; do
[[ "$file" == *_test.go ]] && continue
[[ "$file" == *"$path"* ]] || continue
awk "BEGIN{exit !($pct < 10)}" || continue
# Strip the package-import prefix so we can match .coverage-allowlist.txt
# entries written as paths relative to workspace-server/.
# Handle both module paths: platform/workspace-server/... and platform/...
rel=$(echo "$file" | sed 's|^github.com/molecule-ai/molecule-monorepo/platform/workspace-server/||; s|^github.com/molecule-ai/molecule-monorepo/platform/||')
if echo "$ALLOWLIST" | grep -qxF "$rel"; then
echo "::warning file=workspace-server/$rel::Critical file at ${pct}% coverage (allowlisted, #1823) — fix before expiry."
WARNED=$((WARNED+1))
else
echo "::error file=workspace-server/$rel::Critical file at ${pct}% coverage — must be >=10% (target 80%). See #1823. To acknowledge as known debt, add this path to .coverage-allowlist.txt."
FAILED=$((FAILED+1))
fi
done < /tmp/perfile.txt
done
echo ""
echo "Critical-path check: $FAILED new failures, $WARNED allowlisted warnings."
if [ "$FAILED" -gt 0 ]; then
echo ""
echo "$FAILED security-critical file(s) have <10% test coverage and are"
echo "NOT in the allowlist. These paths handle auth, tokens, secrets, or"
echo "workspace provisioning — a 0% file here is the exact gap that let"
echo "CWE-22, CWE-78, KI-005 slip through in past incidents. Either:"
echo " (a) add tests to raise coverage above 10%, or"
echo " (b) add the path to .coverage-allowlist.txt with an expiry date"
echo " and a tracking issue reference."
exit 1
fi
# Canvas (Next.js) — required check, always runs. Same always-run +
# per-step gating shape as platform-build. The two-job-sharing-name
# pattern attempted in PR #2321 doesn't satisfy branch protection
# (SKIPPED siblings count as not-passed regardless of SUCCESS
# siblings — verified empirically on PR #2314).
canvas-build:
name: Canvas (Next.js)
needs: changes
runs-on: ubuntu-latest
continue-on-error: true
defaults:
run:
working-directory: canvas
steps:
- if: needs.changes.outputs.canvas != 'true'
working-directory: .
run: echo "No canvas/** changes — skipping real build steps; this job always runs to satisfy the required-check name on branch protection."
- if: needs.changes.outputs.canvas == 'true'
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- if: needs.changes.outputs.canvas == 'true'
uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
with:
node-version: '22'
- if: needs.changes.outputs.canvas == 'true'
run: rm -f package-lock.json && npm install
- if: needs.changes.outputs.canvas == 'true'
run: npm run build
- if: needs.changes.outputs.canvas == 'true'
name: Run tests with coverage
# Coverage instrumentation is configured in canvas/vitest.config.ts
# (provider: v8, reporters: text + html + json-summary). Step 2 of
# #1815 — wires coverage into CI so we get a baseline visible on
# every PR. No threshold gate yet; thresholds dial in (Step 3, also
# tracked in #1815) after the team sees what current coverage is.
run: npx vitest run --coverage
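# Sketch of the cited vitest coverage block (canvas/vitest.config.ts,
# assuming the provider + reporters named above):
#   test: { coverage: { provider: 'v8', reporter: ['text', 'html', 'json-summary'] } }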
- name: Upload coverage summary as artifact
if: needs.changes.outputs.canvas == 'true' && always()
# Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
# the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
# implement, surfacing as `GHESNotSupportedError: @actions/artifact
# v2.0.0+, upload-artifact@v4+ and download-artifact@v4+ are not
# currently supported on GHES`. Drop this pin when Gitea ships
# the v4 protocol (tracked: post-Gitea-1.23 followup).
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
with:
name: canvas-coverage-${{ github.run_id }}
path: canvas/coverage/
retention-days: 7
if-no-files-found: warn
# Shellcheck (E2E scripts) — required check, always runs.
shellcheck:
name: Shellcheck (E2E scripts)
needs: changes
runs-on: ubuntu-latest
continue-on-error: true
steps:
- if: needs.changes.outputs.scripts != 'true'
run: echo "No tests/e2e/ or infra/scripts/ changes — skipping real shellcheck; this job always runs to satisfy the required-check name on branch protection."
- if: needs.changes.outputs.scripts == 'true'
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- if: needs.changes.outputs.scripts == 'true'
name: Run shellcheck on tests/e2e/*.sh and infra/scripts/*.sh
# shellcheck is pre-installed on ubuntu-latest runners (via apt).
# infra/scripts/ is included because setup.sh + nuke.sh gate the
# README quickstart — a shellcheck regression there silently breaks
# new-user onboarding. scripts/ is intentionally excluded until its
# pre-existing SC3040/SC3043 warnings are cleaned up.
run: |
find tests/e2e infra/scripts -type f -name '*.sh' -print0 \
| xargs -0 shellcheck --severity=warning
- if: needs.changes.outputs.scripts == 'true'
name: Lint cleanup-trap hygiene (RFC #2873)
run: bash tests/e2e/lint_cleanup_traps.sh
- if: needs.changes.outputs.scripts == 'true'
name: Run E2E bash unit tests (no live infra)
run: |
bash tests/e2e/test_model_slug.sh
canvas-deploy-reminder:
name: Canvas Deploy Reminder
runs-on: ubuntu-latest
continue-on-error: true
needs: [changes, canvas-build]
# Only fires on direct pushes to main (i.e. after staging→main promotion).
if: needs.changes.outputs.canvas == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main'
steps:
- name: Write deploy reminder to step summary
env:
COMMIT_SHA: ${{ github.sha }}
# github.server_url resolves via the workflow-level env override
# to the Gitea instance, so the RUN_URL points at the Gitea run
# page (not github.com). See feedback_act_runner_github_server_url.
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
run: |
# Write body to a temp file — avoids backtick escaping in shell.
cat > /tmp/deploy-reminder.md << 'BODY'
## Canvas build passed — deploy required
The `publish-canvas-image` workflow is now building a fresh Docker image
(`ghcr.io/molecule-ai/canvas:latest`) in the background.
Once it completes (~35 min), apply on the host machine with:
```bash
cd <runner-workspace>
git pull origin main
docker compose pull canvas && docker compose up -d canvas
```
If you need to rebuild from local source instead (e.g. testing unreleased
changes or a new `NEXT_PUBLIC_*` URL), use:
```bash
docker compose build canvas && docker compose up -d canvas
```
BODY
printf '\n> Posted automatically by CI · commit `%s` · [build log](%s)\n' \
"$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-reminder.md
# Gitea has no commit-comments API; write to GITHUB_STEP_SUMMARY,
# which both GitHub Actions and Gitea Actions render as the
# workflow run's summary page. (#75 / PR-D)
cat /tmp/deploy-reminder.md >> "$GITHUB_STEP_SUMMARY"
# Python Lint & Test — required check, always runs.
python-lint:
name: Python Lint & Test
needs: changes
runs-on: ubuntu-latest
continue-on-error: true
env:
WORKSPACE_ID: test
defaults:
run:
working-directory: workspace
steps:
- if: needs.changes.outputs.python != 'true'
working-directory: .
run: echo "No workspace/** changes — skipping real lint+test; this job always runs to satisfy the required-check name on branch protection."
- if: needs.changes.outputs.python == 'true'
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- if: needs.changes.outputs.python == 'true'
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.11'
cache: pip
cache-dependency-path: workspace/requirements.txt
- if: needs.changes.outputs.python == 'true'
run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov 'sqlalchemy>=2.0.0'
# Coverage flags + fail-under floor moved into workspace/pytest.ini
# (issue #1817) so local `pytest` and CI use identical config.
- if: needs.changes.outputs.python == 'true'
run: python -m pytest --tb=short
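# Sketch of the cited pytest.ini shape (assuming the 86% total floor
# referenced in the step below):
#   [pytest]
#   addopts = --cov=. --cov-fail-under=86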
- if: needs.changes.outputs.python == 'true'
name: Per-file critical-path coverage (MCP / inbox / auth)
# MCP-critical Python files have a per-file floor on top of the
# 86% total floor in pytest.ini. See issue #2790 for full rationale.
run: |
set -e
PER_FILE_FLOOR=75
CRITICAL_FILES=(
"a2a_mcp_server.py"
"mcp_cli.py"
"a2a_tools.py"
"a2a_tools_inbox.py"
"inbox.py"
"platform_auth.py"
)
# pytest already wrote .coverage; emit a JSON view scoped to
# the critical files so jq/python can read the per-file pct
# without parsing tabular text.
INCLUDES=$(printf '*%s,' "${CRITICAL_FILES[@]}")
INCLUDES="${INCLUDES%,}"
python -m coverage json -o /tmp/critical-cov.json --include="$INCLUDES"
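# Resulting JSON shape read by the jq below (abridged; coverage.py's
# documented format, keyed by the file path as coverage recorded it):
#   {"files": {"a2a_mcp_server.py": {"summary": {"percent_covered": 81.2}}}}
# INCLUDES expands to e.g. *a2a_mcp_server.py,*mcp_cli.py,... (trailing
# comma stripped above).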
FAILED=0
for f in "${CRITICAL_FILES[@]}"; do
pct=$(jq -r --arg f "$f" '.files | to_entries | map(select(.key == $f)) | .[0].value.summary.percent_covered // "MISSING"' /tmp/critical-cov.json)
if [ "$pct" = "MISSING" ]; then
echo "::error file=workspace/$f::No coverage data — file may have moved or test exclusion mis-set."
FAILED=$((FAILED+1))
continue
fi
echo "$f: ${pct}%"
if awk "BEGIN{exit !($pct < $PER_FILE_FLOOR)}"; then
echo "::error file=workspace/$f::${pct}% < ${PER_FILE_FLOOR}% per-file floor (MCP critical path). See COVERAGE_FLOOR.md."
FAILED=$((FAILED+1))
fi
done
if [ "$FAILED" -gt 0 ]; then
echo ""
echo "$FAILED MCP critical-path file(s) below the ${PER_FILE_FLOOR}% per-file floor."
echo "These paths handle multi-tenant routing, auth tokens, and inbox dispatch."
echo "A coverage drop here is the same risk shape as Go-side tokens/secrets files"
echo "dropping below 10% (see COVERAGE_FLOOR.md). Either:"
echo " (a) add tests to raise coverage back above ${PER_FILE_FLOOR}%, or"
echo " (b) if this is unavoidable historical debt, file an issue and propose"
echo " adjusting the floor with rationale in COVERAGE_FLOOR.md."
exit 1
fi

View File

@ -1,255 +0,0 @@
name: Continuous synthetic E2E (staging)
# Ported from .github/workflows/continuous-synth-e2e.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
#
# Hard gate (#2342): cron-driven full-lifecycle E2E that catches
# regressions visible only at runtime — schema drift, deployment-pipeline
# gaps, vendor outages, env-var rotations, DNS / CF / Railway side-effects.
#
# Why this gate exists:
# PR-time CI catches code-level regressions but not deployment-time or
# integration-time ones. Today's empirical data:
# • #2345 (A2A v0.2 silent drop) — passed all unit tests, broke at
# JSON-RPC parse layer between sender and receiver. Visible only
# to a sender exercising the full path.
# • RFC #2312 chat upload — landed on staging-branch but never
# reached staging tenants because publish-workspace-server-image
# was main-only. Caught by manual dogfooding hours after deploy.
# Both would have surfaced within 15-20 min of regression if a
# continuous synth-E2E was running.
#
# Cadence: nominally every 10 min (6 slots/hour; ~2 effective
# fires/hour under the observed GH drop ratio, per the cron comment
# below). The script is conservatively bounded at 10 min wall-clock;
# even on degraded staging it should finish before the next surviving
# firing. cron-overlap is guarded by the concurrency group below.
#
# Cost: 2-6 runs/hour (6 nominal slots, ~2 effective under drops)
# × 5-10 min × $0.008/min GHA = roughly $0.50-$3/day worst case.
# Plus a fresh tenant provisioned + torn down each run (Railway +
# AWS pennies). Negligible.
#
# Failure handling: when the run fails, the workflow exits non-zero
# and GitHub's standard email/notification path fires. Operators
# can subscribe to this workflow's failure channel for paging-grade
# alerting.
on:
schedule:
# Every 10 minutes, on :02 :12 :22 :32 :42 :52. Three constraints:
# 1. Stay off the top-of-hour. GitHub Actions scheduler drops
# :00 firings under high load (own docs:
# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule).
# Prior history: cron was '0,20,40' (2026-05-02) — only :00
# ever survived. Bumped to '10,30,50' (2026-05-03) on the
# theory that further-from-:00 wins. Empirically 2026-05-04
# that ALSO dropped to ~60 min effective cadence (only ~1
# schedule fire per hour — see molecule-core#2726). Detection
# latency was claimed 20 min, actual 60 min.
# 2. Avoid colliding with the existing :15 sweep-cf-orphans
# and :45 sweep-cf-tunnels — both hit the CF API and we
# don't want to fight for rate-limit tokens.
# 3. Avoid the :30 heavy slot (staging-smoke /30, sweep-aws-
# secrets, sweep-stale-e2e-orgs every :15) — multiple
# overlapping cron registrations on the same minute is part
# of what GH drops under load.
# Solution: bump fires-per-hour 3 → 6 AND keep all slots in clean
# lanes (1-3 min away from any other cron). Even with empirically-
# observed ~67% GH drop ratio, 6 attempts/hour yields ~2 effective
# fires = ~30 min cadence; closer to the 20-min target than the
# current shape and provides a real degradation alarm if drops
# get worse.
- cron: '2,12,22,32,42,52 * * * *'
permissions:
contents: read
# No issue-write here — failures surface as red runs in the workflow
# history. If you want auto-issue-on-fail, add a follow-up step that
# uses gh issue create gated on `if: failure()`. Keeping the surface
# minimal until that's actually wanted.
# Serialize so two firings can never overlap. Cron slots are 10 min
# apart and the script is bounded at 10 min wall-clock, so back-to-back
# surviving firings can abut; and if a run hangs we don't want N more
# stacking up behind it.
concurrency:
group: continuous-synth-e2e
cancel-in-progress: false
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
synth:
name: Synthetic E2E against staging
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
# Bumped from 12 → 20 (2026-05-04). Tenant user-data install phase
# (apt-get update + install docker.io/jq/awscli/caddy + snap install
# ssm-agent) runs from raw Ubuntu on every boot — none of it is
# pre-baked into the tenant AMI. Empirical fetch_secrets/ok timing
# across today's canaries: 51s → 82s → 143s → 625s. apt-mirror tail
# latency drives the boot-to-fetch_secrets phase from ~1min to >10min.
# A 12min budget leaves only ~2min for the workspace (which needs
# ~3.5min for claude-code cold boot) on slow-apt days, blowing the
# budget. 20min absorbs the worst tenant tail so the workspace probe
# gets the full ~7min it needs even on a slow apt day. Real fix:
# pre-bake caddy + ssm-agent into the tenant AMI (controlplane#TBD).
timeout-minutes: 20
env:
# claude-code default: cold-start ~5 min (comparable to langgraph),
# but uses MiniMax-M2.7-highspeed via the template's third-party-
# Anthropic-compat path (workspace-configs-templates/claude-code-
# default/config.yaml:64-69). MiniMax is ~5-10x cheaper than
# gpt-4.1-mini per token AND avoids the recurring OpenAI quota-
# exhaustion class that took the canary down 2026-05-03 (#265).
# Operators can pick langgraph / hermes via workflow_dispatch
# when they specifically need to exercise the OpenAI or SDK-
# native paths.
E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }}
# Pin the canary to a specific MiniMax model rather than relying
# on the per-runtime default ("sonnet" → routes to direct
# Anthropic, defeats the cost saving). Operators can override
# via workflow_dispatch by setting a different E2E_MODEL_SLUG
# input if they need to exercise a specific model. M2.7-highspeed
# is "Token Plan only" but cheap-per-token and fast.
E2E_MODEL_SLUG: ${{ github.event.inputs.model_slug || 'MiniMax-M2.7-highspeed' }}
# Bound to 10 min so a stuck provision fails the run instead of
# holding up the next cron firing. 15-min default in the script
# is for the on-PR full lifecycle where we have more headroom.
E2E_PROVISION_TIMEOUT_SECS: '600'
# Slug suffix — namespaced "synth-" so these runs are
# distinguishable from PR-driven runs in CP admin.
E2E_RUN_ID: synth-${{ github.run_id }}
# Forced false for cron; respected for manual dispatch
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '' }}
MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
# MiniMax key is the canary's PRIMARY auth path. claude-code
# template's `minimax` provider routes ANTHROPIC_BASE_URL to
# api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot.
# tests/e2e/test_staging_full_saas.sh branches SECRETS_JSON on
# which key is present — MiniMax wins when set.
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
# Direct-Anthropic alternative for operators who don't want to
# set up a MiniMax account (priority below MiniMax — first
# non-empty wins in test_staging_full_saas.sh's secrets-injection
# block). See #2578 PR comment for the rationale.
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
# OpenAI fallback — kept wired so operators can dispatch with
# E2E_RUNTIME=langgraph or =hermes and still have a working
# canary path. The script picks the right blob shape based on
# which key is non-empty.
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify required secrets present
run: |
# Hard-fail on missing secret REGARDLESS of trigger. Previously
# this step soft-skipped on workflow_dispatch via `exit 0`, but
# `exit 0` only ends the STEP — subsequent steps still ran with
# the empty secret, the synth script fell through to the wrong
# SECRETS_JSON branch, and the canary failed 5 min later with a
# confusing "Agent error (Exception)" instead of the clean
# "secret missing" message at the top. Caught 2026-05-04 by
# dispatched run 25296530706: claude-code + missing MINIMAX
# silently used OpenAI keys but kept model=MiniMax-M2.7, then
# the workspace 401'd against MiniMax once it tried to call.
# Fix: exit 1 in both cron and dispatch paths. Operators who
# want to verify a YAML change without setting up the secret
# can read the verify-secrets step's stderr — the failure is
# itself the verification signal.
if [ -z "${MOLECULE_ADMIN_TOKEN:-}" ]; then
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret missing — synth E2E cannot run"
echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
exit 1
fi
# LLM-key requirement is per-runtime: claude-code accepts
# EITHER MiniMax OR direct-Anthropic (whichever is set first),
# langgraph + hermes use OpenAI (MOLECULE_STAGING_OPENAI_API_KEY).
case "${E2E_RUNTIME}" in
claude-code)
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
required_secret_value="${E2E_MINIMAX_API_KEY}"
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
required_secret_name="MOLECULE_STAGING_ANTHROPIC_API_KEY"
required_secret_value="${E2E_ANTHROPIC_API_KEY}"
else
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY or MOLECULE_STAGING_ANTHROPIC_API_KEY"
required_secret_value=""
fi
;;
langgraph|hermes)
required_secret_name="MOLECULE_STAGING_OPENAI_API_KEY"
required_secret_value="${E2E_OPENAI_API_KEY:-}"
;;
*)
echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
required_secret_name=""
required_secret_value="present"
;;
esac
if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
echo "::error::${required_secret_name} secret missing — runtime=${E2E_RUNTIME} cannot authenticate against its LLM provider"
echo "::error::Set it at Settings → Secrets and Variables → Actions, OR dispatch with a different runtime"
exit 1
fi
- name: Install required tools
run: |
# The script depends on jq + curl (already on ubuntu-latest)
# and python3 (likewise). Verify they're all present so we
# fail fast on a runner image regression rather than mid-script.
for cmd in jq curl python3; do
command -v "$cmd" >/dev/null 2>&1 || {
echo "::error::required tool '$cmd' not on PATH — runner image regression?"
exit 1
}
done
- name: Run synthetic E2E
# The script handles its own teardown via EXIT trap; even on
# failure (timeout, assertion), the org is deprovisioned and
# leaks are reported. Exit code propagates from the script.
run: |
bash tests/e2e/test_staging_full_saas.sh
- name: Failure summary
# Runs only on failure. Adds a job summary so the workflow run
# page shows a quick "what happened" instead of forcing readers
# to scroll through script output.
if: failure()
run: |
{
echo "## Continuous synth E2E failed"
echo ""
echo "**Run ID:** ${{ github.run_id }}"
echo "**Trigger:** ${{ github.event_name }}"
echo "**Runtime:** ${E2E_RUNTIME}"
echo "**Slug:** synth-${{ github.run_id }}"
echo ""
echo "### What this means"
echo ""
echo "Staging just regressed on a path that previously worked. Likely classes:"
echo "- Schema mismatch between sender and receiver (#2345 class)"
echo "- Deployment-pipeline gap (RFC #2312 / staging-tenant-image-stale class)"
echo "- Vendor outage (Cloudflare, Railway, AWS, GHCR)"
echo "- Staging-CP env var rotation"
echo ""
echo "### Next steps"
echo ""
echo "1. Check the script output above for the assertion that failed"
echo "2. If it's a vendor outage, no action needed — next firing in ~20 min"
echo "3. If it's a code regression, find the causing PR via \`git log\` against last green run and revert/fix"
echo "4. Keep an eye on the next 1-2 firings — flake vs persistent fail differs in priority"
} >> "$GITHUB_STEP_SUMMARY"

View File

@ -1,333 +0,0 @@
name: E2E API Smoke Test
# Ported from .github/workflows/e2e-api.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
#
# Extracted from ci.yml so workflow-level concurrency can protect this job
# from run-level cancellation (issue #458).
#
# Trigger model (revised 2026-04-29):
#
# Always FIRES on push/pull_request to staging+main. Real work is gated
# per-step on `needs.detect-changes.outputs.api` — when paths under
# `workspace-server/`, `tests/e2e/`, or this workflow file haven't
# changed, the no-op step alone runs and emits SUCCESS for the
# `E2E API Smoke Test` check, satisfying branch protection without
# spending CI cycles. See the in-job comment on the `e2e-api` job for
# why this is one job (not two-jobs-sharing-name) and the 2026-04-29
# PR #2264 incident that drove the consolidation.
#
# Parallel-safety (Class B Hongming-owned CICD red sweep, 2026-05-08)
# -------------------------------------------------------------------
# Same substrate hazard as PR #98 (handlers-postgres-integration). Our
# Gitea act_runner runs with `container.network: host` (operator host
# `/opt/molecule/runners/config.yaml`), which means:
#
# * Two concurrent runs both try to bind their `-p 15432:5432` /
# `-p 16379:6379` host ports — the second postgres/redis FATALs
# with `Address in use` and `docker run` returns exit 125 with
# `Conflict. The container name "/molecule-ci-postgres" is already
# in use by container ...`. Verified in run a7/2727 on 2026-05-07.
# * The fixed container names `molecule-ci-postgres` / `-redis` (the
# pre-fix shape) collide on name AS WELL AS port. The cleanup-with-
# `docker rm -f` at the start of the second job KILLS the first
# job's still-running postgres/redis.
#
# Fix shape (mirrors PR #98's bridge-net pattern, adapted because
# platform-server is a Go binary on the host, not a containerised
# step):
#
# 1. Unique container names per run:
# pg-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
# redis-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
# `${RUN_ID}-${RUN_ATTEMPT}` is unique even across reruns of the
# same run_id.
# 2. Ephemeral host port per run (`-p 0:5432`), then read the actual
# bound port via `docker port` and export DATABASE_URL/REDIS_URL
# pointing at it. No fixed host-port → no port collision.
# 3. `127.0.0.1` (NOT `localhost`) in URLs — IPv6 first-resolve was
# the original flake fixed in #92 and the script's still IPv6-
# enabled.
# 4. `if: always()` cleanup so containers don't leak when test steps
# fail.
#
# Issue #94 items #2 + #3 (also fixed here):
# * Pre-pull `alpine:latest` so the platform-server's provisioner
# (`internal/handlers/container_files.go`) can stand up its
# ephemeral token-write helper without a daemon.io round-trip.
# * Create `molecule-core-net` bridge network if missing so the
# provisioner's container.HostConfig {NetworkMode: ...} attach
# succeeds.
# Item #1 (timeouts) — evidence on recent runs (77/3191, ae/4270, 0e/
# 2318) shows Postgres ready in 3s, Redis in 1s, Platform in 1s when
# they DO come up. Timeouts are not the bottleneck; not bumped.
#
# Item explicitly NOT fixed here: failing test `Status back online`
# fails because the platform's langgraph workspace template image
# (ghcr.io/molecule-ai/workspace-template-langgraph:latest) returns
# 403 Forbidden post-2026-05-06 GitHub org suspension. That is a
# template-registry resolution issue (ADR-002 / local-build mode) and
# belongs in a separate change that touches workspace-server, not
# this workflow file.
on:
push:
branches: [main, staging]
pull_request:
branches: [main, staging]
concurrency:
# Per-SHA grouping (changed 2026-04-28 from per-ref). Per-ref had the
# same auto-promote-staging brittleness as e2e-staging-canvas — back-
# to-back staging pushes share refs/heads/staging, so the older push's
# queued run gets cancelled when a newer push lands. Auto-promote-
# staging then sees `completed/cancelled` for the older SHA and stays
# put; the newer SHA's gates may eventually save the day, but if the
# newer push gets cancelled too, we deadlock.
#
# See e2e-staging-canvas.yml's identical concurrency block for the full
# rationale and the 2026-04-28 incident reference.
group: e2e-api-${{ github.event.pull_request.head.sha || github.sha }}
cancel-in-progress: false
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
detect-changes:
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
outputs:
api: ${{ steps.decide.outputs.api }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
- id: decide
# Inline replacement for dorny/paths-filter — same pattern PR#372's
# ci.yml port used. Diffs against the PR base or push BEFORE SHA,
# then matches against the api-relevant path set.
run: |
BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
BASE="${{ github.event.pull_request.base.sha }}"
fi
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
echo "api=true" >> "$GITHUB_OUTPUT"
exit 0
fi
if ! git cat-file -e "$BASE" 2>/dev/null; then
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
fi
if ! git cat-file -e "$BASE" 2>/dev/null; then
echo "api=true" >> "$GITHUB_OUTPUT"
exit 0
fi
CHANGED=$(git diff --name-only "$BASE" HEAD)
if echo "$CHANGED" | grep -qE '^(workspace-server/|tests/e2e/|\.gitea/workflows/e2e-api\.yml$)'; then
echo "api=true" >> "$GITHUB_OUTPUT"
else
echo "api=false" >> "$GITHUB_OUTPUT"
fi
# ONE job (no job-level `if:`) that always runs and reports under the
# required-check name `E2E API Smoke Test`. Real work is gated per-step
# on `needs.detect-changes.outputs.api`. Reason: GitHub registers a
# check run for every job that matches `name:`, and a job-level
# `if: false` produces a SKIPPED check run. Branch protection treats
# all check runs with a matching context name on the latest commit as a
# SET — any SKIPPED in the set fails the required-check eval, even with
# SUCCESS siblings. Verified 2026-04-29 on PR #2264 (staging→main):
# 4 check runs (2 SKIPPED + 2 SUCCESS) at the head SHA blocked
# promotion despite all real work succeeding. Collapsing to a single
# always-running job with conditional steps emits exactly one SUCCESS
# check run regardless of paths filter — branch-protection-clean.
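# Distilled shape (sketch; `detect` / `x` are placeholder names):
#   required-check-job:
#     needs: detect
#     steps:
#       - if: needs.detect.outputs.x != 'true'
#         run: echo "no-op pass"   # emits the one SUCCESS check run
#       - if: needs.detect.outputs.x == 'true'
#         run: make real-work      # hypothetical target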
e2e-api:
needs: detect-changes
name: E2E API Smoke Test
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
timeout-minutes: 15
env:
# Unique per-run container names so concurrent runs on the host-
# network act_runner don't collide on name OR port.
# `${RUN_ID}-${RUN_ATTEMPT}` stays unique across reruns of the
# same run_id. PORT is set later (after docker port lookup) since
# we let Docker assign an ephemeral host port.
PG_CONTAINER: pg-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
REDIS_CONTAINER: redis-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
PORT: "8080"
steps:
- name: No-op pass (paths filter excluded this commit)
if: needs.detect-changes.outputs.api != 'true'
run: |
echo "No workspace-server / tests/e2e / workflow changes — E2E API gate satisfied without running tests."
echo "::notice::E2E API Smoke Test no-op pass (paths filter excluded this commit)."
- if: needs.detect-changes.outputs.api == 'true'
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- if: needs.detect-changes.outputs.api == 'true'
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
with:
go-version: 'stable'
cache: true
cache-dependency-path: workspace-server/go.sum
- name: Pre-pull alpine + ensure provisioner network (Issue #94 items #2 + #3)
if: needs.detect-changes.outputs.api == 'true'
run: |
# Provisioner uses alpine:latest for ephemeral token-write
# containers (workspace-server/internal/handlers/container_files.go).
# Pre-pull so the first provision in test_api.sh doesn't race
# the daemon's pull cache. Idempotent — `docker pull` is a no-op
# when the image is already present.
docker pull alpine:latest >/dev/null
# Provisioner attaches workspace containers to
# molecule-core-net (workspace-server/internal/provisioner/
# provisioner.go::DefaultNetwork). The bridge already exists on
# the operator host's docker daemon — `network create` is
# idempotent via `|| true`.
docker network create molecule-core-net >/dev/null 2>&1 || true
echo "alpine:latest pre-pulled; molecule-core-net ensured."
- name: Start Postgres (docker)
if: needs.detect-changes.outputs.api == 'true'
run: |
# Defensive cleanup — only matches THIS run's container name,
# so it cannot kill a sibling run's postgres. (Pre-fix the
# name was static and this rm hit other runs' containers.)
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
# `-p 0:5432` requests an ephemeral host port; we read it back
# below and export DATABASE_URL.
docker run -d --name "$PG_CONTAINER" \
-e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule \
-p 0:5432 postgres:16 >/dev/null
# Resolve the host-side port assignment. `docker port` prints
# `0.0.0.0:NNNN` (and on host-net runners may also print an
# IPv6 line — take the first IPv4 line).
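# Illustrative `docker port` output on a host-net runner (port
# number hypothetical; the awk takes the first IPv4 line):
#   0.0.0.0:49153
#   [::]:49153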
PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
if [ -z "$PG_PORT" ]; then
# Fallback: any first line. Some Docker versions print only
# one line.
PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | head -1 | awk -F: '{print $NF}')
fi
if [ -z "$PG_PORT" ]; then
echo "::error::Could not resolve host port for $PG_CONTAINER"
docker port "$PG_CONTAINER" 5432/tcp || true
docker logs "$PG_CONTAINER" || true
exit 1
fi
# 127.0.0.1 (NOT localhost) — IPv6 first-resolve flake (#92).
echo "PG_PORT=${PG_PORT}" >> "$GITHUB_ENV"
echo "DATABASE_URL=postgres://dev:dev@127.0.0.1:${PG_PORT}/molecule?sslmode=disable" >> "$GITHUB_ENV"
echo "Postgres host port: ${PG_PORT}"
for i in $(seq 1 30); do
if docker exec "$PG_CONTAINER" pg_isready -U dev >/dev/null 2>&1; then
echo "Postgres ready after ${i}s"
exit 0
fi
sleep 1
done
echo "::error::Postgres did not become ready in 30s"
docker logs "$PG_CONTAINER" || true
exit 1
- name: Start Redis (docker)
if: needs.detect-changes.outputs.api == 'true'
run: |
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
docker run -d --name "$REDIS_CONTAINER" -p 0:6379 redis:7 >/dev/null
REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
if [ -z "$REDIS_PORT" ]; then
REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | head -1 | awk -F: '{print $NF}')
fi
if [ -z "$REDIS_PORT" ]; then
echo "::error::Could not resolve host port for $REDIS_CONTAINER"
docker port "$REDIS_CONTAINER" 6379/tcp || true
docker logs "$REDIS_CONTAINER" || true
exit 1
fi
echo "REDIS_PORT=${REDIS_PORT}" >> "$GITHUB_ENV"
echo "REDIS_URL=redis://127.0.0.1:${REDIS_PORT}" >> "$GITHUB_ENV"
echo "Redis host port: ${REDIS_PORT}"
for i in $(seq 1 15); do
if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then
echo "Redis ready after ${i}s"
exit 0
fi
sleep 1
done
echo "::error::Redis did not become ready in 15s"
docker logs "$REDIS_CONTAINER" || true
exit 1
- name: Build platform
if: needs.detect-changes.outputs.api == 'true'
working-directory: workspace-server
run: go build -o platform-server ./cmd/server
- name: Start platform (background)
if: needs.detect-changes.outputs.api == 'true'
working-directory: workspace-server
run: |
# DATABASE_URL + REDIS_URL exported by the start-postgres /
# start-redis steps point at this run's per-run host ports.
./platform-server > platform.log 2>&1 &
echo $! > platform.pid
- name: Wait for /health
if: needs.detect-changes.outputs.api == 'true'
run: |
for i in $(seq 1 30); do
if curl -sf http://127.0.0.1:8080/health > /dev/null; then
echo "Platform up after ${i}s"
exit 0
fi
sleep 1
done
echo "::error::Platform did not become healthy in 30s"
cat workspace-server/platform.log || true
exit 1
- name: Assert migrations applied
if: needs.detect-changes.outputs.api == 'true'
run: |
tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc "SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'")
if [ "$tables" != "1" ]; then
echo "::error::Migrations did not apply"
cat workspace-server/platform.log || true
exit 1
fi
echo "Migrations OK"
- name: Run E2E API tests
if: needs.detect-changes.outputs.api == 'true'
run: bash tests/e2e/test_api.sh
- name: Run notify-with-attachments E2E
if: needs.detect-changes.outputs.api == 'true'
run: bash tests/e2e/test_notify_attachments_e2e.sh
- name: Run priority-runtimes E2E (claude-code + hermes — skips when keys absent)
if: needs.detect-changes.outputs.api == 'true'
run: bash tests/e2e/test_priority_runtimes_e2e.sh
- name: Run poll-mode + since_id cursor E2E (#2339)
if: needs.detect-changes.outputs.api == 'true'
run: bash tests/e2e/test_poll_mode_e2e.sh
- name: Run poll-mode chat upload E2E (RFC #2891)
if: needs.detect-changes.outputs.api == 'true'
run: bash tests/e2e/test_poll_mode_chat_upload_e2e.sh
- name: Dump platform log on failure
if: failure() && needs.detect-changes.outputs.api == 'true'
run: cat workspace-server/platform.log || true
- name: Stop platform
if: always() && needs.detect-changes.outputs.api == 'true'
run: |
if [ -f workspace-server/platform.pid ]; then
kill "$(cat workspace-server/platform.pid)" 2>/dev/null || true
fi
- name: Stop service containers
# always() so containers don't leak when test steps fail. The
# cleanup is best-effort: if the container is already gone
# (e.g. concurrent rerun race), don't fail the job.
if: always() && needs.detect-changes.outputs.api == 'true'
run: |
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true

View File

@ -1,250 +0,0 @@
name: E2E Staging Canvas (Playwright)
# Ported from .github/workflows/e2e-staging-canvas.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
#
# Playwright test suite that provisions a fresh staging org per run and
# verifies every workspace-panel tab renders without crashing. Complements
# e2e-staging-saas.yml (which tests the API shape) by exercising the
# actual browser + canvas bundle against live staging.
#
# Triggers: push or PR to main touching canvas sources + this workflow,
# plus a weekly cron to catch browser/runtime drift even when canvas is
# quiet.
# Historical note: staging used to be in the push/pull_request branch
# list so the auto-promote gate check (--event push --branch staging)
# could see a completed run for this workflow, mirroring what PR #1891
# did for e2e-api.yml. The list is main-only now that staging is gone
# (trunk-based, internal#81 Phase 3).
on:
# Trigger model (revised 2026-04-29):
#
# Always fires on push/pull_request; real work is gated per-step on
# `needs.detect-changes.outputs.canvas`. When canvas/ paths haven't
# changed, the no-op step alone runs and emits SUCCESS for the
# `Canvas tabs E2E` check, satisfying branch protection without
# spending CI cycles. See e2e-api.yml for the rationale on why this
# is a single job rather than two-jobs-sharing-name.
push:
branches: [main]
pull_request:
branches: [main]
schedule:
# Weekly on Sunday 08:00 UTC — catches Chrome / Playwright / Next.js
# release-note-shaped regressions that don't ride in with a PR.
- cron: '0 8 * * 0'
concurrency:
# Per-SHA grouping (changed 2026-04-28 from a single global group). The
# global group made auto-promote-staging brittle: when a staging push
# queued behind an in-flight run and a third entrant (a PR run, a
# follow-on push) entered the group, the staging push got cancelled —
# leaving auto-promote-staging looking at `completed/cancelled` for a
# required gate and refusing to advance main. Observed 2026-04-28
# 23:51-23:53 on staging tip 3f99fede.
#
# The original intent of the global group was to throttle parallel
# E2E provisions (each spins a fresh EC2). At our scale that throttle
# isn't worth the correctness cost — fresh-org-per-run isolates the
# state, and the cost of two parallel runs (~$0.001/min × 10min × 2)
# is rounding error vs. the cost of a stuck pipeline.
#
# Per-SHA still dedupes accidental double-triggers for the SAME SHA.
# It does NOT cancel obsolete-PR-version runs on force-push; that
# wasted CI is acceptable given the alternative is losing staging-tip
# data that auto-promote-staging needs.
group: e2e-staging-canvas-${{ github.event.pull_request.head.sha || github.sha }}
cancel-in-progress: false
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
detect-changes:
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
outputs:
canvas: ${{ steps.decide.outputs.canvas }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
- id: decide
# Inline replacement for dorny/paths-filter — see e2e-api.yml.
# Cron triggers always run real work (no diff context).
run: |
if [ "${{ github.event_name }}" = "schedule" ]; then
echo "canvas=true" >> "$GITHUB_OUTPUT"
exit 0
fi
BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
BASE="${{ github.event.pull_request.base.sha }}"
fi
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
echo "canvas=true" >> "$GITHUB_OUTPUT"
exit 0
fi
if ! git cat-file -e "$BASE" 2>/dev/null; then
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
fi
if ! git cat-file -e "$BASE" 2>/dev/null; then
echo "canvas=true" >> "$GITHUB_OUTPUT"
exit 0
fi
CHANGED=$(git diff --name-only "$BASE" HEAD)
if echo "$CHANGED" | grep -qE '^(canvas/|\.gitea/workflows/e2e-staging-canvas\.yml$)'; then
echo "canvas=true" >> "$GITHUB_OUTPUT"
else
echo "canvas=false" >> "$GITHUB_OUTPUT"
fi
# ONE job (no job-level `if:`) that always runs and reports under the
# required-check name `Canvas tabs E2E`. Real work is gated per-step on
# `needs.detect-changes.outputs.canvas`. See e2e-api.yml for the full
# rationale — same path-filter check-name parity issue blocked PR #2264
# (staging→main) on 2026-04-29 because branch protection treats matching-
# name check runs as a SET, and any SKIPPED member fails the eval.
playwright:
needs: detect-changes
name: Canvas tabs E2E
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
timeout-minutes: 40
env:
CANVAS_E2E_STAGING: '1'
MOLECULE_CP_URL: https://staging-api.moleculesai.app
# 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
# internal#322 — see this PR for the cross-workflow sweep.
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
defaults:
run:
working-directory: canvas
steps:
- name: No-op pass (paths filter excluded this commit)
if: needs.detect-changes.outputs.canvas != 'true'
working-directory: .
run: |
echo "No canvas / workflow changes — E2E Staging Canvas gate satisfied without running tests."
echo "::notice::E2E Staging Canvas no-op pass (paths filter excluded this commit)."
- if: needs.detect-changes.outputs.canvas == 'true'
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify admin token present
if: needs.detect-changes.outputs.canvas == 'true'
run: |
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
echo "::error::Missing CP_STAGING_ADMIN_API_TOKEN"
exit 2
fi
- name: Set up Node
if: needs.detect-changes.outputs.canvas == 'true'
uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
with:
node-version: '20'
cache: 'npm'
cache-dependency-path: canvas/package-lock.json
- name: Install canvas deps
if: needs.detect-changes.outputs.canvas == 'true'
run: npm ci
- name: Install Playwright browsers
if: needs.detect-changes.outputs.canvas == 'true'
run: npx playwright install --with-deps chromium
- name: Run staging canvas E2E
if: needs.detect-changes.outputs.canvas == 'true'
run: npx playwright test --config=playwright.staging.config.ts
- name: Upload Playwright report on failure
if: failure() && needs.detect-changes.outputs.canvas == 'true'
# Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
# the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
# implement (see ci.yml upload step for the canonical error
# cite). Drop this pin when Gitea ships the v4 protocol.
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
with:
name: playwright-report-staging
path: canvas/playwright-report-staging/
retention-days: 14
- name: Upload screenshots on failure
if: failure() && needs.detect-changes.outputs.canvas == 'true'
# Pinned to v3 for Gitea act_runner v0.6 compatibility (see above).
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
with:
name: playwright-screenshots
path: canvas/test-results/
retention-days: 14
# Safety-net teardown — fires only when Playwright's globalTeardown
# didn't (worker crash, runner cancel). Reads the slug from
# canvas/.playwright-staging-state.json (written by staging-setup
# as its first action, before any CP call) and deletes only that
# slug.
#
# Earlier versions of this step pattern-swept `e2e-canvas-<today>-*`
# orgs to compensate for setup-crash-before-state-file-write. That
# over-aggressive cleanup raced concurrent canvas-E2E runs and
# poisoned each other's tenants — observed 2026-04-30 when three
# real-test runs killed each other mid-test, surfacing as
# `getaddrinfo ENOTFOUND` once CP had cleaned up the just-deleted
# DNS record. Pattern-sweep removed; setup now writes the state
# file before any CP work, so the slug is always recoverable.
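# Illustrative state-file shape (only the `slug` field is read by the
# python below; any other fields are ignored; value hypothetical):
#   {"slug": "e2e-canvas-20260430-abc123"}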
- name: Teardown safety net
if: always() && needs.detect-changes.outputs.canvas == 'true'
env:
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
run: |
set +e
STATE_FILE=".playwright-staging-state.json"
if [ ! -f "$STATE_FILE" ]; then
echo "::notice::No state file at canvas/$STATE_FILE — Playwright globalTeardown handled it (or setup never ran)."
exit 0
fi
slug=$(python3 -c "import json; print(json.load(open('$STATE_FILE')).get('slug',''))")
if [ -z "$slug" ]; then
echo "::warning::State file present but slug missing; nothing to clean up."
exit 0
fi
echo "Deleting orphan tenant: $slug"
# Verify HTTP 2xx instead of `>/dev/null || true` swallowing
# failures. A 5xx or timeout previously looked identical to
# success, leaving the tenant alive for up to ~45 min until
# sweep-stale-e2e-orgs caught it. Surface failures as
# workflow warnings naming the slug. Don't `exit 1` — a single
# cleanup miss shouldn't fail-flag the canvas test when the
# actual smoke check passed; the sweeper is the safety net.
# See molecule-controlplane#420.
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
# pollution of the captured status (lint-curl-status-capture.yml).
set +e
curl -sS -o /tmp/canvas-cleanup.out -w "%{http_code}" \
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
-H "Content-Type: application/json" \
-d "{\"confirm\":\"$slug\"}" >/tmp/canvas-cleanup.code
set -e
code=$(cat /tmp/canvas-cleanup.code 2>/dev/null || echo "000")
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
echo "[teardown] deleted $slug (HTTP $code)"
else
echo "::warning::canvas teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/canvas-cleanup.out 2>/dev/null)"
fi
exit 0

View File

@ -1,192 +0,0 @@
name: E2E Staging External Runtime
# Ported from .github/workflows/e2e-staging-external.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
#
# Regression test for the four/five workspaces.status=awaiting_agent transitions
# that silently failed in production for five days before migration 046
# extended the workspace_status enum (see
# workspace-server/migrations/046_workspace_status_awaiting_agent.up.sql).
#
# Why this is its own workflow (not folded into e2e-staging-saas.yml):
# - The full-saas harness defaults to runtime=hermes, never exercises
# external-runtime. Adding an `external` parameter to that script
# would force every push to staging through both lifecycles in
# series, doubling the EC2 cold-start budget.
# - The external lifecycle has unique timing (REMOTE_LIVENESS_STALE_AFTER
# window, 90s default + sweep interval), which we wait through
# deliberately. Folding it into hermes would make the long path
# even longer.
# - It can run in parallel with the hermes E2E since both create
# fresh tenant orgs with distinct slug prefixes (`e2e-ext-...` vs
# `e2e-...`).
#
# Triggers:
# - Push to main when any source affecting external runtime,
# hibernation, or the migration set changes.
# - Pull requests touching the same path set.
# - Daily cron at 07:30 UTC (catches drift on quiet days; staggered
# 30 min after e2e-staging-saas.yml's 07:00 UTC cron).
# (No workflow_dispatch in this port's `on:` block; the
# github.event.inputs references below are kept wired but inert.)
#
# Concurrency: serialized so two staging pushes don't fight for the
# same EC2 quota window. cancel-in-progress=false so a half-rolled
# tenant always finishes its teardown.
on:
push:
branches: [main]
paths:
- 'workspace-server/internal/handlers/workspace.go'
- 'workspace-server/internal/handlers/registry.go'
- 'workspace-server/internal/handlers/workspace_restart.go'
- 'workspace-server/internal/registry/healthsweep.go'
- 'workspace-server/internal/registry/liveness.go'
- 'workspace-server/migrations/**'
- 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
- 'tests/e2e/test_staging_external_runtime.sh'
- '.gitea/workflows/e2e-staging-external.yml'
pull_request:
branches: [main]
paths:
- 'workspace-server/internal/handlers/workspace.go'
- 'workspace-server/internal/handlers/registry.go'
- 'workspace-server/internal/handlers/workspace_restart.go'
- 'workspace-server/internal/registry/healthsweep.go'
- 'workspace-server/internal/registry/liveness.go'
- 'workspace-server/migrations/**'
- 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
- 'tests/e2e/test_staging_external_runtime.sh'
- '.gitea/workflows/e2e-staging-external.yml'
schedule:
- cron: '30 7 * * *'
concurrency:
group: e2e-staging-external
cancel-in-progress: false
permissions:
contents: read
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
e2e-staging-external:
name: E2E Staging External Runtime
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
timeout-minutes: 25
env:
MOLECULE_CP_URL: https://staging-api.moleculesai.app
# 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
# internal#322 — see this PR for the cross-workflow sweep.
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
E2E_STALE_WAIT_SECS: ${{ github.event.inputs.stale_wait_secs || '180' }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify admin token present
run: |
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
# Schedule + push triggers must hard-fail when the token is
# missing — silent skip would mask infra rot. Manual dispatch
# gets the same hard-fail; an operator running this on a fork
# without secrets configured needs to know up-front.
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
exit 2
fi
echo "Admin token present ✓"
- name: CP staging health preflight
run: |
code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
if [ "$code" != "200" ]; then
echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
exit 1
fi
echo "Staging CP healthy ✓"
- name: Run external-runtime E2E
id: e2e
run: bash tests/e2e/test_staging_external_runtime.sh
# Mirror the e2e-staging-saas.yml safety net: if the runner is
# cancelled (e.g. concurrent staging push), the test script's
# EXIT trap may not fire, so we sweep e2e-ext-* slugs scoped to
# *this* run id.
- name: Teardown safety net (runs on cancel/failure)
if: always()
env:
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
run: |
set +e
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
| python3 -c "
import json, sys, os, datetime
run_id = os.environ.get('GITHUB_RUN_ID', '')
d = json.load(sys.stdin)
# Scope STRICTLY to this run id (e2e-ext-YYYYMMDD-<runid>-...)
# so concurrent runs and unrelated dev probes are not touched.
# Sweep today AND yesterday so a midnight-crossing run still
# cleans up its own slug.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
if not run_id:
# Without a run id we cannot scope safely; bail rather
# than risk deleting unrelated tenants.
sys.exit(0)
prefixes = tuple(f'e2e-ext-{d}-{run_id}-' for d in dates)
for o in d.get('orgs', []):
s = o.get('slug', '')
if s.startswith(prefixes) and o.get('status') != 'purged':
print(s)
" 2>/dev/null)
if [ -n "$orgs" ]; then
echo "Safety-net sweep: deleting leftover orgs:"
echo "$orgs"
# Per-slug verified DELETE — see molecule-controlplane#420.
# `>/dev/null 2>&1` previously hid every failure; surface
# non-2xx as workflow warnings so the run page names what
# leaked. Sweeper catches the rest within ~45 min.
leaks=()
for slug in $orgs; do
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
# pollution of the captured status (lint-curl-status-capture.yml).
set +e
curl -sS -o /tmp/external-cleanup.out -w "%{http_code}" \
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
-H "Content-Type: application/json" \
-d "{\"confirm\":\"$slug\"}" >/tmp/external-cleanup.code
set -e
code=$(cat /tmp/external-cleanup.code 2>/dev/null || echo "000")
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
echo "[teardown] deleted $slug (HTTP $code)"
else
echo "::warning::external teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/external-cleanup.out 2>/dev/null)"
leaks+=("$slug")
fi
done
if [ ${#leaks[@]} -gt 0 ]; then
echo "::warning::external teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
fi
else
echo "Safety-net sweep: no leftover orgs to clean."
fi

View File

@ -1,254 +0,0 @@
name: E2E Staging SaaS (full lifecycle)
# Ported from .github/workflows/e2e-staging-saas.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
#
# Dedicated workflow that provisions a fresh staging org per run, exercises
# the full workspace lifecycle (register → heartbeat → A2A → delegation →
# HMA memory → activity → peers), then tears down and asserts leak-free.
#
# Why a separate workflow (not folded into ci.yml):
# - The run takes ~25-35 min (EC2 boot + cloudflared DNS + provision sweeps +
# agent bootstrap), way too slow for every PR.
# - Needs its own concurrency group so two pushes don't fight over the
# same staging org slug prefix.
# - Has its own required secrets (session cookie, admin token) that most
# PRs don't need to read.
#
# Triggers:
# - Push to main (regression guard)
# - Daily 07:00 UTC cron (catches drift even when no pushes land)
# - Changes to any provisioning-critical file under PR review (opt-in
# via the same paths watcher that e2e-api.yml uses)
# (No workflow_dispatch in this port's `on:` block; the
# github.event.inputs references below are kept wired but inert.)
on:
# Trunk-based (Phase 3 of internal#81): main is the only branch.
# Previously this fired on staging push too because staging was a
# superset of main and ran the gate ahead of auto-promote; with no
# staging branch, main is where E2E gates the deploy.
push:
branches: [main]
paths:
- 'workspace-server/internal/handlers/registry.go'
- 'workspace-server/internal/handlers/workspace_provision.go'
- 'workspace-server/internal/handlers/a2a_proxy.go'
- 'workspace-server/internal/middleware/**'
- 'workspace-server/internal/provisioner/**'
- 'tests/e2e/test_staging_full_saas.sh'
- '.gitea/workflows/e2e-staging-saas.yml'
pull_request:
branches: [main]
paths:
- 'workspace-server/internal/handlers/registry.go'
- 'workspace-server/internal/handlers/workspace_provision.go'
- 'workspace-server/internal/handlers/a2a_proxy.go'
- 'workspace-server/internal/middleware/**'
- 'workspace-server/internal/provisioner/**'
- 'tests/e2e/test_staging_full_saas.sh'
- '.gitea/workflows/e2e-staging-saas.yml'
schedule:
# 07:00 UTC every day — catches AMI drift, WorkOS cert rotation,
# Cloudflare API regressions, etc. even on quiet days.
- cron: '0 7 * * *'
# Serialize: staging has a finite per-hour org creation quota. Two pushes
# landing in quick succession should queue, not race. `cancel-in-progress:
# false` mirrors e2e-api.yml — GitHub would otherwise cancel the running
# teardown step and leave orphan EC2s.
concurrency:
group: e2e-staging-saas
cancel-in-progress: false
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
e2e-staging-saas:
name: E2E Staging SaaS
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
timeout-minutes: 45
permissions:
contents: read
env:
MOLECULE_CP_URL: https://staging-api.moleculesai.app
# Single admin-bearer secret drives provision + tenant-token
# retrieval + teardown. Configure in
# Settings → Secrets and variables → Actions → Repository secrets.
# 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
# internal#322 — see this PR for the cross-workflow sweep.
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
# MiniMax is the PRIMARY LLM auth path post-2026-05-04. Switched
# from hermes+OpenAI default after #2578 (the staging OpenAI key
# account went over quota and stayed dead for 36+ hours, taking
# the full-lifecycle E2E red on every provisioning-critical push).
# claude-code template's `minimax` provider routes
# ANTHROPIC_BASE_URL to api.minimax.io/anthropic and reads
# MINIMAX_API_KEY at boot — separate billing account so an
# OpenAI quota collapse no longer wedges the gate. Mirrors the
# staging-smoke.yml + continuous-synth-e2e.yml migrations.
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
# Direct-Anthropic alternative for operators who don't want to
# set up a MiniMax account (priority below MiniMax — first
# non-empty wins in test_staging_full_saas.sh's secrets-injection
# block). See #2578 PR comment for the rationale.
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
# OpenAI fallback — kept wired so an operator-dispatched run with
# E2E_RUNTIME=hermes or =langgraph via workflow_dispatch can still
# exercise the OpenAI path.
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }}
# Pin the model when running on the default claude-code path —
# the per-runtime default ("sonnet") routes to direct Anthropic
# and defeats the cost saving. Operators can override via the
# workflow_dispatch flow (no input wired here yet — runtime
# override is enough for ad-hoc).
E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'langgraph' && 'openai:gpt-4o' || 'MiniMax-M2.7-highspeed' }}
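# Precedence note: `&&` binds tighter than `||` in GitHub-expression
# syntax, so the chain above reads as if/elif/else: hermes picks
# openai/gpt-4o, langgraph picks openai:gpt-4o, anything else falls
# through to MiniMax-M2.7-highspeed.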
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify admin token present
run: |
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
exit 2
fi
echo "Admin token present ✓"
- name: Verify LLM key present
run: |
# Per-runtime key check — claude-code uses MiniMax; hermes /
# langgraph (operator-dispatched only) use OpenAI. Hard-fail
# rather than soft-skip per #2578's lesson — empty key
# silently falls through to the wrong SECRETS_JSON branch and
# produces a confusing auth error 5 min later instead of the
# clean "secret missing" message at the top.
case "${E2E_RUNTIME}" in
claude-code)
# Either MiniMax OR direct-Anthropic works — first
# non-empty wins in the test script's secrets-injection
# priority chain.
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
required_secret_value="${E2E_MINIMAX_API_KEY}"
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
required_secret_name="MOLECULE_STAGING_ANTHROPIC_API_KEY"
required_secret_value="${E2E_ANTHROPIC_API_KEY}"
else
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY or MOLECULE_STAGING_ANTHROPIC_API_KEY"
required_secret_value=""
fi
;;
langgraph|hermes)
required_secret_name="MOLECULE_STAGING_OPENAI_API_KEY"
required_secret_value="${E2E_OPENAI_API_KEY:-}"
;;
*)
echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
required_secret_name=""
required_secret_value="present"
;;
esac
if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — workspaces will fail at boot with 'No provider API key found'"
exit 2
fi
echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"
- name: CP staging health preflight
run: |
code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
if [ "$code" != "200" ]; then
echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
exit 1
fi
echo "Staging CP healthy ✓"
- name: Run full-lifecycle E2E
id: e2e
run: bash tests/e2e/test_staging_full_saas.sh
# Belt-and-braces teardown: the test script itself installs a trap
# for EXIT/INT/TERM, but if the GH runner itself is cancelled (e.g.
# someone pushes a new commit and workflow concurrency is set to
# cancel), the trap may not fire. This `always()` step runs even on
# cancellation and attempts the delete a second time. The admin
# DELETE endpoint is idempotent so double-invoking is safe.
- name: Teardown safety net (runs on cancel/failure)
if: always()
env:
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
run: |
# Best-effort: find any e2e-YYYYMMDD-* orgs matching this run and
# nuke them. Catches the case where the script died before
# exporting its slug.
set +e
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
| python3 -c "
import json, sys, os, datetime
run_id = os.environ.get('GITHUB_RUN_ID', '')
d = json.load(sys.stdin)
# ONLY sweep slugs from *this* CI run. Previously the filter was
# f'e2e-{today}-' which stomped on parallel CI runs AND any manual
# E2E probes a dev was running against staging (incident 2026-04-21
# 15:02Z: this workflow's safety net deleted an unrelated manual
# run's tenant 1s after it hit 'running').
# Sweep both today AND yesterday's UTC dates so a run that crosses
# midnight still matches its own slug — see the 2026-04-26→27
# canvas-safety-net incident for the same bug class.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
if run_id:
prefixes = tuple(f'e2e-{d}-{run_id}-' for d in dates)
else:
prefixes = tuple(f'e2e-{d}-' for d in dates)
candidates = [o['slug'] for o in d.get('orgs', [])
if any(o.get('slug','').startswith(p) for p in prefixes)
and o.get('instance_status') not in ('purged',)]
print('\n'.join(candidates))
" 2>/dev/null)
# Per-slug verified DELETE (was `>/dev/null || true` — see
# molecule-controlplane#420). Surface non-2xx as a workflow
# warning naming the leaked slug; don't exit 1 (sweeper is
# the safety net within ~45 min).
leaks=()
for slug in $orgs; do
echo "Safety-net teardown: $slug"
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
# pollution of the captured status (lint-curl-status-capture.yml).
set +e
curl -sS -o /tmp/saas-cleanup.out -w "%{http_code}" \
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
-H "Content-Type: application/json" \
-d "{\"confirm\":\"$slug\"}" >/tmp/saas-cleanup.code
set -e
code=$(cat /tmp/saas-cleanup.code 2>/dev/null || echo "000")
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
echo "[teardown] deleted $slug (HTTP $code)"
else
echo "::warning::saas teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/saas-cleanup.out 2>/dev/null)"
leaks+=("$slug")
fi
done
if [ ${#leaks[@]} -gt 0 ]; then
echo "::warning::saas teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
fi
exit 0

View File

@ -1,166 +0,0 @@
name: E2E Staging Sanity (leak-detection self-check)
# Ported from .github/workflows/e2e-staging-sanity.yml on 2026-05-11 per
# RFC internal#219 §1 sweep.
#
# Differences from the GitHub version:
# - Dropped `workflow_dispatch:` (Gitea 1.22.6 finicky on bare dispatch).
# - `actions/github-script@v9` issue-open block replaced with curl
# calls to the Gitea REST API (/api/v1/repos/.../issues|comments).
# - Workflow-level env.GITHUB_SERVER_URL set.
# - `continue-on-error: true` on the job (RFC §1 contract).
#
# Periodic assertion that the teardown safety nets in e2e-staging-saas
# and staging-smoke (formerly canary-staging) actually work. Runs the
# E2E harness with E2E_INTENTIONAL_FAILURE=1, which poisons the tenant
# admin token after the org is provisioned. The workspace-provision
# step then fails, the script exits non-zero, and the EXIT trap +
# workflow always()-step must still tear down cleanly.
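#
# Exit-code contract asserted by the "Run harness" step below (a
# summary of that step's branches, not an independent spec):
#   rc=1   clean intentional failure; trap ran, leak-check passed (PASS)
#   rc=0   harness succeeded despite poisoning; poisoning path broken (FAIL)
#   rc=4   leak detected; teardown left the org behind (FAIL, loudest)
#   other  unexpected; investigate the harness (FAIL)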
on:
schedule:
- cron: '0 6 * * 1'
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
concurrency:
group: e2e-staging-sanity
cancel-in-progress: false
permissions:
issues: write
contents: read
jobs:
sanity:
name: Intentional-failure teardown sanity
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
timeout-minutes: 20
env:
MOLECULE_CP_URL: https://staging-api.moleculesai.app
# 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
# internal#322 — see this PR for the cross-workflow sweep.
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
E2E_MODE: smoke
E2E_RUNTIME: hermes
E2E_RUN_ID: "sanity-${{ github.run_id }}"
E2E_INTENTIONAL_FAILURE: "1"
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify admin token present
run: |
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
exit 2
fi
# Inverted assertion: the run MUST fail. If it passes, the
# E2E_INTENTIONAL_FAILURE path is broken.
- name: Run harness — expecting exit !=0
id: harness
run: |
set +e
bash tests/e2e/test_staging_full_saas.sh
rc=$?
echo "harness_rc=$rc" >> "$GITHUB_OUTPUT"
if [ "$rc" = "1" ]; then
echo "OK Harness failed as expected (rc=1); teardown trap ran, leak-check passed"
exit 0
elif [ "$rc" = "0" ]; then
echo "::error::Harness succeeded under E2E_INTENTIONAL_FAILURE=1 — the poisoning path is broken"
exit 1
elif [ "$rc" = "4" ]; then
echo "::error::LEAK DETECTED (rc=4) — teardown failed to clean up the org. Safety net broken."
exit 4
else
echo "::error::Unexpected rc=$rc — neither clean-failure nor leak. Investigate harness."
exit 1
fi
- name: Open issue if safety net is broken (Gitea API)
if: failure()
env:
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
RUN_ID: ${{ github.run_id }}
run: |
set -euo pipefail
API="${SERVER_URL%/}/api/v1"
TITLE="E2E teardown safety net broken"
RUN_URL="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
BODY_JSON=$(jq -nc --arg t "$TITLE" --arg run "$RUN_URL" '
{title: $t,
body: ("The weekly sanity run (E2E_INTENTIONAL_FAILURE=1) did not exit as expected. This means one of:\n - poisoning did not actually cause failure (test harness regression), OR\n - teardown left an orphan org (leak detection caught a real bug)\n\nRun: " + $run + "\n\nThis is higher priority than a canary failure — the whole E2E safety net cannot be trusted until this is resolved.")}')
EXISTING=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
"${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
| jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number' | head -1)
if [ -n "$EXISTING" ]; then
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
"${API}/repos/${REPO}/issues/${EXISTING}/comments" \
-d "$(jq -nc --arg run "$RUN_URL" '{body: ("Still broken. " + $run)}')" >/dev/null
echo "Commented on existing issue #${EXISTING}"
else
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
"${API}/repos/${REPO}/issues" -d "$BODY_JSON" >/dev/null
echo "Filed new issue"
fi
# Belt-and-braces: if teardown left anything behind, nuke it here
# so we don't bleed staging quota.
- name: Teardown safety net
if: always()
env:
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
run: |
set +e
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
| python3 -c "
import json, sys, datetime
d = json.load(sys.stdin)
today = datetime.date.today().strftime('%Y%m%d')
# Match both the new e2e-smoke- prefix (post-2026-05-11 rename)
# and the legacy e2e-canary- prefix for one rollout cycle so
# any in-flight org provisioned under the old prefix on an
# older runner checkout still gets cleaned up. Remove the
# canary fallback after one week of no-old-prefix observations.
prefixes = (f'e2e-smoke-{today}-sanity-', f'e2e-canary-{today}-sanity-')
candidates = [o['slug'] for o in d.get('orgs', [])
if any(o.get('slug','').startswith(p) for p in prefixes)
and o.get('status') not in ('purged',)]
print('\n'.join(candidates))
" 2>/dev/null)
leaks=()
for slug in $orgs; do
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
# pollution of the captured status (lint-curl-status-capture.yml).
set +e
curl -sS -o /tmp/sanity-cleanup.out -w "%{http_code}" \
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
-H "Content-Type: application/json" \
-d "{\"confirm\":\"$slug\"}" >/tmp/sanity-cleanup.code
set -e
code=$(cat /tmp/sanity-cleanup.code 2>/dev/null || echo "000")
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
echo "[teardown] deleted $slug (HTTP $code)"
else
echo "::warning::sanity teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/sanity-cleanup.out 2>/dev/null)"
leaks+=("$slug")
fi
done
if [ ${#leaks[@]} -gt 0 ]; then
echo "::warning::sanity teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
fi
exit 0

View File

@ -1,91 +0,0 @@
# gate-check-v3 — automated PR gate detector
#
# Runs on every open PR (push/synchronize) and hourly via cron.
# Posts a structured [gate-check-v3] STATUS: comment on the PR.
#
# Inputs:
# PR_NUMBER — set via ${{ github.event.pull_request.number }} from the trigger
# POST_COMMENT — "true" to post/update comment on PR
#
# Gating logic (MVP signals 1,2,3,6):
# 1. Author-aware agent-tag comment scan
# 2. REQUEST_CHANGES reviews state machine
# 3. Staleness detection (SOP-12: review.commit_id != PR.head_sha + >1 working day)
# 6. CI required-checks awareness
#
# Exit code: 0=CLEAR, 1=BLOCKED, 2=ERROR
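#
# Local usage sketch (assumes only the CLI surface this workflow itself
# exercises below; flags and the script path are from this file, the
# token is any Gitea PAT with repo read + comment scope):
#   GITEA_TOKEN=<pat> python3 tools/gate-check-v3/gate_check.py \
#     --repo <owner>/<repo> --pr 123 --post-comment; echo "verdict=$?"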
name: gate-check-v3
on:
pull_request_target:
types: [opened, edited, synchronize, reopened]
schedule:
# Hourly: refresh all open PRs
- cron: '8 * * * *'
workflow_dispatch:
inputs:
pr_number:
description: 'PR number to check (omit for all open PRs)'
required: false
type: string
post_comment:
description: 'Post comment on PR'
required: false
type: string
default: 'true'
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
gate-check:
runs-on: ubuntu-latest
continue-on-error: true # Never block on our own detector failing
steps:
- name: Check out base branch (for the script)
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
ref: ${{ github.event.pull_request.base.sha || github.ref_name }}
- name: Run gate-check-v3 (single PR mode)
if: github.event_name == 'pull_request_target' || github.event.inputs.pr_number != ''
env:
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number || github.event.inputs.pr_number }}
POST_COMMENT: ${{ github.event.inputs.post_comment || 'true' }}
run: |
set -euo pipefail
# Capture the gate verdict (0=CLEAR, 1=BLOCKED, 2=ERROR). Under
# `set -e` an unguarded non-zero exit aborts the script, so a trailing
# `echo "verdict=$?"` would only ever record 0; route the rc
# explicitly and re-exit with it so BLOCKED/ERROR still fails the step.
rc=0
python3 tools/gate-check-v3/gate_check.py \
--repo "${{ github.repository }}" \
--pr "$PR_NUMBER" \
$([ "$POST_COMMENT" = "true" ] && echo "--post-comment") || rc=$?
echo "verdict=$rc" >> "$GITHUB_OUTPUT"
exit "$rc"
- name: Run gate-check-v3 (all open PRs — cron mode)
if: github.event_name == 'schedule'
env:
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
run: |
set -euo pipefail
# Fetch all open PRs and run gate-check on each
pr_numbers=$(python3 -c "
import urllib.request, json, os
token = os.environ['GITEA_TOKEN']
req = urllib.request.Request(
'https://git.moleculesai.app/api/v1/repos/${{ github.repository }}/pulls?state=open&limit=100',
headers={'Authorization': f'token {token}', 'Accept': 'application/json'}
)
with urllib.request.urlopen(req) as r:
prs = json.loads(r.read())
for pr in prs:
print(pr['number'])
")
for pr in $pr_numbers; do
echo "Checking PR #$pr..."
python3 tools/gate-check-v3/gate_check.py \
--repo "${{ github.repository }}" \
--pr "$pr" \
--post-comment \
|| true
done

View File

@ -1,282 +0,0 @@
name: Handlers Postgres Integration
# Ported from .github/workflows/handlers-postgres-integration.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
#
# Real-Postgres integration tests for workspace-server/internal/handlers/.
# Triggered on every PR/push that touches the handlers package.
#
# Why this workflow exists
# ------------------------
# Strict-sqlmock unit tests pin which SQL statements fire — they're fast
# and let us iterate without a DB. But sqlmock CANNOT detect bugs that
# depend on the row state AFTER the SQL runs. The result_preview-lost
# bug shipped to staging in PR #2854 because every unit test was
# satisfied with "an UPDATE statement fired" — none verified the row's
# preview field actually landed. The local-postgres E2E that retrofit
# self-review caught it took 2 minutes to set up and would have caught
# the bug at PR-time.
#
# Why this workflow does NOT use `services: postgres:` (Class B fix)
# ------------------------------------------------------------------
# Our act_runner config has `container.network: host` (operator host
# /opt/molecule/runners/config.yaml), which act_runner applies to BOTH
# the job container AND every service container. With host-net, two
# concurrent runs of this workflow both try to bind 0.0.0.0:5432 — the
# second postgres FATALs with `could not create any TCP/IP sockets:
# Address in use`, and Docker auto-removes it (act_runner sets
# AutoRemove:true on service containers). By the time the migrations
# step runs `psql`, the postgres container is gone, hence
# `Connection refused` then `failed to remove container: No such
# container` at cleanup time.
#
# Per-job `container.network` override is silently ignored by
# act_runner — `--network and --net in the options will be ignored.`
# appears in the runner log. Documented constraint.
#
# So we sidestep `services:` entirely. The job container still uses
# host-net (inherited from runner config; required for cache server
# discovery on the bridge IP 172.18.0.17:42631). We launch a sibling
# postgres on the existing `molecule-core-net` bridge with a
# UNIQUE name per run — `pg-handlers-${RUN_ID}-${RUN_ATTEMPT}` — and
# read its bridge IP via `docker inspect`. A host-net job container
# can reach a bridge-net container directly via the bridge IP (verified
# manually on operator host 2026-05-08).
#
# Trade-offs vs. the original `services:` shape:
# + No host-port collision; N parallel runs share the bridge cleanly
# + `if: always()` cleanup runs even on test-step failure
# - One more step in the workflow (+~3 lines)
# - Requires `molecule-core-net` to exist on the operator host
# (it does; declared in docker-compose.yml + docker-compose.infra.yml)
#
# Class B Hongming-owned CICD red sweep, 2026-05-08.
#
# Cost: ~30s job (postgres pull from cache + go build + 4 tests).
on:
push:
branches: [main, staging]
pull_request:
branches: [main, staging]
concurrency:
group: handlers-pg-integ-${{ github.event.pull_request.head.sha || github.sha }}
cancel-in-progress: false
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
detect-changes:
name: detect-changes
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
outputs:
handlers: ${{ steps.filter.outputs.handlers }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
- id: filter
# Inline replacement for dorny/paths-filter — see e2e-api.yml.
run: |
BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
BASE="${{ github.event.pull_request.base.sha }}"
fi
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
echo "handlers=true" >> "$GITHUB_OUTPUT"
exit 0
fi
if ! git cat-file -e "$BASE" 2>/dev/null; then
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
fi
if ! git cat-file -e "$BASE" 2>/dev/null; then
echo "handlers=true" >> "$GITHUB_OUTPUT"
exit 0
fi
CHANGED=$(git diff --name-only "$BASE" HEAD)
if echo "$CHANGED" | grep -qE '^(workspace-server/internal/handlers/|workspace-server/internal/wsauth/|workspace-server/migrations/|\.gitea/workflows/handlers-postgres-integration\.yml$)'; then
echo "handlers=true" >> "$GITHUB_OUTPUT"
else
echo "handlers=false" >> "$GITHUB_OUTPUT"
fi
# Single-job-with-per-step-if pattern: always runs to satisfy the
# required-check name on branch protection; real work gates on the
# paths filter. See ci.yml's Platform (Go) for the same shape.
integration:
name: Handlers Postgres Integration
needs: detect-changes
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
env:
# Unique name per run so concurrent jobs don't collide on the
# bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
# workflow_dispatch reruns of the same run_id.
PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
# Bridge network already exists on the operator host (declared
# in docker-compose.yml + docker-compose.infra.yml).
PG_NETWORK: molecule-core-net
defaults:
run:
working-directory: workspace-server
steps:
- if: needs.detect-changes.outputs.handlers != 'true'
working-directory: .
run: echo "No handlers/migrations changes — skipping; this job always runs to satisfy the required-check name."
- if: needs.detect-changes.outputs.handlers == 'true'
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- if: needs.detect-changes.outputs.handlers == 'true'
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
with:
go-version: 'stable'
- if: needs.detect-changes.outputs.handlers == 'true'
name: Start sibling Postgres on bridge network
working-directory: .
run: |
# Sanity: the bridge network must exist on the operator host.
# Hard-fail loud if it doesn't — easier to spot than a silent
# auto-create that diverges from the rest of the stack.
if ! docker network inspect "${PG_NETWORK}" >/dev/null 2>&1; then
echo "::error::Bridge network '${PG_NETWORK}' missing on operator host. Re-run docker-compose.infra.yml or check ops handbook."
exit 1
fi
# If a stale container with the same name exists (rerun on
# the same run_id), wipe it first.
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
docker run -d \
--name "${PG_NAME}" \
--network "${PG_NETWORK}" \
--health-cmd "pg_isready -U postgres" \
--health-interval 5s \
--health-timeout 5s \
--health-retries 10 \
-e POSTGRES_PASSWORD=test \
-e POSTGRES_DB=molecule \
postgres:15-alpine >/dev/null
# Read back the bridge IP. Always present immediately after
# `docker run -d` for bridge networks.
PG_HOST=$(docker inspect "${PG_NAME}" \
--format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}")
if [ -z "${PG_HOST}" ]; then
echo "::error::Could not resolve PG_HOST for ${PG_NAME} on ${PG_NETWORK}"
docker logs "${PG_NAME}" || true
exit 1
fi
echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV"
echo "INTEGRATION_DB_URL=postgres://postgres:test@${PG_HOST}:5432/molecule?sslmode=disable" >> "$GITHUB_ENV"
echo "Started ${PG_NAME} at ${PG_HOST}:5432"
- if: needs.detect-changes.outputs.handlers == 'true'
name: Apply migrations to Postgres service
env:
PGPASSWORD: test
run: |
# Wait for postgres to actually accept connections. Docker's
# health-cmd handles container-side readiness, but the wire
# to the bridge IP is best-tested with pg_isready directly.
for i in {1..15}; do
if pg_isready -h "${PG_HOST}" -p 5432 -U postgres -q; then break; fi
echo "waiting for postgres at ${PG_HOST}:5432..."; sleep 2
done
# Apply every migration in lexicographic order; each file runs with
# ON_ERROR_STOP=1, so a failing migration aborts that file and is
# SKIPPED (logged) rather than blocking the suite. This handles the
# current schema state
# where a few historical migrations (e.g. 017_memories_fts_*)
# depend on tables that were later renamed/dropped and so
# cannot replay from scratch. The migrations that DO succeed
# land their tables, which is sufficient for the integration
# tests in handlers/.
#
# Why not maintain a curated allowlist: every new migration
# touching a handlers/-tested table would have to update this
# workflow. With apply-all-or-skip, a future migration that
# adds a column to delegations runs automatically (its base
# table 049_delegations.up.sql already succeeded above it in
# the order). Operators only need to revisit this if the
# migration chain becomes legitimately replayable end-to-end.
#
# Per-migration result is logged so a failed migration that
# SHOULD have been replayable surfaces in the CI log instead
# of silently failing.
# Apply both *.sql (legacy, lives next to its module) and
# *.up.sql (newer up/down convention) in a single
# lexicographically-sorted pass. Excluding *.down.sql so the
# newest-naming-convention pairs don't undo themselves mid-run.
# Pre-#149-followup this loop only globbed *.up.sql, which
# silently skipped 001_workspaces.sql + 009_activity_logs.sql
# — fine while no integration test depended on those tables,
# not fine once a cross-table atomicity test came in.
set +e
for migration in $(ls migrations/*.sql 2>/dev/null | grep -v '\.down\.sql$' | sort); do
if psql -h "${PG_HOST}" -U postgres -d molecule -v ON_ERROR_STOP=1 \
-f "$migration" >/dev/null 2>&1; then
echo "✓ $(basename "$migration")"
else
echo "⊘ $(basename "$migration") (skipped — see comment in workflow)"
fi
done
set -e
# Sanity: the delegations + workspaces + activity_logs tables
# MUST exist for the integration tests to be meaningful. Hard-
# fail if any didn't land — that would be a real regression we
# want loud.
for tbl in delegations workspaces activity_logs pending_uploads; do
if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
-c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
| grep -q 1; then
echo "::error::$tbl table missing after migration replay — handler integration tests would be meaningless"
exit 1
fi
echo "✓ $tbl table present"
done
- if: needs.detect-changes.outputs.handlers == 'true'
name: Run integration tests
run: |
# INTEGRATION_DB_URL is exported by the start-postgres step;
# points at the per-run bridge IP, not 127.0.0.1, so concurrent
# workflow runs don't fight over a host-net 5432 port.
go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_"
- if: failure() && needs.detect-changes.outputs.handlers == 'true'
name: Diagnostic dump on failure
env:
PGPASSWORD: test
run: |
echo "::group::postgres container status"
docker ps -a --filter "name=${PG_NAME}" --format '{{.Status}} {{.Names}}' || true
docker logs "${PG_NAME}" 2>&1 | tail -50 || true
echo "::endgroup::"
echo "::group::delegations table state"
psql -h "${PG_HOST}" -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
echo "::endgroup::"
- if: always() && needs.detect-changes.outputs.handlers == 'true'
name: Stop sibling Postgres
working-directory: .
run: |
# always() so containers don't leak when migrations or tests
# fail. The cleanup is best-effort: if the container is
# already gone (e.g. concurrent rerun race), don't fail the job.
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
echo "Cleaned up ${PG_NAME}"

View File

@ -1,296 +0,0 @@
name: Harness Replays
# Ported from .github/workflows/harness-replays.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
#
# Boots tests/harness (production-shape compose topology with TenantGuard,
# /cp/* proxy, canvas proxy, real production Dockerfile.tenant) and runs
# every replay under tests/harness/replays/. Fails the PR if any replay
# fails.
#
# Why this exists: 2026-04-30 we shipped #2398 which added /buildinfo as
# a public route in router.go but forgot to add it to TenantGuard's
# allowlist. The handler-level test in buildinfo_test.go constructed a
# minimal gin engine without TenantGuard — green. The harness's
# buildinfo-stale-image.sh replay would have caught it (cf-proxy doesn't
# inject X-Molecule-Org-Id, so the curl path is identical to production's
# redeploy verifier), but no one ran the harness pre-merge. The bug
# shipped; the redeploy verifier silently soft-warned every tenant as
# "unreachable" for ~1 day before being noticed.
#
# This gate makes "did you actually run the harness?" a CI invariant
# instead of a memory-discipline thing.
#
# Trigger model — match e2e-api.yml: always FIRES on push/pull_request
# to staging+main, real work is gated per-step on detect-changes output.
# One job → one check run → branch-protection-clean (the SKIPPED-in-set
# trap from PR #2264 is documented in e2e-api.yml's e2e-api job comment).
on:
push:
branches: [main, staging]
paths:
- 'workspace-server/**'
- 'canvas/**'
- 'tests/harness/**'
- '.gitea/workflows/harness-replays.yml'
pull_request:
branches: [main, staging]
paths:
- 'workspace-server/**'
- 'canvas/**'
- 'tests/harness/**'
- '.gitea/workflows/harness-replays.yml'
concurrency:
# Per-SHA grouping. Per-ref kept hitting the auto-promote-staging
# cancellation deadlock — see e2e-api.yml's concurrency block for
# the 2026-04-28 incident that codified this pattern.
group: harness-replays-${{ github.event.pull_request.head.sha || github.sha }}
cancel-in-progress: false
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
detect-changes:
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
outputs:
run: ${{ steps.decide.outputs.run }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Fetch base branch tip for diff
continue-on-error: true
run: |
# With the default fetch-depth: 1, actions/checkout only fetches the
# PR head commit. The base commit is NOT in the local history, so
# `git diff "$BASE" "$GITHUB_SHA"` fails. Fetch the base branch at
# depth 1 — the base commit is the immediate parent of the PR head
# on the base branch, so depth=1 is sufficient.
#
# Network: Gitea Actions runner (5.78.80.188) cannot reach the git
# remote over HTTPS (confirmed: git fetch times out at ~15s). The runner
# is on the same host as Gitea, but the container network namespace
# cannot reach the Gitea HTTPS endpoint.
#
# Fallback: if the base commit does not exist locally, skip the diff
# and set run=true (always run harness). This is safe: PRs where the
# base is unavailable still run the harness (correct), PRs where the
# base IS available get the correct path-based diff.
#
# Timeout: 20s. If the fetch completes, great. If it times out, the
# step exits non-zero and we fall through to run=true.
if timeout 20 git fetch origin "${{ github.event.pull_request.base.ref }}" --depth=1; then
echo "::notice::base branch fetched successfully"
else
echo "::warning::git fetch origin ${{ github.event.pull_request.base.ref }} --depth=1 timed out"
echo "::warning::Skipping diff — detect-changes will run the harness unconditionally."
fi
- id: decide
continue-on-error: true
run: |
# workflow_dispatch: always run (manual trigger)
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "run=true" >> "$GITHUB_OUTPUT"
echo "debug=manual-trigger" >> "$GITHUB_OUTPUT"
exit 0
fi
# Determine the base commit to diff against.
# For pull_request: use base.sha (the merge-base with main/staging).
# For push: use github.event.before (the previous tip of the branch).
# Fallback for new branches (all-zeros SHA): run everything.
if [ "${{ github.event_name }}" = "pull_request" ] && \
[ -n "${{ github.event.pull_request.base.sha }}" ]; then
BASE="${{ github.event.pull_request.base.sha }}"
elif [ -n "${{ github.event.before }}" ] && \
! echo "${{ github.event.before }}" | grep -qE '^0+$'; then
BASE="${{ github.event.before }}"
else
# New branch or github.event.before unavailable — run everything.
echo "run=true" >> "$GITHUB_OUTPUT"
echo "debug=new-branch-fallback" >> "$GITHUB_OUTPUT"
exit 0
fi
# GitHub Actions and Gitea Actions both expose github.sha for HEAD.
# git diff exits non-zero when BASE is not in local history (e.g.
# shallow checkout where the base commit was never fetched).
# Swallowing that with `|| true` alone would leave DIFF empty, which
# the grep below reads as "no relevant files changed" and would
# silently set run=false, the opposite of the fail-safe the fetch step
# above promises. The runner network cannot reach the git remote
# (confirmed: git fetch times out at ~15s), so a missing base commit
# is expected; check for it explicitly and fall back to run=true,
# matching the detect-changes shape in handlers-postgres-integration.yml.
if ! git cat-file -e "$BASE" 2>/dev/null; then
echo "run=true" >> "$GITHUB_OUTPUT"
echo "debug=base-unavailable-fallback" >> "$GITHUB_OUTPUT"
exit 0
fi
DIFF=$(git diff --name-only "$BASE" "${{ github.sha }}" 2>/dev/null) || true
echo "debug=diff-base=$BASE diff-files=$DIFF" >> "$GITHUB_OUTPUT"
if echo "$DIFF" | grep -qE '^workspace-server/|^canvas/|^tests/harness/|^.gitea/workflows/harness-replays\.yml$'; then
echo "run=true" >> "$GITHUB_OUTPUT"
else
echo "run=false" >> "$GITHUB_OUTPUT"
fi
# ONE job that always runs. Real work is gated per-step on
# detect-changes.outputs.run so an unrelated PR (e.g. doc-only
# change to molecule-controlplane wired here later) emits the
# required check without spending CI cycles. Single-job pattern
# matches e2e-api.yml — see that workflow's comment for why a
# job-level `if: false` would block branch protection via the
# SKIPPED-in-set bug.
harness-replays:
needs: detect-changes
name: Harness Replays
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
timeout-minutes: 30
steps:
- name: No-op pass (paths filter excluded this commit)
if: needs.detect-changes.outputs.run != 'true'
run: |
echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."
echo "::notice::Debug: ${{ needs.detect-changes.outputs.debug }}"
- if: needs.detect-changes.outputs.run == 'true'
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
# Log what files were detected so future failures include the diff.
- name: Log detected changes
if: needs.detect-changes.outputs.run == 'true'
run: |
echo "::notice::detect-changes debug: ${{ needs.detect-changes.outputs.debug }}"
# github-app-auth sibling-checkout removed 2026-05-07 (#157):
# the plugin was dropped + Dockerfile.tenant no longer COPYs it.
# Pre-clone manifest deps before docker compose builds the tenant
# image (Task #173 followup — same pattern as
# publish-workspace-server-image.yml's "Pre-clone manifest deps"
# step).
#
# Why pre-clone here too: tests/harness/compose.yml builds tenant-alpha
# and tenant-beta from workspace-server/Dockerfile.tenant with
# context=../.. (repo root). That Dockerfile expects
# .tenant-bundle-deps/{workspace-configs-templates,org-templates,plugins}
# to be present at build context root (post-#173 it COPYs from there
# instead of running an in-image clone — the in-image clone failed
# with "could not read Username for https://git.moleculesai.app"
# because there's no auth path inside the build sandbox).
#
# Without this step harness-replays fails before any replay runs,
# with `failed to calculate checksum of ref ...
# "/.tenant-bundle-deps/plugins": not found`. Caught by run #892
# (main, 2026-05-07T20:28:53Z) and run #964 (staging — same
# symptom, different root cause: staging still has the in-image
# clone path, hits the auth error directly).
#
# 2026-05-08 sub-finding (#192): the clone step ALSO fails when
# any referenced workspace-template repo is private and the
# AUTO_SYNC_TOKEN bearer (devops-engineer persona) lacks read
# access. Root cause: 5 of 9 workspace-template repos
# (openclaw, codex, crewai, deepagents, gemini-cli) had been
# marked private with no team grant. Resolution: flipped them
# to public per `feedback_oss_first_repo_visibility_default`
# (the OSS surface should be public). Layer-3 (customer-private +
# marketplace third-party repos) tracked separately in
# internal#102.
#
# Token shape matches publish-workspace-server-image.yml: AUTO_SYNC_TOKEN
# is the devops-engineer persona PAT, NOT the founder PAT (per
# `feedback_per_agent_gitea_identity_default`). clone-manifest.sh
# embeds it as basic-auth for the duration of the clones and strips
# .git directories — the token never enters the resulting image.
- name: Pre-clone manifest deps
if: needs.detect-changes.outputs.run == 'true'
env:
MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
run: |
set -euo pipefail
if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
exit 1
fi
mkdir -p .tenant-bundle-deps
bash scripts/clone-manifest.sh \
manifest.json \
.tenant-bundle-deps/workspace-configs-templates \
.tenant-bundle-deps/org-templates \
.tenant-bundle-deps/plugins
# Sanity-check counts so a silent partial clone fails fast
# instead of producing a half-empty image.
ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
- name: Install Python deps for replays
# peer-discovery-404 (and future replays) eval Python against the
# running tenant — importing workspace/a2a_client.py pulls in
# httpx. tests/harness/requirements.txt holds just the HTTP-client
# surface to keep CI install fast (~3s) vs the full
# workspace/requirements.txt (~30s).
if: needs.detect-changes.outputs.run == 'true'
run: pip install -r tests/harness/requirements.txt
- name: Run all replays against the harness
# run-all-replays.sh: boot via up.sh → seed via seed.sh → run
# every replays/*.sh → tear down via down.sh on EXIT (trap).
# Non-zero exit on any replay failure.
#
# KEEP_UP=1: without this, the script's trap-on-EXIT tears
# down containers immediately on failure, leaving the dump
# step below with nothing to dump (verified on PR #2410's
# first run — tenant became unhealthy, trap fired, dump
# step saw empty containers). Keeping them up lets the
# failure path collect tenant/cp-stub/cf-proxy logs. The
# always-run "Force teardown" step does the actual cleanup.
if: needs.detect-changes.outputs.run == 'true'
working-directory: tests/harness
env:
KEEP_UP: "1"
run: ./run-all-replays.sh
- name: Dump compose logs on failure
# SECRETS_ENCRYPTION_KEY: docker compose validates the entire compose
# file even for read-only `logs` calls. up.sh generates a per-run key
# and exports it to its OWN shell — this step runs in a fresh shell
# that wouldn't see it, so without a placeholder the validate step
# errors before logs print (verified against PR #2492's first run:
# "required variable SECRETS_ENCRYPTION_KEY is missing a value").
# A placeholder is fine — we're only reading log streams, not booting.
if: failure() && needs.detect-changes.outputs.run == 'true'
working-directory: tests/harness
env:
SECRETS_ENCRYPTION_KEY: dump-logs-placeholder
run: |
echo "=== docker compose ps ==="
docker compose -f compose.yml ps || true
echo "=== tenant-alpha logs ==="
docker compose -f compose.yml logs tenant-alpha || true
echo "=== tenant-beta logs ==="
docker compose -f compose.yml logs tenant-beta || true
echo "=== cp-stub logs ==="
docker compose -f compose.yml logs cp-stub || true
echo "=== cf-proxy logs ==="
docker compose -f compose.yml logs cf-proxy || true
echo "=== postgres-alpha logs (last 100) ==="
docker compose -f compose.yml logs --tail 100 postgres-alpha || true
echo "=== postgres-beta logs (last 100) ==="
docker compose -f compose.yml logs --tail 100 postgres-beta || true
- name: Force teardown
# We pass KEEP_UP=1 to run-all-replays.sh so the dump step
# above sees real containers — that means we own teardown
# explicitly here. Always run.
if: always() && needs.detect-changes.outputs.run == 'true'
working-directory: tests/harness
run: ./down.sh || true

View File

@ -1,104 +0,0 @@
name: Lint curl status-code capture
# Ported from .github/workflows/lint-curl-status-capture.yml on 2026-05-11
# per RFC internal#219 §1 sweep.
#
# Differences from the GitHub version:
# - on.paths and the lint scanner target .gitea/workflows/**.yml (the
# active Gitea workflow directory) instead of .github/workflows/**.yml
# (which the rest of this sweep is emptying out).
# - Self-skip path updated to the .gitea/ version of this file.
# - Dropped `merge_group:` trigger.
# - Workflow-level env.GITHUB_SERVER_URL set per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on the job (RFC §1 contract).
#
# Pins the workflow-bash anti-pattern that produced "HTTP 000000" on the
# 2026-05-04 redeploy-tenants-on-main run for sha 2b862f6:
#
# HTTP_CODE=$(curl ... -w '%{http_code}' ... || echo "000")
#
# When curl exits non-zero (connection reset -> 56, --fail-with-body 4xx/5xx
# -> 22), the `-w '%{http_code}'` already wrote a status to stdout — usually
# "000" for connection failures or the actual code for HTTP errors. The
# `|| echo "000"` then fires AND appends ANOTHER "000" to the captured
# stdout, producing values like "000000" or "409000" that fail string
# comparisons against "200" while looking superficially right.
#
# Same class of bug the synth-E2E §7c gate hit twice (PRs #2779/#2783 +
# #2797). Memory: feedback_curl_status_capture_pollution.md.
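#
# The safe shape (the same fix template the scanner prints on a hit):
# route -w into a tempfile so curl's exit code cannot pollute the
# captured stdout, then default to "000" only if the file is empty:
#   set +e
#   curl ... -w '%{http_code}' >code.txt 2>/dev/null
#   set -e
#   HTTP_CODE=$(cat code.txt 2>/dev/null)
#   [ -z "$HTTP_CODE" ] && HTTP_CODE="000"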
on:
pull_request:
paths: ['.gitea/workflows/**']
push:
branches: [main, staging]
paths: ['.gitea/workflows/**']
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
scan:
name: Scan workflows for curl status-capture pollution
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
# the PR. Follow-up PR flips this off after surfaced defects are
# triaged.
continue-on-error: true
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Find curl ... -w '%{http_code}' ... || echo "000" subshells
run: |
set -uo pipefail
# Multi-line aware: look for `$(curl ... -w '%{http_code}' ... || echo "000")`
# subshell where the entire command-substitution wraps a curl that
# ends with `|| echo "000"`. Must distinguish from the SAFE shape
# `$(cat tempfile 2>/dev/null || echo "000")` — `cat` with a missing
# tempfile produces empty stdout, no pollution.
python3 <<'PY'
import os, re, sys, glob
BAD_FILES = []
# Match the buggy substitution across newlines: $(curl ... -w '%{http_code}' ... || echo "000")
# A trailing backslash-newline is the bash line-continuation that lets curl flags span lines.
# We collapse continuation lines first, then look for the single-line bad pattern.
PATTERN = re.compile(
r'\$\(\s*curl\b[^)]*-w\s*[\'"]%\{http_code\}[\'"][^)]*\|\|\s*echo\s+"000"\s*\)',
re.DOTALL,
)
# Self-skip: this lint workflow contains the literal anti-pattern in
# its own docstring — that's intentional, not a bug.
SELF = ".gitea/workflows/lint-curl-status-capture.yml"
for f in sorted(glob.glob(".gitea/workflows/*.yml")):
if f == SELF:
continue
with open(f) as fh:
content = fh.read()
# Collapse bash line-continuations (\\\n + leading whitespace)
# into a single logical line so the regex can see the full
# curl invocation as one chunk.
flat = re.sub(r'\\\s*\n\s*', ' ', content)
for m in PATTERN.finditer(flat):
BAD_FILES.append((f, m.group(0)[:120]))
if not BAD_FILES:
print("OK No curl-status-capture pollution patterns detected")
sys.exit(0)
print(f"::error::Found {len(BAD_FILES)} curl-status-capture pollution site(s):")
for f, snippet in BAD_FILES:
print(f"::error file={f}::Curl status-capture pollution: '|| echo \"000\"' inside a $(curl ... -w '%{{http_code}}' ...) subshell. On non-2xx or connection failure, curl's -w writes a status, then exits non-zero, then the || echo appends another '000' — producing 'HTTP 000000' or '409000' that fails comparisons silently. Fix: route -w into a tempfile so the exit code can't pollute stdout. See memory feedback_curl_status_capture_pollution.md.")
print(f" matched: {snippet}...")
print()
print("Fix template:")
print(' set +e')
print(' curl ... -w \'%{http_code}\' >code.txt 2>/dev/null')
print(' set -e')
print(' HTTP_CODE=$(cat code.txt 2>/dev/null)')
print(' [ -z "$HTTP_CODE" ] && HTTP_CODE="000"')
sys.exit(1)
PY

View File

@ -1,94 +0,0 @@
# main-red-watchdog — hourly sentinel for post-merge CI red on `main`.
#
# RFC: hongming "main NEVER goes red" directive, Option C of the four-
# option ladder (B = auto-revert is explicitly rejected per
# `feedback_no_such_thing_as_flakes` + `feedback_fix_root_not_symptom`).
# Tracking issue: molecule-core#420.
#
# What it does:
# 1. GET branches/main → HEAD SHA
# 2. GET commits/{SHA}/status → combined status
# 3. If combined is `failure` (or any individual status is `failure`):
# open or PATCH an idempotent `[main-red] {repo}: {SHA[:10]}` issue
# with each failed context + target_url + description.
# 4. If combined is `success` and a prior `[main-red] ...` issue exists,
# close it with a "main returned to green at SHA ..." comment.
# 5. Emit a Loki-shaped JSON line via `logger -t main-red-watchdog` for
# `reference_obs_stack_phase1` ingestion via Vector.
#
# What it does NOT do:
# - Auto-revert anything. Option B is rejected by directive.
# - Mutate branch protection. (See AGENTS.md boundaries.)
# - Fail the workflow on red. The issue IS the alarm — failing the
# watchdog would create a silent-loop where a flake in the watchdog
# itself hides actual main-red signal. Exit 0 unless api() raises
# ApiError (transient Gitea outage → fail loudly per
# `feedback_api_helper_must_raise_not_return_dict`).
#
# Pattern source: molecule-controlplane `0adf2098`'s ci-required-drift.yml
# (just merged 2026-05-11). Same shape (cron + dispatch + sidecar Python +
# idempotent-by-title issue), simpler scope (1 source, not 3).
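#
# Probe shape, sketched (the real logic lives in
# .gitea/scripts/main-red-watchdog.py; field names assume Gitea's
# combined-status API v1):
#   GET /api/v1/repos/{repo}/branches/main          -> commit.id (HEAD SHA)
#   GET /api/v1/repos/{repo}/commits/{sha}/status   -> state, plus
#       statuses[] each carrying context/target_url/description for
#       the issue body.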
name: main-red-watchdog
# IMPORTANT — Gitea 1.22.6 parser quirk per
# `feedback_gitea_workflow_dispatch_inputs_unsupported`: do NOT add an
# `inputs:` block here. Gitea 1.22.6 rejects the whole workflow as
# "unknown on type" when `workflow_dispatch.inputs.X` is present. Revisit
# when Gitea ≥ 1.23 is fleet-wide.
on:
schedule:
# Hourly at :05 — task spec calls for "off-zero" (`5 * * * *`),
# offset from :17 (ci-required-drift) and :00 (peak cron load).
- cron: '5 * * * *'
workflow_dispatch:
# Read commit status + branch ref + issues; write issues (open/PATCH/close).
permissions:
contents: read
issues: write
# Workflow-scoped serialisation — two simultaneous runs would race on the
# `[main-red] {SHA}` open/PATCH path. Idempotent by title, but parallel
# POSTs can produce duplicates before the title search dedup wins.
concurrency:
group: main-red-watchdog
cancel-in-progress: false
jobs:
watchdog:
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- name: Check out repo (script lives at .gitea/scripts/)
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Set up Python (stdlib only — no PyYAML needed here)
# The script uses stdlib urllib + json. No PyYAML required (CP's
# drift detector needs it for AST parsing; we don't). Pin to the
# same 3.12 hermetic interpreter CP uses so the test/runtime
# versions stay aligned across watchdog suites.
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
- name: Run main-red watchdog
env:
# GITEA_TOKEN reads commit status + writes issues. Falls back
# to the auto-injected GITHUB_TOKEN if the org-level secret
# isn't set (transitional repos), matching the same pattern
# used by deploy-pipeline.yml + ci-required-drift.yml.
GITEA_TOKEN: ${{ secrets.GITEA_TOKEN || secrets.GITHUB_TOKEN }}
GITEA_HOST: git.moleculesai.app
REPO: ${{ github.repository }}
# Branch under watch. `main` per directive; staging not
# included here — staging green is a separate gate
# (`feedback_staging_e2e_merge_gate`).
WATCH_BRANCH: 'main'
# Issue label applied on file/open. `tier:high` exists in the
# molecule-core label set (verified 2026-05-11, label id 9).
# Rationale for high: main red blocks the promotion train and
# poisons every PR's auto-rebase base; treat as a fire even
# if intermittent.
RED_LABEL: 'tier:high'
run: python3 .gitea/scripts/main-red-watchdog.py

View File

@ -1,138 +0,0 @@
name: publish-canvas-image
# Ported from .github/workflows/publish-canvas-image.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
# - **Open question for review**: this workflow pushes the canvas
# image to `ghcr.io`. GHCR was retired during the 2026-05-06
# Gitea migration in favor of ECR (per staging-verify.yml header
# notes). The image may not be consumable post-migration. Two
# options for follow-up: (a) retarget to
# `153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas`,
# or (b) retire this workflow entirely and route canvas deploys
# via the operator-host build path. tier:low + continue-on-error
# means failed pushes do not block PRs.
#
# Builds and pushes the canvas Docker image to GHCR whenever a commit lands
# on main that touches canvas code. Previously canvas changes were visible in
# CI (npm run build passed) but the live container was never updated —
# operators had to manually run `docker compose build canvas` each time.
#
# Mirror of publish-platform-image.yml, adapted for the Next.js canvas layer.
# See that workflow for inline notes on macOS Keychain isolation and QEMU.
on:
push:
branches: [main]
paths:
# Only rebuild when canvas source changes — saves GHA minutes on
# platform-only / docs-only / MCP-only merges.
- 'canvas/**'
- '.gitea/workflows/publish-canvas-image.yml'
# NOTE (Gitea port): the original GitHub workflow had a
# `workflow_dispatch:` manual trigger for the
# non-canvas-merge-but-need-fresh-image scenario. Dropped in the
# Gitea port (1.22.6 parser-finicky). Manual rebuilds require
# pushing an empty commit to canvas/ or running the operator-host
# build directly.
permissions:
contents: read
packages: write # required to push to ghcr.io/${{ github.repository_owner }}/*
env:
IMAGE_NAME: ghcr.io/molecule-ai/canvas
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
build-and-push:
name: Build & push canvas image
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
steps:
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Log in to GHCR
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
# Health check: verify Docker daemon is accessible before attempting any
# build steps. This fails loudly at step 1 when the runner's docker.sock
# is inaccessible rather than silently continuing to the build step
# where docker build fails deep in ECR auth with a cryptic error.
- name: Verify Docker daemon access
run: |
set -euo pipefail
echo "::group::Docker daemon health check"
docker info 2>&1 | head -5 || {
echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
echo "::error::Check: (1) daemon running, (2) runner user in docker group, (3) sock perms 660+"
exit 1
}
echo "Docker daemon OK"
echo "::endgroup::"
- name: Compute tags
id: tags
shell: bash
run: |
echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
- name: Resolve build args
id: build_args
# Priority: workflow_dispatch input > repo secret > hardcoded default.
# (The dispatch trigger was dropped in this Gitea port, so the INPUT_*
# env vars below are always empty; the secret/default tiers decide.)
# NEXT_PUBLIC_* env vars are baked into the JS bundle at build time by
# Next.js — they cannot be changed at runtime without a full rebuild.
# For local docker-compose deployments the defaults (localhost:8080)
# work as-is; production deployments should set CANVAS_PLATFORM_URL
# and CANVAS_WS_URL as repository secrets.
#
# Inputs are passed via env vars (not direct ${{ }} interpolation) to
# prevent shell injection from workflow_dispatch string inputs.
shell: bash
env:
INPUT_PLATFORM_URL: ${{ github.event.inputs.platform_url }}
SECRET_PLATFORM_URL: ${{ secrets.CANVAS_PLATFORM_URL }}
INPUT_WS_URL: ${{ github.event.inputs.ws_url }}
SECRET_WS_URL: ${{ secrets.CANVAS_WS_URL }}
run: |
PLATFORM_URL="${INPUT_PLATFORM_URL:-${SECRET_PLATFORM_URL:-http://localhost:8080}}"
WS_URL="${INPUT_WS_URL:-${SECRET_WS_URL:-ws://localhost:8080/ws}}"
echo "platform_url=${PLATFORM_URL}" >> "$GITHUB_OUTPUT"
echo "ws_url=${WS_URL}" >> "$GITHUB_OUTPUT"
- name: Build & push canvas image to GHCR
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
with:
context: ./canvas
file: ./canvas/Dockerfile
platforms: linux/amd64
push: true
build-args: |
NEXT_PUBLIC_PLATFORM_URL=${{ steps.build_args.outputs.platform_url }}
NEXT_PUBLIC_WS_URL=${{ steps.build_args.outputs.ws_url }}
tags: |
${{ env.IMAGE_NAME }}:latest
${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }}
cache-from: type=gha
cache-to: type=gha,mode=max
labels: |
org.opencontainers.image.source=https://github.com/${{ github.repository }}
org.opencontainers.image.revision=${{ github.sha }}
org.opencontainers.image.description=Molecule AI canvas (Next.js 15 + React Flow)

View File

@ -1,108 +0,0 @@
name: publish-runtime-autobump
# Auto-bump-on-workspace-edit half of the publish pipeline.
#
# Why this file exists (issue #351):
# Gitea Actions does not correctly disambiguate `paths:` from `tags:`
# when both are bundled under a single `on.push` key. The result is
# that tag pushes get filtered out and `publish-runtime.yml` never
# fires — `action_run` rows: 0. This was unnoticed pre-2026-05-11
# because PYPI_TOKEN was absent (publishes would have failed anyway).
#
# Split design:
# - publish-runtime.yml : on.push.tags only (the publisher)
# - publish-runtime-autobump.yml: on.push.branches+paths (this file — the version-bumper)
#
# This file computes the next version from PyPI's latest, pushes a
# `runtime-v$VERSION` tag, and exits. The tag push then triggers
# publish-runtime.yml via its tags-only trigger.
#
# Concurrency: shares the `publish-runtime` group with publish-runtime.yml
# so concurrent workspace pushes serialize at the bump step. Without
# this, two pushes minutes apart could both read PyPI latest=0.1.129
# and try to tag 0.1.130 simultaneously, only one of which would land.
on:
push:
branches:
- main
- staging
paths:
- "workspace/**"
permissions:
contents: write # required to push tags back
concurrency:
group: publish-runtime
cancel-in-progress: false
jobs:
autobump-and-tag:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
# Shallow clone — depth 1 is enough for the workspace-diff check.
# Tags needed for the collision check below are fetched explicitly
# in the next step, bypassing the runner-network timeout that
# full-history fetch triggers on Gitea Actions runners
# (runbooks/gitea-operational-quirks.md §runner-network-isolation).
fetch-depth: 1
- name: Fetch tags for collision check
# fetch-depth: 1 gets only the most recent commit's refs, not the
# tag that points at it. Do a targeted tag fetch so git tag --list
# below can detect collision with prior manual pushes.
run: git fetch origin --tags --depth=1
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: "3.11"
- name: Compute next version from PyPI latest
id: bump
run: |
set -eu
LATEST=$(curl -fsS --retry 3 https://pypi.org/pypi/molecule-ai-workspace-runtime/json \
| python -c "import sys,json; print(json.load(sys.stdin)['info']['version'])")
MAJOR=$(echo "$LATEST" | cut -d. -f1)
MINOR=$(echo "$LATEST" | cut -d. -f2)
PATCH=$(echo "$LATEST" | cut -d. -f3)
VERSION="${MAJOR}.${MINOR}.$((PATCH+1))"
echo "PyPI latest=$LATEST -> next=$VERSION"
if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+$'; then
echo "::error::computed version $VERSION does not match PEP 440 X.Y.Z"
exit 1
fi
if git tag --list | grep -qx "runtime-v$VERSION"; then
echo "::error::tag runtime-v$VERSION already exists in this repo. Manual intervention required (PyPI and Gitea tag history are out of sync)."
exit 1
fi
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
- name: Push runtime-v$VERSION tag
env:
DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
VERSION: ${{ steps.bump.outputs.version }}
GITEA_URL: https://git.moleculesai.app
run: |
set -eu
if [ -z "$DISPATCH_TOKEN" ]; then
echo "::error::DISPATCH_TOKEN secret is not set — needed to push the tag back to molecule-core."
exit 1
fi
git config user.name "publish-runtime autobump"
git config user.email "publish-runtime@moleculesai.app"
git tag -a "runtime-v$VERSION" \
-m "Auto-bump on workspace/** edit on $GITHUB_REF" \
-m "Triggered by: $GITHUB_REF @ $GITHUB_SHA" \
-m "publish-runtime.yml will pick up this tag and upload to PyPI"
# Push via DISPATCH_TOKEN (a Gitea PAT). Using the bot identity
# ensures the resulting tag-push event is dispatched to
# publish-runtime.yml; act_runner's default GITHUB_TOKEN cannot
# trigger downstream workflows.
git remote set-url origin "${GITEA_URL#https://}"
git remote set-url origin "https://x-access-token:${DISPATCH_TOKEN}@${GITEA_URL#https://}/molecule-ai/molecule-core.git"
git push origin "runtime-v$VERSION"
echo "✓ pushed runtime-v$VERSION — publish-runtime.yml should fire next"

View File

@ -1,339 +0,0 @@
name: publish-runtime
# Gitea Actions port of .github/workflows/publish-runtime.yml.
#
# Ported 2026-05-10 (issue #206). Key differences from the GitHub version:
# - Gitea Actions reads .gitea/workflows/, not .github/workflows/
# - Dropped `environment: pypi-publish` — Gitea Actions does not support
# named environments or OIDC trusted publishers
# - Replaced `pypa/gh-action-pypi-publish@release/v1` (OIDC) with
# `twine upload` using PYPI_TOKEN secret — same mechanism as a local
# `python -m twine upload` with a PyPI token
# - Replaced `github.ref_name` (GitHub-only) with `${GITHUB_REF#refs/tags/}`
# — Gitea Actions exposes github.ref (the full ref) but not ref_name
# - Dropped `merge_group` trigger (Gitea has no merge queue)
#
# 2026-05-10 (issue #348): originally restored `staging`/`main` branch +
# `workspace/**` path-filter trigger in PR #349.
#
# 2026-05-11 (issue #351): REVERTED the branches+paths trigger from THIS
# file. Bundling `paths` with `tags` under a single `on.push` key caused
# Gitea Actions to never dispatch the workflow for tag-push events (0
# runs in `action_run` for workflow_id='publish-runtime.yml' since the
# port, including the runtime-v1.0.0 tag — which is why PyPI is still at
# 0.1.129 despite a v1.0.0 Gitea tag existing).
#
# The auto-bump-on-workspace-edit trigger now lives in
# `.gitea/workflows/publish-runtime-autobump.yml`. That file computes the
# next version from PyPI's latest and pushes a `runtime-v$VERSION` tag,
# which THIS file then picks up via the tags-only trigger below.
#
# This decoupling means Gitea's path-vs-tag evaluator never has to
# disambiguate — each file has a single unambiguous trigger shape.
#
# PyPI publishing: requires PYPI_TOKEN repository secret (or org-level secret).
# Set via: repo Settings → Actions → Variables and Secrets → New Secret.
# The token should be a PyPI API token scoped to molecule-ai-workspace-runtime.
#
# The DISPATCH_TOKEN cascade (git push to template repos) is unchanged —
# it uses the Gitea API directly and was already Gitea-compatible.
on:
push:
tags:
- "runtime-v*"
workflow_dispatch:
# 2026-05-11 (root cause of #351 / 0 runs ever):
# Gitea 1.22.6's workflow parser rejects `workflow_dispatch.inputs.version`
# with "unknown on type" — it mis-treats the inputs sub-keys as top-level
# `on:` event types. Log line:
# actions/workflows.go:DetectWorkflows() [W] ignore invalid workflow
# "publish-runtime.yml": unknown on type: map["version": {...}]
# That `[W] ignore invalid workflow` is silent UX — the workflow never
# registers, so it never fires for ANY event (push.tags included).
# Removing the inputs block restores parsing. Manual dispatch from the
# Gitea UI now triggers the PyPI auto-bump fallback in `Derive version`
# below (no `inputs.version` to read).
permissions:
contents: read
# Serialize publishes so two concurrent tag pushes don't both compute
# "latest+1" and race on PyPI upload. The second one waits.
concurrency:
group: publish-runtime
cancel-in-progress: false
jobs:
publish:
runs-on: ubuntu-latest
outputs:
version: ${{ steps.version.outputs.version }}
wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: "3.11"
cache: pip
- name: Derive version (tag or PyPI auto-bump)
id: version
run: |
if echo "$GITHUB_REF" | grep -q "^refs/tags/runtime-v"; then
# Tag is `runtime-vX.Y.Z` — strip the prefix.
VERSION="${GITHUB_REF#refs/tags/runtime-v}"
else
# workflow_dispatch path (no inputs supported on Gitea 1.22.6) or
# any other non-tag trigger: derive from PyPI latest + patch bump.
LATEST=$(curl -fsS --retry 3 https://pypi.org/pypi/molecule-ai-workspace-runtime/json \
| python -c "import sys,json; print(json.load(sys.stdin)['info']['version'])")
MAJOR=$(echo "$LATEST" | cut -d. -f1)
MINOR=$(echo "$LATEST" | cut -d. -f2)
PATCH=$(echo "$LATEST" | cut -d. -f3)
VERSION="${MAJOR}.${MINOR}.$((PATCH+1))"
echo "Auto-bumped from PyPI latest $LATEST -> $VERSION"
fi
if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+(\.dev[0-9]+|rc[0-9]+|a[0-9]+|b[0-9]+|\.post[0-9]+)?$'; then
echo "::error::version $VERSION does not match PEP 440"
exit 1
fi
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
echo "Publishing molecule-ai-workspace-runtime $VERSION"
- name: Install build tooling
run: pip install build twine
- name: Build package from workspace/
run: |
python scripts/build_runtime_package.py \
--version "${{ steps.version.outputs.version }}" \
--out "${{ runner.temp }}/runtime-build"
- name: Build wheel + sdist
working-directory: ${{ runner.temp }}/runtime-build
run: python -m build
- name: Capture wheel SHA256 for cascade content-verification
id: wheel_hash
working-directory: ${{ runner.temp }}/runtime-build
run: |
set -eu
WHEEL=$(ls dist/*.whl 2>/dev/null | head -1)
if [ -z "$WHEEL" ]; then
echo "::error::No .whl in dist/ — \`python -m build\` must have failed silently"
exit 1
fi
HASH=$(sha256sum "$WHEEL" | awk '{print $1}')
echo "wheel_sha256=${HASH}" >> "$GITHUB_OUTPUT"
echo "Local wheel SHA256 (pre-upload): ${HASH}"
echo "Wheel filename: $(basename "$WHEEL")"
- name: Verify package contents (sanity)
working-directory: ${{ runner.temp }}/runtime-build
run: |
python -m twine check dist/*
python -m venv /tmp/smoke
/tmp/smoke/bin/pip install --quiet dist/*.whl
/tmp/smoke/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
- name: Publish to PyPI
# working-directory matches the preceding Build/Verify steps. Without
# this, twine runs from the default workspace checkout dir where
# `dist/` doesn't exist and fails with:
# ERROR InvalidDistribution: Cannot find file (or expand pattern): 'dist/*'
# Caught on the first-ever successful dispatch of this workflow
# (run 5097, 2026-05-11 02:08Z) — every other step in the publish
# job already had this working-directory; Publish was missing it.
working-directory: ${{ runner.temp }}/runtime-build
env:
# PYPI_TOKEN: repository secret scoped to molecule-ai-workspace-runtime.
# Set via: Settings → Actions → Variables and Secrets → New Secret.
# Format: pypi-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: |
if [ -z "$PYPI_TOKEN" ]; then
echo "::error::PYPI_TOKEN secret is not set — set it at Settings → Actions → Variables and Secrets → New Secret."
echo "::error::Required format: pypi-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
exit 1
fi
python -m twine upload \
--repository pypi \
--username __token__ \
--password "$PYPI_TOKEN" \
dist/*
cascade:
needs: publish
runs-on: ubuntu-latest
steps:
- name: Wait for PyPI to propagate the new version
env:
RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
EXPECTED_SHA256: ${{ needs.publish.outputs.wheel_sha256 }}
run: |
set -eu
if [ -z "$EXPECTED_SHA256" ]; then
echo "::error::publish job did not expose wheel_sha256 — cannot verify wheel content. Refusing to fan out cascade."
exit 1
fi
python -m venv /tmp/propagation-probe
PROBE=/tmp/propagation-probe/bin
$PROBE/pip install --upgrade --quiet pip
for i in $(seq 1 30); do
if $PROBE/pip install \
--quiet \
--no-cache-dir \
--force-reinstall \
--no-deps \
"molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
>/dev/null 2>&1; then
INSTALLED=$($PROBE/pip show molecule-ai-workspace-runtime 2>/dev/null \
| awk -F': ' '/^Version:/{print $2}')
if [ "$INSTALLED" = "$RUNTIME_VERSION" ]; then
echo "✓ PyPI resolved $RUNTIME_VERSION (install check)"
break
fi
fi
if [ $i -eq 30 ]; then
echo "::error::pip install --no-cache-dir molecule-ai-workspace-runtime==${RUNTIME_VERSION} never resolved within ~5 min."
echo "::error::Refusing to fan out cascade against a potentially stale PyPI index."
exit 1
fi
echo " [$i/30] waiting for PyPI to propagate ${RUNTIME_VERSION}..."
sleep 4
done
# Stage (b): download wheel + SHA256 compare against what we built.
# Catches Fastly stale-content serving old bytes under a new version URL.
#
# Caught run 5196 (first-ever successful publish, 2026-05-11): the
# previous one-liner `HASH=$(pip download ... && sha256sum ...)`
# captured pip's stdout (`Collecting molecule-ai-workspace-runtime
# ==X.Y.Z`) into HASH, then the SHA comparison failed against the
# leaked `Collecting...` string. `2>/dev/null` silences stderr but
# NOT stdout; pip writes its progress to stdout by default.
# Fix: split into two steps, silence pip's stdout explicitly, capture
# only sha256sum's output into HASH.
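# The broken shape, for reference (pip's stdout leaks into HASH):
#   HASH=$(python -m pip download ... 2>/dev/null \
#     && sha256sum /tmp/wheel-probe/*.whl | awk '{print $1}')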
python -m pip download \
--no-deps \
--no-cache-dir \
--dest /tmp/wheel-probe \
--quiet \
"molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
>/dev/null 2>&1
HASH=$(sha256sum /tmp/wheel-probe/*.whl | awk '{print $1}')
if [ "$HASH" != "$EXPECTED_SHA256" ]; then
echo "::error::PyPI propagated $RUNTIME_VERSION but wheel content SHA256 mismatch."
echo "::error::Expected: $EXPECTED_SHA256"
echo "::error::Got: $HASH"
echo "::error::Fastly may be serving stale content. Refusing to fan out cascade."
exit 1
fi
echo "✓ PyPI CDN verified (SHA256 match)"
- name: Fan out via push to .runtime-version
env:
# Gitea PAT with write:repository scope on the cascade-active
# template repos (the TEMPLATES list below, currently 9). Used for
# git push to each template repo's main
# branch, which trips their `on: push: branches: [main]` trigger
# on publish-image.yml.
DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
run: |
set +e # don't abort on a single repo failure — collect them all
if [ -z "$DISPATCH_TOKEN" ]; then
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "::warning::DISPATCH_TOKEN secret not set — skipping cascade."
echo "::warning::set it at Settings → Actions → Variables and Secrets → New Secret."
exit 0
fi
echo "::error::DISPATCH_TOKEN secret missing — cascade cannot fan out."
echo "::error::PyPI was published, but the 8 template repos will NOT pick up the new version."
exit 1
fi
VERSION="$RUNTIME_VERSION"
if [ -z "$VERSION" ]; then
echo "::error::publish job did not expose a version output"
exit 1
fi
GITEA_URL="${GITEA_URL:-https://git.moleculesai.app}"
TEMPLATES="claude-code hermes openclaw codex langgraph crewai autogen deepagents gemini-cli"
FAILED=""
SKIPPED=""
git config --global user.name "publish-runtime cascade"
git config --global user.email "publish-runtime@moleculesai.app"
WORKDIR="$(mktemp -d)"
for tpl in $TEMPLATES; do
REPO="molecule-ai/molecule-ai-workspace-template-$tpl"
CLONE="$WORKDIR/$tpl"
HTTP=$(curl -sS -o /dev/null -w "%{http_code}" \
-H "Authorization: token $DISPATCH_TOKEN" \
"$GITEA_URL/api/v1/repos/$REPO/contents/.github/workflows/publish-image.yml")
if [ "$HTTP" = "404" ]; then
echo "↷ $tpl has no publish-image.yml — soft-skip"
SKIPPED="$SKIPPED $tpl"
continue
fi
attempt=0
success=false
while [ $attempt -lt 3 ]; do
attempt=$((attempt + 1))
rm -rf "$CLONE"
if ! git clone --depth=1 \
"https://x-access-token:${DISPATCH_TOKEN}@${GITEA_URL#https://}/$REPO.git" \
"$CLONE" >/tmp/clone.log 2>&1; then
echo "::warning::clone $tpl attempt $attempt failed: $(tail -n3 /tmp/clone.log)"
sleep 2
continue
fi
cd "$CLONE"
echo "$VERSION" > .runtime-version
# Stage first, then check the index: plain `git diff --quiet` reports
# clean for an UNTRACKED .runtime-version, which would wrongly skip
# the commit on a template repo that never had the file.
git add .runtime-version
if git diff --cached --quiet -- .runtime-version; then
echo "✓ $tpl already at $VERSION — no commit needed"
success=true
cd - >/dev/null
break
fi
git commit -m "chore: pin runtime to $VERSION (publish-runtime cascade)" \
-m "Co-Authored-By: publish-runtime cascade <publish-runtime@moleculesai.app>" \
>/dev/null
if git push origin HEAD:main >/tmp/push.log 2>&1; then
echo "✓ $tpl pushed $VERSION on attempt $attempt"
success=true
cd - >/dev/null
break
fi
echo "::warning::push $tpl attempt $attempt failed, pull-rebasing"
git pull --rebase origin main >/tmp/rebase.log 2>&1 || true
cd - >/dev/null
done
if [ "$success" != "true" ]; then
FAILED="$FAILED $tpl"
fi
done
rm -rf "$WORKDIR"
if [ -n "$FAILED" ]; then
echo "::error::Cascade incomplete after 3 retries each. Failed:$FAILED"
exit 1
fi
if [ -n "$SKIPPED" ]; then
echo "Cascade complete: pinned $VERSION. Soft-skipped (no publish-image.yml):$SKIPPED"
else
echo "Cascade complete: $VERSION pinned across all manifest workspace_templates."
fi

View File

@@ -1,174 +0,0 @@
name: publish-workspace-server-image
# Gitea Actions port of .github/workflows/publish-workspace-server-image.yml.
#
# Ported 2026-05-10 (issue #228). Key differences from the GitHub version:
# - Gitea Actions reads .gitea/workflows/, not .github/workflows/
# - Dropped `environment:` declarations — Gitea Actions does not support
# named environments (used by GitHub OIDC token gates)
# - Replaced `github.ref_name` (GitHub-only) with `${GITHUB_REF#refs/heads/}`
# — Gitea Actions exposes GITHUB_REF in the same format as GitHub Actions
# - docker/setup-buildx-action and aws-actions/configure-aws-credentials are
# GitHub Marketplace actions; act_runner fetches them from GitHub and
# they work identically here
# - All other variables (GITHUB_SHA, GITHUB_REPOSITORY, GITHUB_OUTPUT,
# secrets.*) use the same syntax as GitHub Actions
#
# Image tags produced:
# :staging-<sha> — per-commit digest, stable for canary verify
# :staging-latest — tracks most recent build on this branch
#
# ECR target: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/*
# Required secrets: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AUTO_SYNC_TOKEN
on:
push:
branches: [main]
paths:
- 'workspace-server/**'
- 'canvas/**'
- 'manifest.json'
- 'scripts/**'
- '.gitea/workflows/publish-workspace-server-image.yml'
workflow_dispatch:
# Serialize per-branch so two rapid staging pushes don't race the same
# :staging-latest tag retag. Allow staging and main to run in parallel
# (different GITHUB_REF → different concurrency group) since they
# produce different :staging-<sha> tags and last-write-wins on
# :staging-latest is acceptable across branches.
#
# cancel-in-progress: false → in-flight builds finish; the next push's
# build queues. This avoids a partially-pushed image.
concurrency:
group: publish-workspace-server-image-${{ github.ref }}
cancel-in-progress: false
permissions:
contents: read
packages: write
env:
IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
jobs:
build-and-push:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
# Health check: verify Docker daemon is accessible before attempting any
# build steps. This fails loudly at step 1 when the runner's docker.sock
# is inaccessible (e.g. permission change, daemon restart, or group-membership
# drift) rather than silently continuing to step 2 where `docker build`
# fails deep in the process with a cryptic ECR auth error that doesn't
# surface the root cause. Also reports the daemon version so operator
# can correlate with runner host logs.
- name: Verify Docker daemon access
run: |
set -euo pipefail
echo "::group::Docker daemon health check"
# Capture to a file instead of piping into `head`: under pipefail,
# an early-exiting head can SIGPIPE `docker info` and fail a healthy
# daemon check spuriously.
docker info > /tmp/docker-info.txt 2>&1 || {
cat /tmp/docker-info.txt
echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
echo "::error::Check: (1) daemon is running, (2) runner user is in docker group, (3) sock permissions are 660+"
exit 1
}
head -5 /tmp/docker-info.txt
echo "Docker daemon OK"
echo "::endgroup::"
# Pre-clone manifest deps before docker build.
#
# Why: workspace-template-* repos on Gitea are private. The pre-fix
# Dockerfile.tenant ran `git clone` inside an in-image stage with no
# auth path — every CI build failed. We clone in the trusted CI
# context where AUTO_SYNC_TOKEN is available and Dockerfile.tenant
# just COPYs from .tenant-bundle-deps/.
#
# Token: AUTO_SYNC_TOKEN is the devops-engineer persona PAT.
# clone-manifest.sh embeds it as basic-auth for the clones, then
# strips .git dirs — the token never enters the image.
- name: Pre-clone manifest deps
env:
MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
run: |
set -euo pipefail
if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
echo "::error::AUTO_SYNC_TOKEN secret is empty"
exit 1
fi
mkdir -p .tenant-bundle-deps
bash scripts/clone-manifest.sh \
manifest.json \
.tenant-bundle-deps/workspace-configs-templates \
.tenant-bundle-deps/org-templates \
.tenant-bundle-deps/plugins
ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
- name: Compute tags
id: tags
run: |
echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
# Build + push platform image (inline ECR auth — mirrors the operator-host
# approach; credentials come from GITHUB_SECRET_AWS_ACCESS_KEY_ID /
# GITHUB_SECRET_AWS_SECRET_ACCESS_KEY in Gitea Actions).
- name: Build & push platform image to ECR (staging-<sha> + staging-latest)
env:
IMAGE_NAME: ${{ env.IMAGE_NAME }}
TAG_SHA: staging-${{ steps.tags.outputs.sha }}
TAG_LATEST: staging-latest
GIT_SHA: ${{ github.sha }}
REPO: ${{ github.repository }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-2
run: |
set -euo pipefail
ECR_REGISTRY="${IMAGE_NAME%%/*}"
aws ecr get-login-password --region us-east-2 | \
docker login --username AWS --password-stdin "${ECR_REGISTRY}"
docker build \
--file ./workspace-server/Dockerfile \
--build-arg GIT_SHA="${GIT_SHA}" \
--label "org.opencontainers.image.source=https://github.com/${REPO}" \
--label "org.opencontainers.image.revision=${GIT_SHA}" \
--label "org.opencontainers.image.description=Molecule AI platform — pending canary verify" \
--tag "${IMAGE_NAME}:${TAG_SHA}" \
--tag "${IMAGE_NAME}:${TAG_LATEST}" \
.
docker push "${IMAGE_NAME}:${TAG_SHA}"
docker push "${IMAGE_NAME}:${TAG_LATEST}"
# Build + push tenant image (Go platform + Next.js canvas in one image).
- name: Build & push tenant image to ECR (staging-<sha> + staging-latest)
env:
TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
TAG_SHA: staging-${{ steps.tags.outputs.sha }}
TAG_LATEST: staging-latest
GIT_SHA: ${{ github.sha }}
REPO: ${{ github.repository }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-2
run: |
set -euo pipefail
ECR_REGISTRY="${TENANT_IMAGE_NAME%%/*}"
aws ecr get-login-password --region us-east-2 | \
docker login --username AWS --password-stdin "${ECR_REGISTRY}"
docker build \
--file ./workspace-server/Dockerfile.tenant \
--build-arg NEXT_PUBLIC_PLATFORM_URL= \
--build-arg GIT_SHA="${GIT_SHA}" \
--label "org.opencontainers.image.source=https://github.com/${REPO}" \
--label "org.opencontainers.image.revision=${GIT_SHA}" \
--label "org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify" \
--tag "${TENANT_IMAGE_NAME}:${TAG_SHA}" \
--tag "${TENANT_IMAGE_NAME}:${TAG_LATEST}" \
.
docker push "${TENANT_IMAGE_NAME}:${TAG_SHA}"
docker push "${TENANT_IMAGE_NAME}:${TAG_LATEST}"

View File

@@ -1,181 +0,0 @@
name: Railway pin audit (drift detection)
# Ported from .github/workflows/railway-pin-audit.yml on 2026-05-11 per
# RFC internal#219 §1 sweep.
#
# Differences from the GitHub version:
# - Dropped `workflow_dispatch:` (Gitea 1.22.6 trigger handling).
# Manual runs go via cron-trigger bump or push the workflow file
# itself.
# - `actions/github-script@v9` blocks (which call github.rest.* — a
# GitHub-specific JS API) replaced with curl calls against the
# Gitea REST API (/api/v1/repos/.../issues, .../labels,
# .../comments). Same behaviour: open issue on drift, comment on
# repeat-drift, close on clean run.
# - Workflow-level env.GITHUB_SERVER_URL set so the curl calls can
# derive `git.moleculesai.app` from the runner env (with
# hard-coded fallback inside the steps).
# - `continue-on-error: true` on the job (RFC §1 contract).
#
# Daily audit of Railway env vars for drift-prone image-tag pins —
# automation-cadence layer over the detection script + regression test
# shipped in PR #2168 (#2001 closure).
#
# Background: on 2026-04-24 a stale `:staging-a14cf86` SHA pin in CP's
# TENANT_IMAGE caused 3+ hours of E2E failure with the appearance that
# "every fix didn't propagate" — really the tenant image was so old it
# didn't read the env vars those fixes produced.
#
# Cadence: once a day, 13:00 UTC (06:00 PT).
#
# Secret hardening: per feedback_schedule_vs_dispatch_secrets_hardening,
# the schedule trigger HARD-FAILS on missing RAILWAY_AUDIT_TOKEN.
on:
schedule:
- cron: '0 13 * * *'
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
concurrency:
group: railway-pin-audit
cancel-in-progress: false
permissions:
issues: write
contents: read
jobs:
audit:
name: Audit Railway env vars for drift-prone pins
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
timeout-minutes: 10
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify RAILWAY_AUDIT_TOKEN present
env:
RAILWAY_AUDIT_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
id: secret_check
run: |
set -euo pipefail
if [ -n "${RAILWAY_AUDIT_TOKEN:-}" ]; then
echo "have_secret=true" >> "$GITHUB_OUTPUT"
exit 0
fi
echo "have_secret=false" >> "$GITHUB_OUTPUT"
echo "::error::RAILWAY_AUDIT_TOKEN secret missing — schedule trigger requires it. Provision the token (read-only \`variables\` scope on the molecule-platform Railway project) and store as repo secret RAILWAY_AUDIT_TOKEN."
exit 1
- name: Install Railway CLI
if: steps.secret_check.outputs.have_secret == 'true'
run: |
set -euo pipefail
curl -fsSL https://railway.com/install.sh | sh
echo "$HOME/.railway/bin" >> "$GITHUB_PATH"
- name: Verify Railway CLI authenticated
if: steps.secret_check.outputs.have_secret == 'true'
env:
RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
run: |
set -euo pipefail
if ! railway whoami >/dev/null 2>&1; then
echo "::error::Railway CLI failed to authenticate with RAILWAY_AUDIT_TOKEN — token may be revoked or scoped incorrectly"
exit 2
fi
- name: Link molecule-platform project
if: steps.secret_check.outputs.have_secret == 'true'
env:
RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
run: |
set -euo pipefail
railway link --project 7ccc8c68-61f4-42ab-9be5-586eeee11768
- name: Run drift audit
if: steps.secret_check.outputs.have_secret == 'true'
id: audit
env:
RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
run: |
set +e
bash scripts/ops/audit-railway-sha-pins.sh 2>&1 | tee /tmp/audit.log
rc=${PIPESTATUS[0]}
echo "rc=$rc" >> "$GITHUB_OUTPUT"
# Capture the audit log for the issue body.
{
echo 'log<<AUDIT_EOF'
cat /tmp/audit.log
echo 'AUDIT_EOF'
} >> "$GITHUB_OUTPUT"
case "$rc" in
0) exit 0 ;;
1) echo "::warning::Drift-prone pin(s) detected — issue will be filed"; exit 1 ;;
2) echo "::error::Railway CLI auth/link failed mid-script — token or project ID drift"; exit 2 ;;
*) echo "::error::Unexpected audit rc=$rc"; exit 1 ;;
esac
- name: Open / update drift issue (Gitea API)
if: failure() && steps.audit.outputs.rc == '1'
env:
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
AUDIT_LOG: ${{ steps.audit.outputs.log }}
SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
RUN_ID: ${{ github.run_id }}
run: |
set -euo pipefail
API="${SERVER_URL%/}/api/v1"
TITLE="Railway env-var drift detected"
RUN_URL="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
BODY=$(jq -nc --arg t "$TITLE" --arg log "${AUDIT_LOG:-(log unavailable)}" --arg run "$RUN_URL" '
{body: ("Daily Railway pin audit found drift-prone image-tag pins in the molecule-platform Railway project.\n\n**What this means:** an env var (likely on `controlplane`) is pinned to a SHA-shaped or semver tag instead of a floating tag. Same pattern that caused the 2026-04-24 TENANT_IMAGE incident — fix-PRs land but the running service does not pick them up.\n\n**Recovery:** open the Railway dashboard, replace the flagged value with a floating tag (:staging-latest, :main) unless the pin is intentional and documented in the ops runbook.\n\n**Audit output:**\n\n```\n" + $log + "\n```\n\nRun: " + $run + "\n\nCloses automatically when a subsequent daily run reports clean.")}')
# Look for existing open drift issue with the title.
EXISTING=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
"${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
| jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number' | head -1)
if [ -n "$EXISTING" ]; then
COMMENT_BODY=$(jq -nc --arg log "${AUDIT_LOG:-(log unavailable)}" --arg run "$RUN_URL" \
'{body: ("Still drifting. " + $run + "\n\n```\n" + $log + "\n```")}')
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
"${API}/repos/${REPO}/issues/${EXISTING}/comments" -d "$COMMENT_BODY" >/dev/null
echo "Commented on existing issue #${EXISTING}"
else
CREATE_BODY=$(echo "$BODY" | jq --arg t "$TITLE" '. + {title: $t, labels: []}')
NUM=$(curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
"${API}/repos/${REPO}/issues" -d "$CREATE_BODY" | jq -r .number)
echo "Filed issue #${NUM}"
fi
- name: Close stale drift issue on clean run (Gitea API)
if: success() && steps.audit.outputs.rc == '0'
env:
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
RUN_ID: ${{ github.run_id }}
run: |
set -euo pipefail
API="${SERVER_URL%/}/api/v1"
TITLE="Railway env-var drift detected"
RUN_URL="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
NUMS=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
"${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
| jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number')
for N in $NUMS; do
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
"${API}/repos/${REPO}/issues/${N}/comments" \
-d "$(jq -nc --arg run "$RUN_URL" '{body: ("Daily audit clean — drift resolved. " + $run)}')" >/dev/null
curl -fsS -X PATCH -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
"${API}/repos/${REPO}/issues/${N}" -d '{"state":"closed"}' >/dev/null
echo "Closed #${N}"
done

View File

@@ -1,375 +0,0 @@
name: redeploy-tenants-on-main
# Ported from .github/workflows/redeploy-tenants-on-main.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
# - **Gitea workflow_run trigger limitation**: Gitea 1.22.6's support
# for the `workflow_run` event is partial. If this never fires on a
# real publish-workspace-server-image completion, the follow-up
# triage PR should replace the trigger with a push-with-paths-filter
# on .gitea/workflows/publish-workspace-server-image.yml. Until
# then continue-on-error+dead-workflow doesn't break anything.
#
# Auto-refresh prod tenant EC2s after every main merge.
#
# Why this workflow exists: publish-workspace-server-image builds and
# pushes a new platform-tenant :<sha> to ECR on every merge to main,
# but running tenants pulled their image once at boot and never re-pull.
# Users see stale code indefinitely.
#
# This workflow closes the gap by calling the control-plane admin
# endpoint that performs a canary-first, batched, health-gated rolling
# redeploy across every live tenant. Implemented in molecule-ai/
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
# (feat/tenant-auto-redeploy, landing alongside this workflow).
#
# Registry: ECR (153263036946.dkr.ecr.us-east-2.amazonaws.com/
# molecule-ai/platform-tenant). GHCR was retired 2026-05-07 during the
# Gitea suspension migration. The staging-verify.yml promote step now
# uses the same redeploy-fleet endpoint (fixes the silent-GHCR gap).
#
# Runtime ordering:
# 1. publish-workspace-server-image completes → new :staging-<sha> in ECR.
# 2. This workflow fires via workflow_run, calls redeploy-fleet with
# target_tag=staging-<sha>. No CDN propagation wait needed —
# ECR image manifest is consistent immediately after push.
# 3. Calls redeploy-fleet with canary_slug (if set) and a soak
# period. Canary proves the image boots; batches follow.
# 4. Any failure aborts the rollout and leaves older tenants on the
# prior image — safer default than half-and-half state.
#
# Rollback path (GitHub version): re-run with workflow_dispatch and a
# pinned target_tag=<sha>, re-pulling the older image on every tenant.
# NOTE (Gitea port): workflow_dispatch was dropped, so until it is
# restored the equivalent rollback is POSTing redeploy-fleet directly
# with target_tag=<sha> (the same call the step below makes).
on:
workflow_run:
workflows: ['publish-workspace-server-image']
types: [completed]
branches: [main]
permissions:
contents: read
# No write scopes needed — the workflow hits an external CP endpoint,
# not the GitHub API.
# Serialize redeploys so two rapid main pushes' redeploys don't overlap
# and cause confusing per-tenant SSM state. Without this, GitHub's
# implicit workflow_run queueing would *probably* serialize them, but
# the explicit block makes the invariant defensible. Mirrors the
# concurrency block on redeploy-tenants-on-staging.yml for shape parity.
#
# cancel-in-progress: false → aborting a half-rolled-out fleet would
# leave tenants stuck on whatever image they happened to be on when
# cancelled. Better to finish the in-flight rollout before starting
# the next one.
concurrency:
group: redeploy-tenants-on-main
cancel-in-progress: false
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
redeploy:
# Skip the auto-trigger if publish-workspace-server-image didn't
# actually succeed. workflow_run fires on any completion state; we
# don't want to redeploy against a half-built image.
# NOTE (Gitea port): workflow_dispatch trigger dropped; only the
# workflow_run path remains.
if: ${{ github.event.workflow_run.conclusion == 'success' }}
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
timeout-minutes: 25
steps:
- name: Note on ECR propagation
# ECR image manifests are consistent immediately after push — no
# CDN cache to wait for. The old GHCR-based workflow had a 30s
# sleep to avoid race conditions; ECR makes that unnecessary.
run: echo "ECR image available immediately after push — proceeding."
- name: Compute target tag
id: tag
# Resolution order:
# 1. Operator-supplied input (workflow_dispatch with explicit
# tag) → used verbatim. Lets ops pin `latest` for emergency
# rollback to last canary-verified digest, or pin a specific
# `staging-<sha>` to roll back to a known-good build.
# 2. Default → `staging-<short_head_sha>`. The just-published
# digest. Bypasses the `:latest` retag path that's currently
# dead (staging-verify soft-skips without canary fleet, so
# the only thing retagging `:latest` today is the manual
# promote-latest.yml — last run 2026-04-28). Auto-trigger
# from workflow_run uses workflow_run.head_sha; manual
# dispatch with no input falls through to github.sha.
env:
INPUT_TAG: ${{ inputs.target_tag }}
HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
run: |
set -euo pipefail
if [ -n "${INPUT_TAG:-}" ]; then
echo "target_tag=$INPUT_TAG" >> "$GITHUB_OUTPUT"
echo "Using operator-pinned tag: $INPUT_TAG"
else
SHORT="${HEAD_SHA:0:7}"
echo "target_tag=staging-$SHORT" >> "$GITHUB_OUTPUT"
echo "Using auto tag: staging-$SHORT (head_sha=$HEAD_SHA)"
fi
- name: Call CP redeploy-fleet
# CP_ADMIN_API_TOKEN must be set as a repo/org secret on
# molecule-ai/molecule-core, matching the staging/prod CP's
# CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this
# repo's secrets for CI.
env:
CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }}
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
TARGET_TAG: ${{ steps.tag.outputs.target_tag }}
CANARY_SLUG: ${{ inputs.canary_slug || 'hongming' }}
SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
BATCH_SIZE: ${{ inputs.batch_size || '3' }}
DRY_RUN: ${{ inputs.dry_run || false }}
run: |
set -euo pipefail
if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy"
echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
exit 1
fi
BODY=$(jq -nc \
--arg tag "$TARGET_TAG" \
--arg canary "$CANARY_SLUG" \
--argjson soak "$SOAK_SECONDS" \
--argjson batch "$BATCH_SIZE" \
--argjson dry "$DRY_RUN" \
'{
target_tag: $tag,
canary_slug: $canary,
soak_seconds: $soak,
batch_size: $batch,
dry_run: $dry
}')
echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
echo " body: $BODY"
HTTP_RESPONSE=$(mktemp)
HTTP_CODE_FILE=$(mktemp)
# Route -w into its own tempfile so curl's exit code (e.g. 56
# on connection-reset, 22 on --fail-with-body 4xx/5xx) can't
# pollute the captured stdout. The previous inline-substitution
# shape produced "000000" on connection reset (curl wrote
# "000" via -w, then the inline echo-fallback appended another
# "000") — caught on the 2026-05-04 redeploy of sha 2b862f6.
# set +e/-e keeps the non-zero curl exit from tripping the
# outer pipeline. See lint-curl-status-capture.yml for the
# CI gate that pins this fix shape.
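# Broken shape, for reference:
#   HTTP_CODE=$(curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' ... || echo "000")
# On connection reset curl still writes "000" via -w, THEN the ||
# fallback appends another "000" inside the same substitution.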
set +e
curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
-m 1200 \
-H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
-H "Content-Type: application/json" \
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
-d "$BODY" >"$HTTP_CODE_FILE"
set -e
# Stderr from curl (e.g. dial errors with -sS) goes to the runner
# log so operators can see WHY a connection failed. Stdout is
# captured to $HTTP_CODE_FILE because that's where -w writes.
HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
[ -z "$HTTP_CODE" ] && HTTP_CODE="000"
echo "HTTP $HTTP_CODE"
cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
# Pretty-print per-tenant results in the job summary so
# ops can see which tenants were redeployed without drilling
# into the raw response.
{
echo "## Tenant redeploy fleet"
echo ""
echo "**Target tag:** \`$TARGET_TAG\`"
echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)"
echo "**Batch size:** $BATCH_SIZE"
echo "**Dry run:** $DRY_RUN"
echo "**HTTP:** $HTTP_CODE"
echo ""
echo "### Per-tenant result"
echo ""
echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
echo '|------|-------|------------|------|---------|-------|'
jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
} >> "$GITHUB_STEP_SUMMARY"
if [ "$HTTP_CODE" != "200" ]; then
echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
exit 1
fi
OK=$(jq -r '.ok' "$HTTP_RESPONSE")
if [ "$OK" != "true" ]; then
echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
exit 1
fi
echo "::notice::Tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."
# Stash the response for the verify step. $RUNNER_TEMP outlasts
# the step boundary; $HTTP_RESPONSE doesn't.
cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"
- name: Verify each tenant /buildinfo matches published SHA
# ROOT FIX FOR #2395.
#
# `redeploy-fleet`'s `ssm_status=Success` means "the SSM RPC
# didn't error" — NOT "the new image is running on the tenant."
# `:latest` lives in the local Docker daemon's image cache; if
# the SSM document does `docker compose up -d` without an
# explicit `docker pull`, the daemon serves the previously-
# cached digest and the container restarts on stale code.
# 2026-04-30 incident: hongmingwang's tenant reported
# ssm_status=Success at 17:00:53Z but kept serving pre-501a42d7
# chat_files for 30+ min — the lazy-heal fix never reached the
# user despite green deploy + green redeploy.
#
# This step closes the gap by curling each tenant's /buildinfo
# endpoint (added in workspace-server/internal/buildinfo +
# /Dockerfile* GIT_SHA build-arg, this PR) and comparing the
# returned git_sha to the SHA the workflow expects. Mismatches
# fail the workflow, which is what `ok=true` should have
# guaranteed all along.
#
# When the redeploy was triggered with an operator-pinned tag
# (target_tag != "latest"), the expected SHA may not equal
# ${{ github.sha }}; in that case verification is skipped (resolving
# the tag's digest via the registry manifest is a follow-up, see the
# skip branch below). For workflow_run (the default) the
# workflow_run.head_sha is the SHA that just published.
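# Response shape this step relies on (only git_sha is read; the
# value shown is illustrative):
#   GET https://<slug>.moleculesai.app/buildinfo
#   {"git_sha": "2b862f6..."}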
env:
EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
TARGET_TAG: ${{ steps.tag.outputs.target_tag }}
# Tenant subdomain template — slugs from the response are
# appended. Production CP issues `<slug>.moleculesai.app`;
# staging CP issues `<slug>.staging.moleculesai.app`. This
# workflow runs on main → prod CP → no `staging.` infix.
TENANT_DOMAIN: 'moleculesai.app'
run: |
set -euo pipefail
EXPECTED_SHORT="${EXPECTED_SHA:0:7}"
if [ "$TARGET_TAG" != "latest" ] \
&& [ "$TARGET_TAG" != "$EXPECTED_SHA" ] \
&& [ "$TARGET_TAG" != "staging-$EXPECTED_SHORT" ]; then
# workflow_dispatch with a pinned tag that isn't the head
# SHA — operator is rolling back / pinning. Skip the
# verification because we don't have the expected SHA in
# this context (would need to crane-inspect the ECR
# manifest, which is a follow-up). Failing-open here is
# safe: the operator chose the tag deliberately.
#
# `staging-<short_head_sha>` IS verified — it's the new
# auto-trigger default (see Compute target tag step) and
# the digest under that tag SHOULD match EXPECTED_SHA.
echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
exit 0
fi
RESP="$RUNNER_TEMP/redeploy-response.json"
if [ ! -s "$RESP" ]; then
echo "::error::redeploy-response.json missing or empty — verify step ran without a response to read"
exit 1
fi
# Pull only successfully-redeployed tenants. Any tenant that
# halted the rollout already failed the previous step, so we
# don't double-count them here.
mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
if [ ${#SLUGS[@]} -eq 0 ]; then
echo "::warning::No tenants reported healthz_ok — nothing to verify"
exit 0
fi
echo "Verifying ${#SLUGS[@]} tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."
# Two distinct failure modes — STALE (the #2395 bug class, hard-fail)
# vs UNREACHABLE (teardown race, soft-warn). See the staging variant's
# comment for the full rationale; same logic applies on prod even
# though prod has fewer ephemeral tenants — the asymmetry would be a
# gratuitous fork.
STALE_COUNT=0
UNREACHABLE_COUNT=0
STALE_LINES=()
UNREACHABLE_LINES=()
for slug in "${SLUGS[@]}"; do
URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
# 30s total: tenant just SSM-restarted, may still be coming
# up. Retry-on-empty rather than retry-on-status — we want
# to fail fast on "responded with wrong SHA", not "still
# warming up".
BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
if [ -z "$ACTUAL_SHA" ]; then
UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
UNREACHABLE_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ⚠ unreachable (likely teardown race) |")
continue
fi
if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
echo " $slug: ${ACTUAL_SHA:0:7} ✓"
else
STALE_COUNT=$((STALE_COUNT + 1))
STALE_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
fi
done
{
echo ""
echo "### Per-tenant /buildinfo verification"
echo ""
echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
echo ""
if [ $STALE_COUNT -gt 0 ]; then
echo "**${STALE_COUNT} STALE tenant(s) — these did NOT pick up the new image despite ssm_status=Success:**"
echo ""
echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
echo "|------|----------------------|----------|--------|"
for line in "${STALE_LINES[@]}"; do echo "$line"; done
echo ""
fi
if [ $UNREACHABLE_COUNT -gt 0 ]; then
echo "**${UNREACHABLE_COUNT} unreachable tenant(s) — likely teardown race (soft-warn, not failing):**"
echo ""
echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
echo "|------|----------------------|----------|--------|"
for line in "${UNREACHABLE_LINES[@]}"; do echo "$line"; done
echo ""
fi
if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
echo "All ${#SLUGS[@]} tenants returned matching SHA. ✓"
fi
} >> "$GITHUB_STEP_SUMMARY"
if [ $UNREACHABLE_COUNT -gt 0 ]; then
echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
fi
# Belt-and-suspenders sanity floor: same logic as the staging
# variant — see that file's comment for the full rationale.
# Floor only applies when fleet >= 4; below that, staging-verify
# is the actual gate.
TOTAL_VERIFIED=${#SLUGS[@]}
if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
exit 1
fi
if [ $STALE_COUNT -gt 0 ]; then
echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
exit 1
fi
echo "::notice::Tenant fleet redeploy complete — all reachable tenants on ${EXPECTED_SHA:0:7} (${UNREACHABLE_COUNT} unreachable, soft-warned)."

View File

@@ -1,356 +0,0 @@
name: redeploy-tenants-on-staging
# Ported from .github/workflows/redeploy-tenants-on-staging.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
# - **Gitea workflow_run trigger limitation**: Gitea 1.22.6's support
# for the `workflow_run` event is partial. If this never fires on a
# real publish-workspace-server-image completion, the follow-up
# triage PR should replace the trigger with a push-with-paths-filter
# on .gitea/workflows/publish-workspace-server-image.yml. Until
# then continue-on-error+dead-workflow doesn't break anything.
#
# Auto-refresh staging tenant EC2s after every staging-branch merge.
#
# Mirror of redeploy-tenants-on-main.yml, with the staging-CP host and
# the :staging-latest tag. Sister workflow exists for prod (rolls
# :latest after staging-verify). Both share the same shape — just
# different CP_URL + target_tag + admin token secret.
#
# Why this workflow exists: publish-workspace-server-image now builds
# on every staging-branch push (PR #2335), pushing
# platform-tenant:staging-latest to ECR (GHCR was retired 2026-05-07;
# see the main variant's header). Existing tenants pulled their image
# once at boot and never re-pull, so the new image just sits unused
# until the tenant is reprovisioned.
#
# This workflow closes the gap by calling staging-CP's
# /cp/admin/tenants/redeploy-fleet, which performs a canary-first,
# batched, health-gated SSM redeploy across every live staging tenant.
# Same endpoint shape as prod CP — only the host differs.
#
# Runtime ordering:
# 1. publish-workspace-server-image completes on staging branch →
# new :staging-latest in ECR.
# 2. This workflow fires via workflow_run. No CDN propagation wait
# needed (ECR manifests are consistent immediately after push).
# 3. Calls redeploy-fleet with no canary (staging IS canary; we don't
# need a sub-canary inside it). Soak still applies to the first
# tenant in case of bad-deploy detection.
# 4. Any failure aborts the rollout and leaves older tenants on the
# prior image — safer default than half-and-half state.
#
# Rollback path (GitHub version): re-run with workflow_dispatch +
# target_tag=staging-<sha> of a known-good build. NOTE (Gitea port):
# workflow_dispatch was dropped; until it is restored, the equivalent
# rollback is POSTing redeploy-fleet directly with that tag.
on:
workflow_run:
workflows: ['publish-workspace-server-image']
types: [completed]
# Staging variant listens for publish runs off the staging branch
# (the main variant filters on main).
branches: [staging]
permissions:
contents: read
# No write scopes needed — the workflow hits an external CP endpoint,
# not the GitHub API.
# Serialize per-branch so two rapid staging pushes' redeploys don't
# overlap and cause confusing per-tenant SSM state. cancel-in-progress
# is false because aborting a half-rolled-out fleet leaves tenants
# stuck on whatever image they happened to be on when cancelled.
concurrency:
group: redeploy-tenants-on-staging
cancel-in-progress: false
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
redeploy:
# Skip the auto-trigger if publish-workspace-server-image didn't
# actually succeed. workflow_run fires on any completion state; we
# don't want to redeploy against a half-built image.
# NOTE (Gitea port): workflow_dispatch trigger dropped; only the
# workflow_run path remains.
if: ${{ github.event.workflow_run.conclusion == 'success' }}
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
timeout-minutes: 25
steps:
- name: Note on ECR propagation
# ECR image manifests are consistent immediately after push; the
# old GHCR-based workflow slept 30s here for edge-cache propagation.
# The main variant already dropped that wait and this mirror matches it.
run: echo "ECR image available immediately after push — proceeding."
- name: Call staging-CP redeploy-fleet
# CP_STAGING_ADMIN_API_TOKEN must be set as a repo/org secret
# on molecule-ai/molecule-core, matching staging-CP's
# CP_ADMIN_API_TOKEN env var (visible in Railway controlplane
# / staging environment). Stored separately from the prod
# CP_ADMIN_API_TOKEN so a leak of one doesn't auth the other.
env:
CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
TARGET_TAG: ${{ inputs.target_tag || 'staging-latest' }}
CANARY_SLUG: ${{ inputs.canary_slug || '' }}
SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
BATCH_SIZE: ${{ inputs.batch_size || '3' }}
DRY_RUN: ${{ inputs.dry_run || false }}
run: |
set -euo pipefail
# Schedule-vs-dispatch hardening (mirrors sweep-cf-orphans
# and sweep-cf-tunnels): hard-fail on auto-trigger when the
# secret is missing so a misconfigured-repo doesn't silently
# serve stale staging tenants. Soft-skip on operator dispatch.
if [ -z "${CP_STAGING_ADMIN_API_TOKEN:-}" ]; then
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "::warning::CP_STAGING_ADMIN_API_TOKEN secret not set — skipping redeploy"
echo "::warning::Set CP_STAGING_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
echo "::notice::Pull the value from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
exit 0
fi
echo "::error::staging redeploy cannot run — CP_STAGING_ADMIN_API_TOKEN secret missing"
echo "::error::set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
exit 1
fi
BODY=$(jq -nc \
--arg tag "$TARGET_TAG" \
--arg canary "$CANARY_SLUG" \
--argjson soak "$SOAK_SECONDS" \
--argjson batch "$BATCH_SIZE" \
--argjson dry "$DRY_RUN" \
'{
target_tag: $tag,
canary_slug: $canary,
soak_seconds: $soak,
batch_size: $batch,
dry_run: $dry
}')
echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
echo " body: $BODY"
HTTP_RESPONSE=$(mktemp)
HTTP_CODE_FILE=$(mktemp)
# Route -w into its own tempfile so curl's exit code (e.g. 56
# on connection-reset) can't pollute the captured stdout. The
# previous inline-substitution shape produced "000000" on
# connection reset — caught on main variant 2026-05-04
# redeploying sha 2b862f6. Same fix shape as the synth-E2E
# §9c gate (PR #2797). See lint-curl-status-capture.yml for
# the CI gate that pins this fix shape.
set +e
curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
-m 1200 \
-H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \
-H "Content-Type: application/json" \
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
-d "$BODY" >"$HTTP_CODE_FILE"
set -e
# Stderr from curl (-sS shows dial errors etc.) goes to the
# runner log so operators can see WHY a connection failed.
HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
[ -z "$HTTP_CODE" ] && HTTP_CODE="000"
echo "HTTP $HTTP_CODE"
cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
{
echo "## Staging tenant redeploy fleet"
echo ""
echo "**Target tag:** \`$TARGET_TAG\`"
echo "**Canary:** \`${CANARY_SLUG:-(none — staging is itself the canary)}\` (soak ${SOAK_SECONDS}s)"
echo "**Batch size:** $BATCH_SIZE"
echo "**Dry run:** $DRY_RUN"
echo "**HTTP:** $HTTP_CODE"
echo ""
echo "### Per-tenant result"
echo ""
echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
echo '|------|-------|------------|------|---------|-------|'
jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
} >> "$GITHUB_STEP_SUMMARY"
# Distinguish "real fleet failure" from "E2E teardown race".
#
# CP returns HTTP 500 + ok=false whenever ANY tenant in the
# fleet failed SSM or healthz. In practice the recurring source
# of these is ephemeral test tenants being torn down by their
# parent E2E run mid-redeploy: the EC2 dies → SSM exit=2 or
# healthz timeout → CP marks the fleet failed → this workflow
# goes red even though every operator-facing tenant rolled fine.
#
# Ephemeral slug prefixes (kept in sync with sweep-stale-e2e-orgs.yml
# — see that file for the source-of-truth list and rationale):
# - e2e-* — canvas/saas/ext E2E suites
# - rt-e2e-* — runtime-test harness fixtures (RFC #2251)
# Long-lived prefixes that are NOT ephemeral and MUST hard-fail:
# demo-prep, dryrun-*, dryrun2-*, plus all human tenant slugs.
#
# Filter: if HTTP=500/ok=false AND every failed slug matches an
# ephemeral prefix, treat as soft-warn and let the verify step
# downstream handle unreachable-vs-stale (#2402). Any non-ephemeral
# failure or a non-500 HTTP response remains a hard failure.
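# Classification examples (slugs illustrative):
#   e2e-canvas-7f3a → ephemeral  (soft-warn on failure)
#   rt-e2e-fixture1 → ephemeral  (soft-warn on failure)
#   demo-prep       → long-lived (hard-fail)
#   hongming        → long-lived (hard-fail)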
OK=$(jq -r '.ok // "false"' "$HTTP_RESPONSE")
FAILED_SLUGS=$(jq -r '
.results[]?
| select((.healthz_ok != true) or (.ssm_status != "Success"))
| .slug' "$HTTP_RESPONSE" 2>/dev/null || true)
EPHEMERAL_PREFIX_RE='^(e2e-|rt-e2e-)'
NON_EPHEMERAL_FAILED=$(printf '%s\n' "$FAILED_SLUGS" | grep -v '^$' | grep -Ev "$EPHEMERAL_PREFIX_RE" || true)
if [ "$HTTP_CODE" = "200" ] && [ "$OK" = "true" ]; then
: # happy path — fall through to verification
elif [ "$HTTP_CODE" = "500" ] && [ -z "$NON_EPHEMERAL_FAILED" ] && [ -n "$FAILED_SLUGS" ]; then
COUNT=$(printf '%s\n' "$FAILED_SLUGS" | grep -Ec "$EPHEMERAL_PREFIX_RE" || true)
echo "::warning::redeploy-fleet returned HTTP 500 but every failed tenant ($COUNT) is ephemeral (e2e-*/rt-e2e-*) — treating as teardown race, soft-warning."
printf '%s\n' "$FAILED_SLUGS" | sed 's/^/::warning:: failed: /'
elif [ "$HTTP_CODE" != "200" ]; then
echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
if [ -n "$NON_EPHEMERAL_FAILED" ]; then
echo "::error::non-ephemeral tenant(s) failed:"
printf '%s\n' "$NON_EPHEMERAL_FAILED" | sed 's/^/::error:: /'
fi
exit 1
else
# HTTP=200 but ok=false (shouldn't happen with current CP
# but keep the gate for completeness).
echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
exit 1
fi
echo "::notice::Staging tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."
cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"
- name: Verify each staging tenant /buildinfo matches published SHA
# Mirror of the verify step in redeploy-tenants-on-main.yml — see
# there for the rationale (#2395 root fix). Staging has the same
# ssm_status-success-but-stale-image hazard and benefits from the
# same gate. Diff: TENANT_DOMAIN includes the `staging.` infix.
env:
EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
TARGET_TAG: ${{ inputs.target_tag || 'staging-latest' }}
TENANT_DOMAIN: 'staging.moleculesai.app'
run: |
set -euo pipefail
# staging-latest is the staging-side moving tag; treat it the
# same way main treats `latest`. Operator-pinned SHAs skip
# verification (see main variant for why).
if [ "$TARGET_TAG" != "staging-latest" ] && [ "$TARGET_TAG" != "latest" ] && [ "$TARGET_TAG" != "$EXPECTED_SHA" ]; then
echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
exit 0
fi
RESP="$RUNNER_TEMP/redeploy-response.json"
if [ ! -s "$RESP" ]; then
echo "::error::redeploy-response.json missing or empty"
exit 1
fi
mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
if [ ${#SLUGS[@]} -eq 0 ]; then
echo "::warning::No staging tenants reported healthz_ok — nothing to verify"
exit 0
fi
echo "Verifying ${#SLUGS[@]} staging tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."
# Two distinct failure modes here:
# STALE_COUNT — tenant returned a SHA that doesn't match. THIS is
# the #2395 bug class: tenant up + serving old code.
# Always hard-fail the workflow.
# UNREACHABLE_COUNT — tenant didn't respond. Almost always a benign
# teardown race: redeploy-fleet snapshot says
# healthz_ok=true, then the E2E suite tears the
# ephemeral tenant down before this step runs (the
# e2e-* fixtures churn 5-10/hour on staging). Soft-
# warn so we don't block staging→main on cleanup.
# Real "tenant up but unreachable" is caught by CP's
# own healthz monitor + the post-redeploy alert; we
# don't need to double-count it here.
STALE_COUNT=0
UNREACHABLE_COUNT=0
STALE_LINES=()
UNREACHABLE_LINES=()
for slug in "${SLUGS[@]}"; do
URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
if [ -z "$ACTUAL_SHA" ]; then
UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
UNREACHABLE_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ⚠ unreachable (likely teardown race) |")
continue
fi
if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
echo " $slug: ${ACTUAL_SHA:0:7} ✓"
else
STALE_COUNT=$((STALE_COUNT + 1))
STALE_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
fi
done
{
echo ""
echo "### Per-tenant /buildinfo verification (staging)"
echo ""
echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
echo ""
if [ $STALE_COUNT -gt 0 ]; then
echo "**${STALE_COUNT} STALE tenant(s) — these did NOT pick up the new image despite ssm_status=Success:**"
echo ""
echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
echo "|------|----------------------|----------|--------|"
for line in "${STALE_LINES[@]}"; do echo "$line"; done
echo ""
fi
if [ $UNREACHABLE_COUNT -gt 0 ]; then
echo "**${UNREACHABLE_COUNT} unreachable tenant(s) — likely E2E teardown race (soft-warn, not failing):**"
echo ""
echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
echo "|------|----------------------|----------|--------|"
for line in "${UNREACHABLE_LINES[@]}"; do echo "$line"; done
echo ""
fi
if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
echo "All ${#SLUGS[@]} staging tenants returned matching SHA. ✓"
fi
} >> "$GITHUB_STEP_SUMMARY"
if [ $UNREACHABLE_COUNT -gt 0 ]; then
echo "::warning::$UNREACHABLE_COUNT staging tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
fi
# Belt-and-suspenders sanity floor: if MORE than half the fleet is
# unreachable AND the fleet is large enough that "half down" is
# statistically meaningful, this is a real outage (e.g. new image
# crashes on startup), not a teardown race. Hard-fail.
#
# Floor only applies when TOTAL_VERIFIED >= 4 — below that, the
# staging-verify step is the actual gate for "all tenants down"
# detection (it runs against the canary first and aborts the
# rollout if the canary fails to come up). Without the >=4 gate,
# a 1-tenant fleet (e.g. a single ephemeral e2e-* tenant on a
# quiet staging push) would re-flake on the exact teardown-race
# condition #2402 fixed: 1 of 1 unreachable = 100% > 50% → fail.
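# Worked example of the floor: 4-tenant fleet → threshold is
# 4/2 = 2; 3 unreachable (3 > 2) hard-fails, while 2 unreachable
# (2 > 2 is false) only soft-warns.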
TOTAL_VERIFIED=${#SLUGS[@]}
if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
exit 1
fi
if [ $STALE_COUNT -gt 0 ]; then
echo "::error::$STALE_COUNT staging tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
exit 1
fi
echo "::notice::Staging tenant fleet redeploy complete — all reachable tenants on ${EXPECTED_SHA:0:7} (${UNREACHABLE_COUNT} unreachable, soft-warned)."

View File

@@ -1,100 +0,0 @@
name: Runtime Pin Compatibility
# Ported from .github/workflows/runtime-pin-compat.yml on 2026-05-11 per
# RFC internal#219 §1 sweep.
#
# Differences from the GitHub version:
# - Dropped `merge_group:` (no Gitea merge queue) and
# `workflow_dispatch:` (the 1.22.6 parser rejects dispatch inputs
# outright, and some 1.22.x builds are finicky even with a bare
# dispatch trigger; safest to drop entirely — manual runs go via
# cron-trigger bump or push-with-paths-filter).
# - on.paths references .gitea/workflows/runtime-pin-compat.yml (this
# file) instead of the .github/ one.
# - Workflow-level env.GITHUB_SERVER_URL set.
# - `continue-on-error: true` on the job (RFC §1 contract).
#
# CI gate that prevents the 5-hour staging outage from 2026-04-24 from
# recurring (controlplane#253). The original failure mode:
# 1. molecule-ai-workspace-runtime 0.1.13 declared `a2a-sdk<1.0` in its
# requires_dist metadata (incorrect — it actually imports
# a2a.server.routes which only exists in a2a-sdk 1.0+)
# 2. `pip install molecule-ai-workspace-runtime` resolved cleanly
# 3. `from molecule_runtime.main import main_sync` raised ImportError
# 4. Every tenant workspace crashed; the canary tenant caught it but
# only after 5 hours of degraded staging
#
# This workflow installs the CURRENTLY PUBLISHED runtime from PyPI on
# top of `workspace/requirements.txt` and smoke-imports. Catches:
# - Upstream PyPI yanks
# - Bad re-releases of molecule-ai-workspace-runtime
# - Already-shipped wheels that stop importing because a transitive
# dep moved underneath
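#
# The failure class this gate catches, in miniature (values
# illustrative of the 2026-04-24 incident):
#   requires_dist:  a2a-sdk<1.0       → pip resolves cleanly
#   actual import:  a2a.server.routes → exists only in a2a-sdk>=1.0
# pip validates METADATA, not import graphs; only a real import
# (the smoke step below) catches the mismatch.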
on:
push:
branches: [main, staging]
paths:
# Narrow filter: pypi-latest is sensitive only to changes that
# affect what we're INSTALLING (requirements.txt) or WHAT THE
# CHECK ITSELF DOES (this workflow file). Edits to workspace/
# source code don't change what's on PyPI right now, so they
# don't change this gate's verdict.
- 'workspace/requirements.txt'
- '.gitea/workflows/runtime-pin-compat.yml'
pull_request:
branches: [main, staging]
paths:
- 'workspace/requirements.txt'
- '.gitea/workflows/runtime-pin-compat.yml'
# Daily catch for upstream PyPI publishes that break the pin combo
# without any change in our repo (e.g. someone re-yanks an a2a-sdk
# release or molecule-ai-workspace-runtime publishes a bad bump).
schedule:
- cron: '0 13 * * *' # 06:00 PT
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
pypi-latest-install:
name: PyPI-latest install + import smoke
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
# the PR. Follow-up PR flips this off after surfaced defects are
# triaged.
continue-on-error: true
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.11'
cache: pip
cache-dependency-path: workspace/requirements.txt
- name: Install runtime + workspace requirements
# Install order is load-bearing: install the runtime FIRST so pip
# honors whatever a2a-sdk constraint the runtime metadata declares
# (this is the surface that broke in 2026-04-24 — runtime declared
# `a2a-sdk<1.0` but actually needed >=1.0). The follow-up install
# of workspace/requirements.txt then upgrades a2a-sdk to the
# constraint our runtime image actually pins. The import smoke
# below verifies the upgraded combination is consistent.
run: |
python -m venv /tmp/venv
/tmp/venv/bin/pip install --upgrade pip
/tmp/venv/bin/pip install molecule-ai-workspace-runtime
/tmp/venv/bin/pip install -r workspace/requirements.txt
/tmp/venv/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
| grep -E '^(Name|Version):'
- name: Smoke import — fail if metadata declares deps that don't satisfy real imports
# WORKSPACE_ID is validated at import time by platform_auth.py — EC2
# user-data sets it from the cloud-init template; set a placeholder
# here so the import smoke doesn't trip on the env-var guard.
env:
WORKSPACE_ID: 00000000-0000-0000-0000-000000000001
run: |
/tmp/venv/bin/python -c "from molecule_runtime.main import main_sync; print('runtime imports OK')"

View File

@ -1,139 +0,0 @@
name: Runtime PR-Built Compatibility
# Ported from .github/workflows/runtime-prbuild-compat.yml on 2026-05-11
# per RFC internal#219 §1 sweep.
#
# Differences from the GitHub version:
# - Dropped `merge_group:` (no Gitea merge queue) and `workflow_dispatch:`
# (Gitea 1.22.6 parser-rejects workflow_dispatch with inputs and is
# finicky without them).
# - `dorny/paths-filter@v4` replaced with inline `git diff` (per PR#372
# pattern for ci.yml port).
# - on.paths references .gitea/workflows/runtime-prbuild-compat.yml.
# - Workflow-level env.GITHUB_SERVER_URL set.
# - `continue-on-error: true` on every job (RFC §1 contract).
#
# Companion to `runtime-pin-compat.yml`. That workflow tests what's
# CURRENTLY PUBLISHED on PyPI; this workflow tests what WOULD BE
# PUBLISHED if THIS PR merges.
#
# Why two workflows: the chicken-and-egg #128 fix added a "PR-built
# wheel" job to the original runtime-pin-compat.yml, but both jobs
# shared a `paths:` filter that was the union of their needs
# (`workspace/**`). That meant the PyPI-latest job ran on every doc
# edit even though the upstream PyPI artifact can't change with our
# workspace/ source. Splitting the two means each gets a narrow
# `paths:` filter that matches the inputs it actually depends on.
#
# Catches the failure mode where a PR adds an import requiring a newer
# SDK than `workspace/requirements.txt` pins:
# 1. Pip resolves the existing PyPI wheel + the old SDK pin -> smoke
# passes (it imports the OLD main.py from the wheel, not the PR's
# new main.py).
# 2. Merge -> publish-runtime.yml ships a wheel WITH the new import.
# 3. Tenant images redeploy -> all crash on first boot with ImportError.
on:
push:
branches: [main, staging]
pull_request:
branches: [main, staging]
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
concurrency:
# event_name + sha keeps PR sync and the subsequent staging push on the
# same SHA from cancelling each other (per feedback_concurrency_group_per_sha).
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.pull_request.head.sha || github.sha }}
cancel-in-progress: true
jobs:
detect-changes:
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
outputs:
wheel: ${{ steps.decide.outputs.wheel }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
- id: decide
run: |
# Inline replacement for dorny/paths-filter — same pattern
# PR#372's ci.yml port used. Diffs against the PR base or the
# previous push SHA, then matches against the wheel-relevant
# path set.
BASE="${GITHUB_BASE_REF:-${{ github.event.before }}}"
if [ "${{ github.event_name }}" = "pull_request" ] && [ -n "${{ github.event.pull_request.base.sha }}" ]; then
BASE="${{ github.event.pull_request.base.sha }}"
fi
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$'; then
# New branch or no previous SHA: treat as wheel-relevant.
echo "wheel=true" >> "$GITHUB_OUTPUT"
exit 0
fi
if ! git cat-file -e "$BASE" 2>/dev/null; then
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
fi
if ! git cat-file -e "$BASE" 2>/dev/null; then
echo "wheel=true" >> "$GITHUB_OUTPUT"
exit 0
fi
CHANGED=$(git diff --name-only "$BASE" HEAD)
if echo "$CHANGED" | grep -qE '^(workspace/|scripts/build_runtime_package\.py$|scripts/wheel_smoke\.py$|\.gitea/workflows/runtime-prbuild-compat\.yml$)'; then
echo "wheel=true" >> "$GITHUB_OUTPUT"
else
echo "wheel=false" >> "$GITHUB_OUTPUT"
fi
# ONE job (no job-level `if:`) that always runs and reports under the
# required-check name `PR-built wheel + import smoke`. Real work is
# gated per-step on `needs.detect-changes.outputs.wheel`.
local-build-install:
needs: detect-changes
name: PR-built wheel + import smoke
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
steps:
- name: No-op pass (paths filter excluded this commit)
if: needs.detect-changes.outputs.wheel != 'true'
run: |
echo "No workspace/ / scripts/{build_runtime_package,wheel_smoke}.py / workflow changes — wheel gate satisfied without rebuilding."
echo "::notice::PR-built wheel + import smoke no-op pass (paths filter excluded this commit)."
- if: needs.detect-changes.outputs.wheel == 'true'
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- if: needs.detect-changes.outputs.wheel == 'true'
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.11'
cache: pip
cache-dependency-path: workspace/requirements.txt
- name: Install build tooling
if: needs.detect-changes.outputs.wheel == 'true'
run: pip install build
- name: Build wheel from PR source (mirrors publish-runtime.yml)
if: needs.detect-changes.outputs.wheel == 'true'
# Use a fixed test version so the wheel filename is predictable.
# Doesn't reach PyPI — this build is local-only for the smoke.
run: |
python scripts/build_runtime_package.py \
--version "0.0.0.dev0+pin-compat" \
--out /tmp/runtime-build
cd /tmp/runtime-build && python -m build
- name: Install built wheel + workspace requirements
if: needs.detect-changes.outputs.wheel == 'true'
run: |
python -m venv /tmp/venv-built
/tmp/venv-built/bin/pip install --upgrade pip
/tmp/venv-built/bin/pip install /tmp/runtime-build/dist/*.whl
/tmp/venv-built/bin/pip install -r workspace/requirements.txt
/tmp/venv-built/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
| grep -E '^(Name|Version):'
- name: Smoke import the PR-built wheel
if: needs.detect-changes.outputs.wheel == 'true'
# Same script publish-runtime.yml runs against the to-be-PyPI wheel.
run: |
/tmp/venv-built/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"

View File

@ -1,70 +0,0 @@
name: SECRET_PATTERNS drift lint
# Ported from .github/workflows/secret-pattern-drift.yml on 2026-05-11
# per RFC internal#219 §1 sweep.
#
# Differences from the GitHub version:
# - on.paths references the new canonical .gitea/workflows/secret-scan.yml
# (the .github/ copy is removed by Cat A of this sweep).
# - CANONICAL_FILE inside scripts/lint_secret_pattern_drift.py was
# updated in the same Cat C-1 PR to point at .gitea/workflows/secret-scan.yml.
# - Workflow-level env.GITHUB_SERVER_URL set.
# - `continue-on-error: true` on the job (RFC §1 contract).
#
# Detects when the canonical SECRET_PATTERNS array in
# .gitea/workflows/secret-scan.yml diverges from known consumer
# mirrors (workspace-runtime's bundled pre-commit hook today; more
# can be added as the consumer set grows).
#
# Why this exists: every side that scans for credentials has its own
# copy of the pattern list. They drift — most recently the runtime
# hook lagged the canonical by one pattern (sk-cp- / MiniMax F1088),
# so a developer's local pre-commit would let a sk-cp- token through
# while the org-wide CI scan would refuse it. The cost of that drift
# is dev confusion + delayed feedback; the fix is automated detection.
#
# Triggers:
# - schedule: daily 05:00 UTC. Catches drift introduced by edits
# to a consumer copy that didn't update canonical here.
# - push to main/staging where the canonical or this lint changed:
# catches the inverse — canonical updated but consumers not yet
# bumped. The lint will fail the push; that's intentional.
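#
# A minimal sketch of the comparison the lint performs (helper names here
# are hypothetical; the real implementation is
# .github/scripts/lint_secret_pattern_drift.py):
#   canonical = extract_patterns(read(".gitea/workflows/secret-scan.yml"))
#   for url in CONSUMER_MIRRORS:
#       mirror = extract_patterns(http_get(url))
#       if missing := set(canonical) - set(mirror):
#           fail(f"{url} lags canonical by {sorted(missing)}")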
on:
schedule:
# 05:00 UTC = 22:00 PT / 01:00 ET. Quiet hours so a failure
# email lands when humans are starting their day, not
# interrupting it.
- cron: "0 5 * * *"
push:
branches: [main, staging]
paths:
- ".gitea/workflows/secret-scan.yml"
- ".gitea/workflows/secret-pattern-drift.yml"
- ".github/scripts/lint_secret_pattern_drift.py"
- ".githooks/pre-commit"
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
# Auto-injected GITHUB_TOKEN scoped to read-only. The lint only does git
# checkout + HTTPS GETs to public consumer files; no writes to anything.
permissions:
contents: read
jobs:
lint:
name: Detect SECRET_PATTERNS drift
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
timeout-minutes: 5
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: "3.11"
- name: Run drift lint
run: python3 .github/scripts/lint_secret_pattern_drift.py

View File

@ -12,31 +12,18 @@
# required_approving_reviews: 1
# approving_review_teams: ["ceo", "managers", "engineers"]
#
# Tier → required-team expression (internal#189 AND-composition):
# tier:low → engineers,managers,ceo (OR: any one suffices)
# tier:medium → managers AND engineers AND qa???,security??? (AND: all required)
# tier:high → ceo (OR: single team, wired for AND)
#
# "???" = teams not yet created in Gitea. When qa + security teams are
# added, update TIER_EXPR["tier:medium"] in the script to remove the
# markers. PRs already in-flight when qa/security are created continue
# to work because their authors explicitly requested those reviews.
# Tier → eligible-team mapping (mirror of dev-sop §SOP-6):
# tier:low → engineers, managers, ceo
# tier:medium → managers, ceo
# tier:high → ceo
#
# Force-merge: Owners-team override remains available out-of-band via
# the Gitea merge API; force-merge writes `incident.force_merge` to
# `structure_events` per §Persistent structured logging gate (Phase 3).
#
# Environment variables:
# SOP_DEBUG=1 — per-API-call diagnostic lines. Default: off.
# SOP_LEGACY_CHECK=1 — revert to OR-gate for this run. Grace window
# for PRs in-flight when AND-composition deployed.
# Burn-in: remove after 2026-05-17 (7-day window).
#
# BURN-IN NOTE (internal#189 Phase 1): continue-on-error: true is set on
# the tier-check job below. This prevents AND-composition from blocking
# PRs during the 7-day burn-in. After 2026-05-17:
# 1. Remove `continue-on-error: true` from this job block.
# 2. Update this BURN-IN NOTE comment to mark the window closed.
# Set `SOP_DEBUG: '1'` in the env block to enable per-API-call diagnostic
# lines — useful when diagnosing token-scope or team-id-resolution
# issues. Default off.
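#
# A sketch of the AND-composition check described above (variable and
# helper names are hypothetical; the real evaluator lives in
# .gitea/scripts/sop-tier-check.sh):
#   required="managers engineers"       # TIER_EXPR["tier:medium"], AND
#   for team in $required; do
#     satisfied=0
#     for user in $APPROVERS; do
#       is_team_member "$team" "$user" && satisfied=1 && break
#     done
#     [ "$satisfied" = 1 ] || { echo "::error::needs an approval from team '$team'"; exit 1; }
#   done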
name: sop-tier-check
@ -63,9 +50,6 @@ on:
jobs:
tier-check:
runs-on: ubuntu-latest
# BURN-IN: continue-on-error prevents AND-composition from blocking
# PRs during the 7-day window. Remove after 2026-05-17 (internal#189).
continue-on-error: true
permissions:
contents: read
pull-requests: read
@ -77,50 +61,21 @@ jobs:
# works if we never check out PR HEAD. Same SHA the workflow
# itself was loaded from.
ref: ${{ github.event.pull_request.base.sha }}
- name: Install jq
# Gitea Actions runners (ubuntu-latest label) do not bundle jq.
# The sop-tier-check script uses jq for all JSON API parsing.
# Install jq before the script runs so sop-tier-check can pass.
#
# Method: apt-get first (reliable for Ubuntu runners with internet
# access to package mirrors). Falls back to GitHub binary download.
# GitHub releases may be unreachable from some runner networks
# (infra#241 follow-up: GitHub timeout after 3s on 5.78.80.188
# runners). The sop-tier-check script has its own fallback as a
# third line of defense. continue-on-error: true ensures this step
# failing does not block the job.
continue-on-error: true
run: |
# apt-get is the primary method — Ubuntu package mirrors are reliably
# reachable from runner containers. GitHub releases may be blocked
# or slow on some networks (infra#241 follow-up).
if apt-get update -qq && apt-get install -y -qq jq; then
echo "::notice::jq installed via apt-get: $(jq --version)"
elif timeout 120 curl -sSL \
"https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-amd64" \
-o /usr/local/bin/jq && chmod +x /usr/local/bin/jq; then
echo "::notice::jq binary downloaded: $(/usr/local/bin/jq --version)"
else
echo "::warning::jq install failed — apt-get and GitHub download both failed."
fi
jq --version 2>/dev/null || echo "::notice::jq not yet available — script fallback will retry"
- name: Verify tier label + reviewer team membership
# continue-on-error: true at step level — job-level is ignored by Gitea
# Actions (quirk #10, internal runbooks). Belt-and-suspenders with
# SOP_FAIL_OPEN=1 + || true below.
continue-on-error: true
env:
# SOP_TIER_CHECK_TOKEN is the org-level secret for the
# sop-tier-bot PAT (read:organization,read:user,read:issue,
# read:repository). Stored at the org level
# (/api/v1/orgs/molecule-ai/actions/secrets) so per-repo
# configuration is unnecessary — every repo in the org
# picks it up automatically.
# Falls back to GITHUB_TOKEN with a clear error if missing.
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
GITEA_HOST: git.moleculesai.app
REPO: ${{ github.repository }}
PR_NUMBER: ${{ github.event.pull_request.number }}
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
# Set to '1' for diagnostic per-API-call output. Off by default
# so production logs aren't noisy.
SOP_DEBUG: '0'
SOP_LEGACY_CHECK: '0'
# SOP_FAIL_OPEN=1 makes the script always exit 0. The UI enforces
# the actual merge gate. Combined with continue-on-error: true
# above, this step never fails the job regardless of script exit.
SOP_FAIL_OPEN: '1'
run: |
bash .gitea/scripts/sop-tier-check.sh || true
run: bash .gitea/scripts/sop-tier-check.sh

View File

@ -1,79 +0,0 @@
# sop-tier-refire — issue_comment-triggered refire of sop-tier-check.
#
# Closes internal#292. Gitea 1.22.6 doesn't refire workflows on the
# `pull_request_review` event (go-gitea/gitea#33700); the `sop-tier-check`
# workflow's review-event subscription is silently dead. The result:
# PRs that get their approving review AFTER the tier-check ran on open/
# synchronize keep their failing status check forever, and the only way
# to merge is the admin force-merge path (audited via `audit-force-merge`
# but the audit trail keeps growing; see `feedback_never_admin_merge_bypass`).
#
# Workaround pattern from `feedback_pull_request_review_no_refire`:
# `issue_comment` events DO fire reliably on 1.22.6. When a repo
# MEMBER/OWNER/COLLABORATOR comments `/refire-tier-check` on a PR, this
# workflow re-runs the sop-tier-check logic and POSTs the resulting
# status to the PR head SHA directly. No empty commit, no git history
# bloat, no cascade re-fire of every other workflow on the PR.
#
# SECURITY MODEL:
#
# 1. `pull_request` exists on the issue (issue_comment fires on issues
# AND PRs; we only want PRs).
# 2. `comment.author_association` must be MEMBER/OWNER/COLLABORATOR.
# Per the internal#292 core-security review (review#1066 ask): anyone
# can comment, but only repo collaborators+ can flip the status.
# Without this gate, a drive-by commenter on a public-issue-tracker
# surface could trigger a status flip.
# 3. Comment body must contain `/refire-tier-check` — a slash-command-
# shaped trigger (not just any comment word). Prevents accidental
# triggering from prose like "we should refire tests" in a review.
# 4. This workflow does NOT check out PR HEAD code. Like sop-tier-check,
# it only HTTP-calls the Gitea API. Trust boundary preserved.
#
# Note: `issue_comment` fires from the BASE branch's workflow file. There
# is no `pull_request_target` equivalent to set; the trigger inherently
# loads the workflow from the default branch.
#
# Rate-limit: a 1s pre-sleep + a "skip if status posted in last 30s"
# guard prevents comment-spam from thrashing the status. See the script.
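# A minimal sketch of that guard's shape (variable names are hypothetical;
# the real logic lives in .gitea/scripts/sop-tier-refire.sh):
#   sleep 1
#   last=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
#     "$API/repos/$REPO/statuses/$HEAD_SHA?limit=1" \
#     | jq -r '.[0].created_at // empty')
#   if [ -n "$last" ] && [ $(( $(date -u +%s) - $(date -d "$last" +%s) )) -lt 30 ]; then
#     echo "status posted <30s ago; skipping refire"; exit 0
#   fi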
name: sop-tier-check refire (issue_comment)
on:
issue_comment:
types: [created]
jobs:
refire:
# Three gates, all required:
# - comment is on a PR (not a plain issue)
# - commenter is MEMBER, OWNER, or COLLABORATOR
# - comment body contains the slash-command trigger
if: |
github.event.issue.pull_request != null &&
contains(fromJson('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association) &&
contains(github.event.comment.body, '/refire-tier-check')
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: read
statuses: write
steps:
- name: Check out base branch (for the script)
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
# Load the script from the default branch (main), matching the
# sop-tier-check.yml security model.
ref: ${{ github.event.repository.default_branch }}
- name: Re-evaluate sop-tier-check and POST status
env:
# Same org-level secret sop-tier-check.yml + audit-force-merge.yml use.
# Fallback to GITHUB_TOKEN with a clear error if missing.
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
GITEA_HOST: git.moleculesai.app
REPO: ${{ github.repository }}
PR_NUMBER: ${{ github.event.issue.number }}
COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
# Set to '1' for diagnostic per-API-call output. Off by default.
SOP_DEBUG: '0'
run: bash .gitea/scripts/sop-tier-refire.sh

View File

@ -1,346 +0,0 @@
name: Staging SaaS smoke (every 30 min)
# Renamed from canary-staging.yml on 2026-05-11 per Hongming directive
# ("canary naming changed to staging for all"). Originally ported from
# .github/workflows/canary-staging.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
#
# Minimum viable health check: provisions one Hermes workspace on a fresh
# staging org, sends one A2A message, verifies PONG, tears down. ~8 min
# wall clock. Pages on failure by opening a Gitea issue; auto-closes the
# issue on the next green run.
#
# The full-SaaS workflow (e2e-staging-saas.yml) covers the broader surface
# but runs only on provisioning-critical pushes + nightly — this one
# catches drift in the 30-min window between those runs (AMI health, CF
# cert rotation, WorkOS session stability, etc.).
#
# Lean mode: E2E_MODE=smoke skips the child workspace + HMA memory +
# peers/activity checks. One parent workspace + one A2A turn is enough
# to signal "SaaS stack end-to-end is alive."
on:
schedule:
# Every 30 min. Cron on GitHub-hosted runners has a known drift of
# a few minutes under load — that's fine for a smoke check.
- cron: '*/30 * * * *'
# Serialise with the full-SaaS workflow so they don't contend for the
# same org-create quota on staging. Different group key from
# e2e-staging-saas since we don't mind queueing smoke runs behind one
# full run, but two smoke runs SHOULD queue against each other.
concurrency:
group: staging-smoke
cancel-in-progress: false
permissions:
# Needed to open / close the alerting issue.
issues: write
contents: read
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
smoke:
name: Staging SaaS smoke
runs-on: ubuntu-latest
# NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed
# 2026-05-11. The "surface broken workflows without blocking"
# rationale was correctly applied to advisory/lint workflows but
# wrong for this smoke — it is the 30-min canary cadence for the
# entire staging SaaS stack, and silent failure here masks the
# exact regressions the smoke exists to surface (AMI rot, CF cert
# drift, WorkOS session breakage, secret rotations). Same class of
# failure as PR#461 (`sweep-stale-e2e-orgs`) where Phase-3 silent
# failure leaked EC2. The four other `e2e-staging-*` workflows
# KEEP `continue-on-error: true` per RFC #219 §1 — they are
# advisory and matrix-style; this one is the canary. A follow-up
# `notify-failure` step below also surfaces breakage to ops even
# if branch-protection wiring is adjusted to keep this off the
# required-checks list.
# The 25-min cap leaves ~10 min of headroom past the 15-min TLS-readiness
# deadline in tests/e2e/test_staging_full_saas.sh (#2107). Without the buffer
# the job is killed at the wall-clock 15:00 mark BEFORE the bash
# `fail` + diagnostic burst can fire, leaving every cancellation
# silent. Sibling staging E2E jobs run at 20-45 min — keeping the
# smoke tighter than them so a true wedge still surfaces here
# first.
timeout-minutes: 25
env:
MOLECULE_CP_URL: https://staging-api.moleculesai.app
# 2026-05-11: secret canonicalised from MOLECULE_STAGING_ADMIN_TOKEN
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
# internal#322 — see this PR for the cross-workflow sweep.
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
# MiniMax is the smoke's PRIMARY LLM auth path post-2026-05-04.
# Switched from hermes+OpenAI after #2578 (the staging OpenAI key
# account went over quota and stayed dead for 36+ hours, taking
# the smoke red the entire time). claude-code template's
# `minimax` provider routes ANTHROPIC_BASE_URL to
# api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot —
# ~5-10x cheaper per token than gpt-4.1-mini AND on a separate
# billing account, so OpenAI quota collapse no longer wedges the
# smoke. Mirrors the migration continuous-synth-e2e.yml made on
# 2026-05-03 (#265) for the same reason. tests/e2e/test_staging_
# full_saas.sh branches SECRETS_JSON on which key is present —
# MiniMax wins when set.
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
# Direct-Anthropic alternative for operators who don't want to
# set up a MiniMax account (priority below MiniMax — first
# non-empty wins in test_staging_full_saas.sh's secrets-injection
# block). See #2578 PR comment for the rationale.
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
# OpenAI fallback — kept wired so an operator-dispatched run with
# E2E_RUNTIME=hermes overridden via workflow_dispatch can still
# exercise the OpenAI path without re-editing the workflow.
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
E2E_MODE: smoke
E2E_RUNTIME: claude-code
# Pin the smoke to a specific MiniMax model rather than relying
# on the per-runtime default (which could resolve to "sonnet" →
# direct Anthropic and defeat the cost saving). M2.7-highspeed
# is "Token Plan only" but cheap-per-token and fast.
E2E_MODEL_SLUG: MiniMax-M2.7-highspeed
E2E_RUN_ID: "smoke-${{ github.run_id }}"
# Debug-only: when an operator dispatches with keep_on_failure=true,
# the smoke script's E2E_KEEP_ORG=1 path skips teardown so the
# tenant org + EC2 stay alive for SSM-based log capture. Cron runs
# never set this (the input only exists on workflow_dispatch) so
# unattended cron always tears down. Note: this Gitea port dropped
# the dispatch inputs, so the expression below always resolves to
# '0'; it is kept for parity with the GitHub original. See
# molecule-core#129 failure mode #1 — capturing the actual exception
# requires docker logs from the live container.
E2E_KEEP_ORG: ${{ github.event.inputs.keep_on_failure == 'true' && '1' || '0' }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify admin token present
run: |
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
exit 2
fi
- name: Verify LLM key present
run: |
# Per-runtime key check — claude-code uses MiniMax; hermes /
# langgraph (operator-dispatched only) use OpenAI. Hard-fail
# rather than soft-skip per the lesson from synth E2E #2578:
# an empty key silently falls through to the wrong
# SECRETS_JSON branch and the smoke fails 5 min later with
# a confusing auth error instead of the clean "secret
# missing" message at the top.
case "${E2E_RUNTIME}" in
claude-code)
# Either MiniMax OR direct-Anthropic works — first
# non-empty wins in the test script's secrets-injection
# priority chain. Operators only need to set ONE of these
# secrets; we don't force a choice between them.
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
required_secret_value="${E2E_MINIMAX_API_KEY}"
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
required_secret_name="MOLECULE_STAGING_ANTHROPIC_API_KEY"
required_secret_value="${E2E_ANTHROPIC_API_KEY}"
else
required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY or MOLECULE_STAGING_ANTHROPIC_API_KEY"
required_secret_value=""
fi
;;
langgraph|hermes)
required_secret_name="MOLECULE_STAGING_OPENAI_API_KEY"
required_secret_value="${E2E_OPENAI_API_KEY:-}"
;;
*)
echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
required_secret_name=""
required_secret_value="present"
;;
esac
if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — A2A will fail at request time with 'No LLM provider configured'"
exit 2
fi
echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"
- name: Smoke run
id: smoke
run: bash tests/e2e/test_staging_full_saas.sh
# Alerting: open a sticky issue on the FIRST failure; comment on
# subsequent failures; auto-close on next green. Comment-on-existing
# de-duplicates so a single open issue accumulates the streak —
# ops sees one issue with N comments rather than N issues.
#
# Why no consecutive-failures threshold (e.g., wait 3 runs before
# filing): the prior threshold check used
# `github.rest.actions.listWorkflowRuns()` which Gitea 1.22.6 does
# not expose (returns 404). On Gitea Actions the threshold call
# ALWAYS failed, breaking the entire alerting step and going days
# silent on real regressions (38h+ chronic red on 2026-05-07/08
# before this fix; tracked in molecule-core#129). Filing on first
# failure is also better UX — we want to know about the first red,
# not wait 90 min for it to "count." Real flakes get one issue +
# a quick close-on-green; persistent reds accumulate comments.
- name: Open issue on failure (Gitea API)
if: failure()
env:
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
RUN_ID: ${{ github.run_id }}
run: |
set -euo pipefail
API="${SERVER_URL%/}/api/v1"
# Title kept stable across the canary-staging.yml → staging-smoke.yml
# rename (2026-05-11) so any open alert issue from the old name
# still title-matches and auto-closes on the next green run.
TITLE="Canary failing: staging SaaS smoke"
RUN_URL="${SERVER_URL}/${REPO}/actions/runs/${RUN_ID}"
EXISTING=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
"${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
| jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number' | head -1)
if [ -n "$EXISTING" ]; then
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
"${API}/repos/${REPO}/issues/${EXISTING}/comments" \
-d "$(jq -nc --arg run "$RUN_URL" '{body: ("Smoke still failing. " + $run)}')" >/dev/null
echo "Commented on existing issue #${EXISTING}"
else
NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
BODY=$(jq -nc --arg t "$TITLE" --arg now "$NOW" --arg run "$RUN_URL" \
'{title: $t, body: ("Smoke run failed at " + $now + ".\n\nRun: " + $run + "\n\nThis issue auto-closes on the next green smoke run. Consecutive failures add a comment here rather than a new issue.")}')
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
"${API}/repos/${REPO}/issues" -d "$BODY" >/dev/null
echo "Opened smoke failure issue (first red)"
fi
- name: Auto-close smoke issue on success (Gitea API)
if: success()
env:
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
RUN_ID: ${{ github.run_id }}
run: |
set -euo pipefail
API="${SERVER_URL%/}/api/v1"
# Title kept stable across the canary-staging.yml → staging-smoke.yml
# rename so open alert issues from the old name still match.
TITLE="Canary failing: staging SaaS smoke"
NUMS=$(curl -fsS -H "Authorization: token $GITEA_TOKEN" \
"${API}/repos/${REPO}/issues?state=open&type=issues&limit=50" \
| jq -r --arg t "$TITLE" '.[] | select(.title==$t) | .number')
NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
for N in $NUMS; do
curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
"${API}/repos/${REPO}/issues/${N}/comments" \
-d "$(jq -nc --arg now "$NOW" '{body: ("Smoke recovered at " + $now + ". Closing.")}')" >/dev/null
curl -fsS -X PATCH -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
"${API}/repos/${REPO}/issues/${N}" -d '{"state":"closed"}' >/dev/null
echo "Closed recovered smoke issue #${N}"
done
- name: Teardown safety net
if: always()
env:
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
run: |
set +e
# Slug prefix matches what test_staging_full_saas.sh emits
# in smoke mode:
# SLUG="e2e-smoke-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
# Earlier (pre-2026-05-11 canary→staging rename) the prefix was
# `e2e-canary-`; both prefixes are matched here for one
# release cycle so cleanup still catches any in-flight org
# provisioned under the old prefix on an older runner that
# hasn't picked up the renamed script. Remove the canary
# fallback after one week of no-old-prefix observations.
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
| python3 -c "
import json, sys, os, datetime
run_id = os.environ.get('GITHUB_RUN_ID', '')
d = json.load(sys.stdin)
# Scope to slugs from THIS smoke run when GITHUB_RUN_ID is
# available; the smoke workflow sets E2E_RUN_ID='smoke-\${run_id}'
# so the slug suffix is '-smoke-\${run_id}-...'. Mirrors the
# full-mode safety net's per-run scoping (e2e-staging-saas.yml)
# added after the 2026-04-21 cross-run cleanup incident.
# Sweep both today AND yesterday's UTC dates so a run that
# crosses midnight still cleans up its own slug — see the
# 2026-04-26→27 canvas-safety-net incident.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
if run_id:
prefixes = tuple(f'e2e-smoke-{d}-smoke-{run_id}' for d in dates) \
+ tuple(f'e2e-canary-{d}-canary-{run_id}' for d in dates)
else:
prefixes = tuple(f'e2e-smoke-{d}-' for d in dates) \
+ tuple(f'e2e-canary-{d}-' for d in dates)
candidates = [o['slug'] for o in d.get('orgs', [])
if any(o.get('slug','').startswith(p) for p in prefixes)
and o.get('status') not in ('purged',)]
print('\n'.join(candidates))
" 2>/dev/null)
# Per-slug DELETE with HTTP-code verification. The previous
# `... >/dev/null || true` swallowed every failure, so a 5xx
# or timeout from CP looked identical to "successfully cleaned
# up" and the tenant kept eating ~2 vCPU until the hourly
# stale sweep caught it (up to 2h later). Now we capture the
# response code and surface non-2xx as a workflow warning, so
# the run page shows which slug leaked. We still don't `exit 1`
# on cleanup failure — a single-smoke cleanup miss shouldn't
# fail-flag the smoke itself when the actual smoke check
# passed. The sweep-stale-e2e-orgs cron (now every 15 min,
# 30-min threshold) is the safety net for whatever slips past.
# See molecule-controlplane#420.
leaks=()
for slug in $orgs; do
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
# pollution of the captured status (lint-curl-status-capture.yml).
set +e
curl -sS -o /tmp/smoke-cleanup.out -w "%{http_code}" \
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
-H "Content-Type: application/json" \
-d "{\"confirm\":\"$slug\"}" >/tmp/smoke-cleanup.code
set -e
code=$(cat /tmp/smoke-cleanup.code 2>/dev/null || echo "000")
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
echo "[teardown] deleted $slug (HTTP $code)"
else
echo "::warning::smoke teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/smoke-cleanup.out 2>/dev/null)"
leaks+=("$slug")
fi
done
if [ ${#leaks[@]} -gt 0 ]; then
echo "::warning::smoke teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
fi
exit 0
- name: Notify on smoke failure
# Fail-loud companion to dropping `continue-on-error: true`.
# The Open-issue-on-failure step above handles the human-facing
# alert; this step emits a clearly-tagged ::error:: line that
# log-tail consumers (Loki SOPRefireRule, orchestrator triage
# loop) can grep on. Mirrors PR#461's sweep-stale-e2e-orgs
# pattern. Runs AFTER the teardown safety net (which is
# if: always()) so failures don't suppress cleanup.
if: failure()
run: |
echo "::error::staging-smoke FAILED — staging SaaS canary is red. See prior step logs + the auto-filed alert issue. Common causes: (a) CP_STAGING_ADMIN_API_TOKEN secret missing/rotated, (b) staging-api.moleculesai.app 5xx, (c) MiniMax/Anthropic LLM key dead, (d) AMI/CF/WorkOS drift. The 30-min cron will retry, but a chronic red here indicates the staging SaaS stack is broken end-to-end."
exit 1

View File

@ -1,288 +0,0 @@
name: Staging verify
# Renamed from canary-verify.yml on 2026-05-11 per Hongming directive
# ("canary naming changed to staging for all"). Originally ported from
# .github/workflows/canary-verify.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
# - **Gitea workflow_run trigger limitation**: Gitea 1.22.6's support
# for the `workflow_run` event is partial. If this never fires on a
# real publish-workspace-server-image completion, the follow-up
# triage PR should replace the trigger with a push-with-paths-filter
# on the same publish workflow's path (i.e. `.gitea/workflows/publish-workspace-server-image.yml`).
#
# Runs the canary smoke suite against the staging canary tenant fleet
# after a new :staging-<sha> image lands in ECR. On green, calls the
# CP redeploy-fleet endpoint to promote :staging-<sha> → :latest so
# the prod tenant fleet's 5-minute auto-updater picks up the verified
# digest. On red, :latest stays on the prior known-good digest and
# prod is untouched.
#
# Terminology note (2026-05-11): The deployment STRATEGY here is still
# called "canary release" (a small subset of tenants gets the new image
# first, the rest follow on green). The "canary" word stays for the
# pre-fan-out cohort concept (see docs/architecture/canary-release.md
# and CANARY_SLUG in redeploy-tenants-on-*.yml). What changed is the
# FILE NAME and the SECRETS feeding this workflow — both are renamed
# to drop the redundant "canary-" prefix that conflated workflow
# identity with deployment strategy.
#
# Registry note (2026-05-10): This workflow previously used GHCR
# (ghcr.io/molecule-ai/platform-tenant) — that registry was retired
# during the 2026-05-06 Gitea suspension migration when publish-
# workspace-server-image.yml switched to the operator's ECR org
# (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/
# platform-tenant). The GHCR → ECR migration was never applied to
# this file, so this workflow was silently smoke-testing the stale
# GHCR image while the actual staging/prod tenants ran the ECR image.
# Result: smoke tests could not catch a broken ECR build. Fix:
# - Wait step: reads SHA from running canary /health (tenant-
# agnostic, works regardless of registry).
# - Promote step: calls CP redeploy-fleet endpoint with target_tag=
# staging-<sha>, same mechanism as redeploy-tenants-on-main.yml.
# No longer attempts GHCR crane ops.
#
# Dependencies:
# - publish-workspace-server-image.yml publishes :staging-<sha>
# to ECR on staging and main merges.
# - Canary tenants are configured to pull :staging-<sha> from ECR
# (TENANT_IMAGE env set to the ECR :staging-<sha> tag).
# - Repo secrets MOLECULE_STAGING_TENANT_URLS /
# MOLECULE_STAGING_ADMIN_TOKENS / MOLECULE_STAGING_CP_SHARED_SECRET
# are populated.
on:
workflow_run:
workflows: ["publish-workspace-server-image"]
types: [completed]
permissions:
contents: read
packages: write  # vestigial from the GHCR crane-ops era (see registry note above)
actions: read
env:
# ECR registry (post-2026-05-06 SSOT for tenant images).
# publish-workspace-server-image.yml pushes here.
IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
# CP endpoint for redeploy-fleet (used in promote step below).
CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
staging-smoke:
# Skip when the upstream workflow failed — no image to test against.
# workflow_dispatch trigger dropped in this Gitea port; only the
# workflow_run path remains.
if: ${{ github.event.workflow_run.conclusion == 'success' }}
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
outputs:
sha: ${{ steps.compute.outputs.sha }}
smoke_ran: ${{ steps.smoke.outputs.ran }}
steps:
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Compute sha
id: compute
run: echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
- name: Wait for canary tenants to pick up :staging-<sha>
# Poll canary health endpoints every 30s for up to 7 min instead
# of a fixed 6-min sleep. Exits as soon as ALL canaries report
# the new SHA (~2-3 min typical vs 6 min fixed). Falls back to
# proceeding after 7 min even if not all canaries responded —
# the smoke suite will catch any that didn't update.
#
# NOTE: The SHA is read from the running tenant's /health response,
# NOT from a registry lookup. This is registry-agnostic and works
# regardless of whether the tenant pulls from ECR, GHCR, or any
# other registry — the canary is telling us what it's actually
# running, which is the ground truth for smoke testing.
env:
MOLECULE_STAGING_TENANT_URLS: ${{ secrets.MOLECULE_STAGING_TENANT_URLS }}
EXPECTED_SHA: ${{ steps.compute.outputs.sha }}
run: |
if [ -z "$MOLECULE_STAGING_TENANT_URLS" ]; then
echo "No canary URLs configured — falling back to 60s wait"
sleep 60
exit 0
fi
IFS=',' read -ra URLS <<< "$MOLECULE_STAGING_TENANT_URLS"
MAX_WAIT=420 # 7 minutes
INTERVAL=30
ELAPSED=0
while [ $ELAPSED -lt $MAX_WAIT ]; do
ALL_READY=true
for url in "${URLS[@]}"; do
HEALTH=$(curl -s --max-time 5 "${url}/health" 2>/dev/null || echo "{}")
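# grep-based extraction keeps this poll loop dependency-free; assumes
# /health returns compact JSON with a top-level "sha" field.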
SHA=$(echo "$HEALTH" | grep -o "\"sha\":\"[^\"]*\"" | head -1 | cut -d'"' -f4)
if [ "$SHA" != "$EXPECTED_SHA" ]; then
ALL_READY=false
break
fi
done
if $ALL_READY; then
echo "All canaries running staging-${EXPECTED_SHA} after ${ELAPSED}s"
exit 0
fi
echo "Waiting for canaries... (${ELAPSED}s / ${MAX_WAIT}s)"
sleep $INTERVAL
ELAPSED=$((ELAPSED + INTERVAL))
done
echo "Timeout after ${MAX_WAIT}s — proceeding anyway (smoke suite will validate)"
- name: Run staging smoke suite
id: smoke
# Graceful-skip when no canary fleet is configured (Phase 2 not yet
# stood up — see molecule-controlplane/docs/canary-tenants.md).
# Sets `ran=false` on skip so promote-to-latest stays off (we don't
# want every main merge auto-promoting without gating). Manual
# promote-latest.yml is the release gate while canary is absent.
# Once the fleet is real: delete the early-exit branch.
env:
MOLECULE_STAGING_TENANT_URLS: ${{ secrets.MOLECULE_STAGING_TENANT_URLS }}
MOLECULE_STAGING_ADMIN_TOKENS: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKENS }}
MOLECULE_STAGING_CP_BASE_URL: https://staging-api.moleculesai.app
MOLECULE_STAGING_CP_SHARED_SECRET: ${{ secrets.MOLECULE_STAGING_CP_SHARED_SECRET }}
run: |
set -euo pipefail
if [ -z "${MOLECULE_STAGING_TENANT_URLS:-}" ] \
|| [ -z "${MOLECULE_STAGING_ADMIN_TOKENS:-}" ] \
|| [ -z "${MOLECULE_STAGING_CP_SHARED_SECRET:-}" ]; then
{
echo "## ⚠️ staging-verify skipped"
echo
echo "One or more canary secrets are unset (\`MOLECULE_STAGING_TENANT_URLS\`, \`MOLECULE_STAGING_ADMIN_TOKENS\`, \`MOLECULE_STAGING_CP_SHARED_SECRET\`)."
echo "Phase 2 canary fleet has not been stood up yet —"
echo "see [canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)."
echo
echo "**Skipped — promote-to-latest will NOT auto-fire.** Dispatch \`promote-latest.yml\` manually when ready."
} >> "$GITHUB_STEP_SUMMARY"
echo "ran=false" >> "$GITHUB_OUTPUT"
echo "::notice::staging-verify: skipped — no canary fleet configured"
exit 0
fi
bash scripts/staging-smoke.sh
echo "ran=true" >> "$GITHUB_OUTPUT"
- name: Summary on failure
if: ${{ failure() }}
run: |
{
echo "## Canary smoke FAILED"
echo
echo "Canary tenants rejected image \`staging-${{ steps.compute.outputs.sha }}\`."
echo ":latest stays pinned to the prior good digest — prod is untouched."
echo
echo "Fix forward and merge again, or investigate the specific failed"
echo "assertions in the staging-smoke step log above."
} >> "$GITHUB_STEP_SUMMARY"
promote-to-latest:
# On green, calls the CP redeploy-fleet endpoint with target_tag=
# staging-<sha> to promote the verified ECR image. This is the same
# mechanism as redeploy-tenants-on-main.yml — no GHCR crane ops.
#
# Pre-fix history: the old GHCR promote step used `crane tag` against
# ghcr.io/molecule-ai/platform-tenant, but publish-workspace-server-
# image.yml had already migrated to ECR on 2026-05-07 (commit
# 10e510f5). The GHCR tags were never updated, so this step was
# silently promoting a stale GHCR image while actual prod tenants
# pulled from ECR. Canary smoke tests were GHCR-targeted and could
# not catch a broken ECR build.
needs: staging-smoke
if: ${{ needs.staging-smoke.result == 'success' && needs.staging-smoke.outputs.smoke_ran == 'true' }}
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
env:
SHA: ${{ needs.staging-smoke.outputs.sha }}
CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
# CP_ADMIN_API_TOKEN gates write access to the redeploy endpoint.
# Stored at the repo level so all workflows pick it up automatically.
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
# canary_slug pin: deploy the verified :staging-<sha> to the canary
# first (soak 120s), then fan out to the rest of the fleet.
CANARY_SLUG: ${{ vars.CANARY_PROMOTE_SLUG || '' }}
SOAK_SECONDS: ${{ vars.CANARY_PROMOTE_SOAK || '120' }}
BATCH_SIZE: ${{ vars.CANARY_PROMOTE_BATCH || '3' }}
steps:
- name: Check CP credentials
run: |
if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
echo "::error::CP_ADMIN_API_TOKEN secret is not set — promote step cannot call redeploy-fleet."
echo "::error::Set it at: repo Settings → Actions → Variables and Secrets → New Secret."
exit 1
fi
- name: Promote verified ECR image to :latest
run: |
set -euo pipefail
TARGET_TAG="staging-${SHA}"
BODY=$(jq -nc \
--arg tag "$TARGET_TAG" \
--argjson soak "${SOAK_SECONDS:-120}" \
--argjson batch "${BATCH_SIZE:-3}" \
--argjson dry false \
'{
target_tag: $tag,
soak_seconds: $soak,
batch_size: $batch,
dry_run: $dry
}')
if [ -n "${CANARY_SLUG:-}" ]; then
BODY=$(jq '. * {canary_slug: $slug}' --arg slug "$CANARY_SLUG" <<<"$BODY")
fi
echo "Calling: POST $CP_URL/cp/admin/tenants/redeploy-fleet"
echo " target_tag: $TARGET_TAG"
echo " body: $BODY"
HTTP_RESPONSE=$(mktemp)
HTTP_CODE_FILE=$(mktemp)
set +e
curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
-m 1200 \
-H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
-H "Content-Type: application/json" \
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
-d "$BODY" >"$HTTP_CODE_FILE"
CURL_EXIT=$?
set -e
HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
[ -z "$HTTP_CODE" ] && HTTP_CODE="000"
echo "HTTP $HTTP_CODE (curl exit $CURL_EXIT)"
cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
if [ "$HTTP_CODE" -ge 400 ]; then
echo "::error::CP redeploy-fleet returned HTTP $HTTP_CODE — refusing to proceed."
exit 1
fi
- name: Summary
run: |
{
echo "## Staging verified — :latest promoted via CP redeploy-fleet"
echo ""
echo "- **Target tag:** \`staging-${{ needs.staging-smoke.outputs.sha }}\`"
echo "- **Registry:** ECR (\`${TENANT_IMAGE_NAME}\`)"
echo "- **Canary slug:** \`${CANARY_SLUG:-<none>}\` (soak ${SOAK_SECONDS}s)"
echo "- **Batch size:** ${BATCH_SIZE:-3}"
echo ""
echo "CP redeploy-fleet is rolling out the verified image across the prod fleet."
echo "The fleet's 5-minute health-check loop will pick up the update automatically."
} >> "$GITHUB_STEP_SUMMARY"

View File

@ -1,129 +0,0 @@
name: Sweep stale AWS Secrets Manager secrets
# Ported from .github/workflows/sweep-aws-secrets.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
#
# Janitor for per-tenant AWS Secrets Manager secrets
# (`molecule/tenant/<org_id>/bootstrap`) whose backing tenant no
# longer exists. Parallel-shape to sweep-cf-tunnels.yml and
# sweep-cf-orphans.yml — different cloud, same justification.
#
# Why this exists separately from a long-term reconciler integration:
# - molecule-controlplane's tenant_resources audit table (mig 024)
# currently tracks four resource kinds: CloudflareTunnel,
# CloudflareDNS, EC2Instance, SecurityGroup. SecretsManager is
# not in the list, so the existing reconciler doesn't catch
# orphan secrets.
# - At ~$0.40/secret/month the cost grew to ~$19/month before this
# sweeper was written, indicating ~45+ orphan secrets from
# crashed provisions and incomplete deprovision flows.
# - The proper fix (KindSecretsManagerSecret + recorder hook +
# reconciler enumerator) is filed as a separate controlplane
# issue. This sweeper is the immediate cost-relief stopgap.
#
# AWS credentials: the confirmed Gitea secrets are AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY (the molecule-cp IAM user). These are the same
# credentials used by the rest of the platform. The dedicated
# AWS_JANITOR_* naming (which the original GitHub workflow used) was
# never populated in Gitea — the existing secrets are AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY (per issue #425 §425 audit). These DO have
# secretsmanager:ListSecrets (the production molecule-cp principal);
# if ListSecrets is revoked in future, a dedicated janitor principal
# would need to be created and the Gitea secret names updated here.
#
# Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring
# sweep-cf-orphans.yml — tenant secrets are durable by design, unlike
# the mostly-orphan tunnels) refuses to nuke past the threshold.
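#
# The orphan decision, sketched (names illustrative; the real logic is
# scripts/ops/sweep-aws-secrets.sh):
#   live=$(curl -fsS "$CP_URL/cp/admin/orgs" \
#     -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" | jq -r '.orgs[].id')
#   aws secretsmanager list-secrets --query "SecretList[].Name" --output text \
#     | tr '\t' '\n' | grep '^molecule/tenant/' \
#     | while read -r name; do
#         org_id=${name#molecule/tenant/}; org_id=${org_id%/bootstrap}
#         grep -qx "$org_id" <<<"$live" || echo "orphan: $name"
#       done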
on:
schedule:
# Hourly at :30 — offsets from sweep-cf-orphans (:15) and
# sweep-cf-tunnels (:45) so the three janitors don't burst the
# CP admin endpoints at the same minute.
- cron: '30 * * * *'
# Don't let two sweeps race the same AWS account.
concurrency:
group: sweep-aws-secrets
cancel-in-progress: false
permissions:
contents: read
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
sweep:
name: Sweep AWS Secrets Manager
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
# 30 min cap, mirroring the other janitors. AWS DeleteSecret is
# fast (~0.3s/call) so even a 100+ backlog drains in seconds
# under the 8-way xargs parallelism, but the cap is set generously
# to leave headroom for any actual API hang.
timeout-minutes: 30
env:
AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
GRACE_HOURS: ${{ github.event.inputs.grace_hours || '24' }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify required secrets present
id: verify
# Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
# and sweep-cf-tunnels (hardened 2026-04-28). Same principle:
# - schedule → exit 1 on missing secrets (red CI surfaces it)
# - workflow_dispatch → exit 0 with warning (operator-driven,
# they already accepted the repo state)
run: |
missing=()
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY CP_ADMIN_API_TOKEN CP_STAGING_ADMIN_API_TOKEN; do
if [ -z "${!var:-}" ]; then
missing+=("$var")
fi
done
if [ ${#missing[@]} -gt 0 ]; then
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
echo "skip=true" >> "$GITHUB_OUTPUT"
exit 0
fi
echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
exit 1
fi
echo "All required secrets present ✓"
echo "skip=false" >> "$GITHUB_OUTPUT"
- name: Run sweep
if: steps.verify.outputs.skip != 'true'
# Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-tunnels:
# - Scheduled: input empty → "false" → --execute (the whole
#   point of an hourly janitor).
# - Manual workflow_dispatch: input default true → dry-run;
#   operator must flip it to actually delete.
# Note: this port dropped the dispatch inputs, so the reference
# below is always empty and every run takes the --execute branch;
# the MAX_DELETE_PCT gate is the remaining guard.
run: |
set -euo pipefail
if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
echo "Running in dry-run mode — no deletions"
bash scripts/ops/sweep-aws-secrets.sh
else
echo "Running with --execute — will delete identified orphans"
bash scripts/ops/sweep-aws-secrets.sh --execute
fi

View File

@ -1,156 +0,0 @@
name: Sweep stale Cloudflare DNS records
# Ported from .github/workflows/sweep-cf-orphans.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
#
# Janitor for Cloudflare DNS records whose backing tenant/workspace no
# longer exists. Without this loop, every short-lived E2E or canary
# leaves a CF record on the moleculesai.app zone — the zone has a
# 200-record quota (controlplane#239 hit it 2026-04-23+) and provisions
# start failing with code 81045 once exhausted.
#
# Why a separate workflow vs sweep-stale-e2e-orgs.yml:
# - That workflow operates at the CP layer (DELETE /cp/admin/tenants/:slug
# drives the cascade). It assumes CP has the org row to drive the
# deprovision from. It doesn't catch records left behind when CP
# itself never knew about the tenant (canary scratch, manual ops
# experiments) or when the cascade's CF-delete branch failed.
# - sweep-cf-orphans.sh enumerates the CF zone directly and matches
# each record against live CP slugs + AWS EC2 names. It catches
# leaks the CP-driven sweep can't.
#
# Safety: the script's own MAX_DELETE_PCT gate refuses to nuke more
# than 50% of records in a single run. If something has gone weird
# (CP admin endpoint returns no orgs → every tenant looks orphan) the
# gate halts before damage. Decision-function unit tests in
# scripts/ops/test_sweep_cf_decide.py (#2027) cover the rule
# classifier.
#
# Secrets: CF_API_TOKEN, CF_ZONE_ID, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
# are confirmed existing per issue #425 §425 audit. CP_ADMIN_API_TOKEN and
# CP_STAGING_ADMIN_API_TOKEN are unconfirmed — if missing, the verify step
# (schedule → hard-fail, dispatch → soft-skip) surfaces it clearly.
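#
# The MAX_DELETE_PCT gate's shape, sketched (file and variable names are
# illustrative; the real check and its unit tests live in
# scripts/ops/sweep-cf-orphans.sh + scripts/ops/test_sweep_cf_decide.py):
#   total=$(jq 'length' zone_records.json)
#   doomed=$(jq 'length' orphan_candidates.json)
#   if [ $(( doomed * 100 )) -gt $(( total * MAX_DELETE_PCT )) ]; then
#     echo "::error::refusing to delete $doomed of $total records (>${MAX_DELETE_PCT}%)"
#     exit 1
#   fi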
on:
schedule:
# Hourly. Mirrors sweep-stale-e2e-orgs cadence so the two janitors
# converge on the same tick. CF API rate budget is generous (1200
# req/5min); a single sweep makes ~1 list + N deletes (N<=quota/2).
- cron: '15 * * * *' # offset from sweep-stale-e2e-orgs (top of hour)
# No `merge_group:` trigger on purpose. This is a janitor — it doesn't
# need to gate merges, and including it as written before #2088 fired
# the full sweep job (or its secret-check) on every PR going through
# the merge queue, generating one red CI run per merge-queue eval. If
# this workflow is ever wired up as a required check, re-add
# merge_group: { types: [checks_requested] }
# AND gate the sweep step with `if: github.event_name != 'merge_group'`
# so merge-queue evals report success without actually running.
# Don't let two sweeps race the same zone. workflow_dispatch during a
# scheduled run would otherwise issue duplicate DELETE calls.
concurrency:
group: sweep-cf-orphans
cancel-in-progress: false
permissions:
contents: read
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
sweep:
name: Sweep CF orphans
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
# 3 min surfaces hangs (CF API stall, AWS describe-instances stuck)
# within one cron interval instead of burning a full tick. Realistic
# worst case is ~2 min: 4 sequential curls + 1 aws + N×CF-DELETE
# each individually capped at 10s by the script's curl -m flag.
timeout-minutes: 3
env:
CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }}
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-2
MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify required secrets present
id: verify
# Schedule-vs-dispatch behaviour split (hardened 2026-04-28
# after the silent-no-op incident below):
#
# The earlier soft-skip-on-schedule policy hid a real leak. All
# six secrets were unset on this repo for an unknown duration;
# every hourly run printed a yellow ::warning:: and exited 0,
# so the workflow registered as "passing" while doing nothing.
# CF orphans accumulated to 152/200 (~76% of the zone quota
# gone) before a manual `dig`-driven audit caught it. Anything
# that runs as a janitor and reports green while idle is
# indistinguishable from "the janitor is healthy" — so we now
# treat schedule (and any future workflow_run/push triggers)
# as a hard-fail when secrets are missing.
#
# - schedule / workflow_run / push → exit 1 (red CI run
# surfaces the misconfiguration the next tick)
# - workflow_dispatch → exit 0 with a warning
# (an operator ran this ad-hoc; they already accepted the
# state of the repo and want the workflow to short-circuit
# so they can rerun after fixing the secret)
run: |
missing=()
for var in CF_API_TOKEN CF_ZONE_ID CP_ADMIN_API_TOKEN CP_STAGING_ADMIN_API_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
if [ -z "${!var:-}" ]; then
missing+=("$var")
fi
done
if [ ${#missing[@]} -gt 0 ]; then
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
echo "skip=true" >> "$GITHUB_OUTPUT"
exit 0
fi
echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
echo "::error::a silent skip masked an active CF DNS leak (152/200 zone records) caught only by a manual audit on 2026-04-28; this gate exists to make the gap visible."
exit 1
fi
echo "All required secrets present ✓"
echo "skip=false" >> "$GITHUB_OUTPUT"
- name: Run sweep
if: steps.verify.outputs.skip != 'true'
# Schedule-vs-dispatch dry-run asymmetry (intentional):
# - Scheduled runs: github.event.inputs.dry_run is empty →
# defaults to "false" below → script runs with --execute
# (the whole point of an hourly janitor).
# - Manual workflow_dispatch: input default is true (line 38)
# so an ad-hoc operator-triggered run is dry-run by default;
# they have to flip the toggle to actually delete.
# The script's MAX_DELETE_PCT gate (default 50%) is the second
# line of defense regardless of mode.
run: |
set -euo pipefail
if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
echo "Running in dry-run mode — no deletions"
bash scripts/ops/sweep-cf-orphans.sh
else
echo "Running with --execute — will delete identified orphans"
bash scripts/ops/sweep-cf-orphans.sh --execute
fi

View File

@ -1,133 +0,0 @@
name: Sweep stale Cloudflare Tunnels
# Ported from .github/workflows/sweep-cf-tunnels.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
#
# Janitor for Cloudflare Tunnels whose backing tenant no longer
# exists. Parallel-shape to sweep-cf-orphans.yml (which sweeps DNS
# records); same justification, different CF resource.
#
# Why this exists separately from sweep-cf-orphans:
# - DNS records live on the zone (`/zones/<id>/dns_records`).
# - Tunnels live on the account (`/accounts/<id>/cfd_tunnel`).
# - Different CF API surface, different scopes; the existing CF
# token might not have `account:cloudflare_tunnel:edit`. Splitting
# the workflows keeps each one's secret-presence gate independent
# so neither silent-skips when the other's secret is missing.
# - Cleaner blast radius — operators can disable one without the
# other if a regression surfaces.
#
# Safety: the script's MAX_DELETE_PCT gate (default 90% — higher than
# the DNS sweep's 50% because tenant-shaped tunnels are mostly
# orphans by design) refuses to nuke past the threshold.
#
# Secrets: CF_API_TOKEN, CF_ACCOUNT_ID are confirmed existing per
# issue #425 §425 audit. CP_ADMIN_API_TOKEN and CP_STAGING_ADMIN_API_TOKEN
# are unconfirmed — if missing, the verify step (schedule → hard-fail,
# dispatch → soft-skip) surfaces it clearly.
on:
schedule:
# Hourly at :45 — offset from sweep-cf-orphans (:15) so the two
# janitors don't issue parallel CF API bursts at the same minute.
- cron: '45 * * * *'
# Don't let two sweeps race the same account.
concurrency:
group: sweep-cf-tunnels
cancel-in-progress: false
permissions:
contents: read
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
sweep:
name: Sweep CF tunnels
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
# 30 min cap. Was 5 min on the theory that the only thing that
# could take >5min is a CF-API hang — but on 2026-05-02 a backlog
# of 672 stale tunnels accumulated (large staging E2E run + delayed
# sweep) and the serial `curl -X DELETE` loop (~0.7s/tunnel) needed
# ~7-8min to drain. The 5-min cap killed the run mid-sweep
# (cancelled at 424/672, see run 25248788312); a manual rerun
# finished the remainder fine.
#
# The fix is two-part: parallelize the delete loop (8-way xargs in
# the script — see scripts/ops/sweep-cf-tunnels.sh), AND raise the
# cap so a one-off backlog doesn't trip a hangs-detector that
# turned out to be a real-job-too-slow detector. With 8-way
# parallelism, 600+ tunnels drains in ~60s; 30 min is generous
# headroom for actual hangs to still surface (and is in line with
# the sweep-cf-orphans companion job).
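#
# The parallel delete is roughly this shape (a sketch, assuming a
# tunnel_ids.txt of one tunnel UUID per line; the real loop lives in
# scripts/ops/sweep-cf-tunnels.sh):
#
#   xargs -P 8 -I '{}' \
#     curl -sS --max-time 10 -X DELETE \
#       -H "Authorization: Bearer ${CF_API_TOKEN}" \
#       "https://api.cloudflare.com/client/v4/accounts/${CF_ACCOUNT_ID}/cfd_tunnel/{}" \
#     < tunnel_ids.txt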
timeout-minutes: 30
env:
CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '90' }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify required secrets present
id: verify
# Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
# (hardened 2026-04-28 after the silent-no-op incident: the
# janitor reported green while doing nothing because secrets
# were unset, masking a 152/200 zone-record leak). Same
# principle applies here:
# - schedule → exit 1 on missing secrets (red CI surfaces it)
# - workflow_dispatch → exit 0 with warning (operator-driven,
# they already accepted the repo state)
run: |
missing=()
for var in CF_API_TOKEN CF_ACCOUNT_ID CP_ADMIN_API_TOKEN CP_STAGING_ADMIN_API_TOKEN; do
if [ -z "${!var:-}" ]; then
missing+=("$var")
fi
done
if [ ${#missing[@]} -gt 0 ]; then
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
echo "::warning::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope (separate from the zone:dns:edit scope used by sweep-cf-orphans)."
echo "skip=true" >> "$GITHUB_OUTPUT"
exit 0
fi
echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
echo "::error::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope."
exit 1
fi
echo "All required secrets present ✓"
echo "skip=false" >> "$GITHUB_OUTPUT"
- name: Run sweep
if: steps.verify.outputs.skip != 'true'
# Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-orphans:
# - Scheduled: input empty → "false" → --execute (the whole
# point of an hourly janitor).
# - Manual workflow_dispatch: input default true → dry-run;
# operator must flip it to actually delete.
run: |
set -euo pipefail
if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
echo "Running in dry-run mode — no deletions"
bash scripts/ops/sweep-cf-tunnels.sh
else
echo "Running with --execute — will delete identified orphans"
bash scripts/ops/sweep-cf-tunnels.sh --execute
fi

View File

@ -1,267 +0,0 @@
name: Sweep stale e2e-* orgs (staging)
# Ported from .github/workflows/sweep-stale-e2e-orgs.yml on 2026-05-11 per RFC
# internal#219 §1 sweep. Differences from the GitHub version:
# - Dropped `workflow_dispatch.inputs` (Gitea 1.22.6 parser rejects them
# per feedback_gitea_workflow_dispatch_inputs_unsupported).
# - Dropped `merge_group:` (no Gitea merge queue).
# - Dropped `environment:` blocks (Gitea has no environments).
# - Workflow-level env.GITHUB_SERVER_URL pinned per
# feedback_act_runner_github_server_url.
# - `continue-on-error: true` on each job (RFC §1 contract).
#
# Janitor for staging tenants left behind when E2E cleanup didn't run:
# CI cancellations, runner crashes, transient AWS errors mid-cascade,
# bash trap missed (signal 9), etc. Without this loop, every failed
# teardown leaks an EC2 + DNS + DB row until manual ops cleanup —
# 2026-04-23 staging hit the 64 vCPU AWS quota from ~27 such orphans.
#
# Why not rely on per-test-run teardown:
# - Per-run teardown is best-effort by definition. Any process death
# after the test starts but before the trap fires leaves debris.
# - GH Actions cancellation kills the runner without grace period.
# The workflow's `if: always()` step usually catches this, but it
# too can fail (CP transient 5xx, runner network issue at the
# wrong moment).
# - Even when teardown runs, the CP cascade is best-effort in places
# (cascadeTerminateWorkspaces logs+continues; DNS deletion same).
# - This sweep is the catch-all that converges staging back to clean
# regardless of which specific path leaked.
#
# The PROPER fix is making CP cleanup transactional + verify-after-
# terminate (filed separately as cleanup-correctness work). This
# workflow is the safety net that catches everything else AND any
# future leak source we haven't yet identified.
on:
schedule:
# Every 15 min. E2E orgs are short-lived (~8-25 min wall clock from
# create to teardown — canary is ~8 min, full SaaS ~25 min). The
# previous hourly + 120-min stale threshold meant a leaked tenant
# could keep an EC2 alive for up to 2 hours, eating ~2 vCPU per
# leak. Tightening the cadence + threshold reduces the worst-case
# leak window from 120 min to ~45 min (15-min sweep cadence + 30-min
# threshold) without risk of catching in-progress runs (the longest
# e2e run is the ~25-min full SaaS suite, still under the 30-min
# threshold).
# See molecule-controlplane#420 for the leak-class accounting that
# motivated this tightening.
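# Worked worst case: a tenant leaks just after a tick at t=0, crosses
# the 30-min threshold at t=30, and the next */15 tick (by t=45 at
# the latest) classifies it stale and deletes it.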
- cron: '*/15 * * * *'
# Don't let two sweeps fight. Cron + workflow_dispatch could overlap
# on a manual trigger; queue rather than parallel-delete.
concurrency:
group: sweep-stale-e2e-orgs
cancel-in-progress: false
permissions:
contents: read
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
sweep:
name: Sweep e2e orgs
runs-on: ubuntu-latest
# NOTE: Phase 3 (RFC #219 §1) `continue-on-error: true` removed
# 2026-05-11. The "surface broken workflows without blocking"
# rationale was correctly applied to advisory/lint workflows but
# wrong for this janitor — silent failure here masks real-money
# tenant leaks. Hongming observed 15 leaked EC2 in molecule-canary
# (004947743811) us-east-2 at 11:05Z 2026-05-11 because the sweep
# had been exiting with status 2 on every tick and the failure was
# swallowed.
# See `feedback_strict_root_only_after_class_a` — critical janitors
# must fail loud. A follow-up `notify-failure` step below also
# surfaces breakage to ops even if branch-protection wiring is
# adjusted to keep this off the required-checks list.
timeout-minutes: 15
env:
MOLECULE_CP_URL: https://staging-api.moleculesai.app
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '30' }}
DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
# Refuse to delete more than this many orgs in one tick. If the
# CP DB is briefly empty (or the admin endpoint goes weird and
# returns no created_at), every e2e- org would look stale.
# Bailing protects against runaway nukes.
SAFETY_CAP: 50
steps:
- name: Verify admin token present
run: |
if [ -z "$ADMIN_TOKEN" ]; then
echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
exit 2
fi
echo "Admin token present ✓"
- name: Identify stale e2e orgs
id: identify
run: |
set -euo pipefail
# Fetch into a file so the python step reads it via stdin —
# cleaner than embedding $(curl ...) into a heredoc.
curl -sS --fail-with-body --max-time 30 \
"$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
> orgs.json
# Filter:
# 1. slug starts with one of the ephemeral test prefixes:
# - 'e2e-' — covers e2e-smoke- (formerly e2e-canary-),
# e2e-canvas-*, etc.
# - 'rt-e2e-' — runtime-test harness fixtures (RFC #2251);
# missing this prefix left two such tenants
# orphaned 8h on staging (2026-05-03), then
# hard-failed redeploy-tenants-on-staging
# and broke the staging→main auto-promote
# chain. Kept in sync with the EPHEMERAL_PREFIX_RE
# regex in redeploy-tenants-on-staging.yml.
# 2. created_at is older than MAX_AGE_MINUTES ago
# Output one slug per line to a file the next step reads.
python3 > stale_slugs.txt <<'PY'
import json, os
from datetime import datetime, timezone, timedelta
# SSOT for this list lives in the controlplane Go code:
# molecule-controlplane/internal/slugs/ephemeral.go
# (var EphemeralPrefixes). The redeploy-fleet auto-rollout
# also reads from there to SKIP these slugs — without that
# filter, fleet redeploy SSM-failed in-flight E2E tenants
# whose containers were still booting, breaking the test
# that just spun them up (molecule-controlplane#493).
# Update both files together.
EPHEMERAL_PREFIXES = ("e2e-", "rt-e2e-")
with open("orgs.json") as f:
data = json.load(f)
max_age = int(os.environ["MAX_AGE_MINUTES"])
cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age)
for o in data.get("orgs", []):
slug = o.get("slug", "")
if not slug.startswith(EPHEMERAL_PREFIXES):
continue
created = o.get("created_at")
if not created:
# Defensively skip rows without created_at — better
# to leave one orphan than nuke a brand-new row
# whose timestamp didn't render.
continue
# Python 3.11+ handles RFC3339 with Z directly via
# fromisoformat; older runners need the trailing Z swap.
created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
if created_dt < cutoff:
print(slug)
PY
count=$(wc -l < stale_slugs.txt | tr -d ' ')
echo "Found $count stale e2e org(s) older than ${MAX_AGE_MINUTES}m"
if [ "$count" -gt 0 ]; then
echo "First 20:"
head -20 stale_slugs.txt | sed 's/^/ /'
fi
echo "count=$count" >> "$GITHUB_OUTPUT"
- name: Safety gate
if: steps.identify.outputs.count != '0'
run: |
count="${{ steps.identify.outputs.count }}"
if [ "$count" -gt "$SAFETY_CAP" ]; then
echo "::error::Refusing to delete $count orgs in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means the CP admin API returned no created_at or returned a degraded result. Re-run with workflow_dispatch + max_age_minutes if intentional."
exit 1
fi
echo "Within safety cap ($count ≤ $SAFETY_CAP) ✓"
- name: Delete stale orgs
if: steps.identify.outputs.count != '0' && env.DRY_RUN != 'true'
run: |
set -uo pipefail
deleted=0
failed=0
while IFS= read -r slug; do
[ -z "$slug" ] && continue
# The DELETE handler requires {"confirm": "<slug>"} matching
# the URL slug — fat-finger guard. Idempotent: re-issuing
# picks up via org_purges.last_step.
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
# pollution of the captured status (lint-curl-status-capture.yml).
set +e
curl -sS -o /tmp/del_resp -w "%{http_code}" \
--max-time 60 \
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
-H "Content-Type: application/json" \
-d "{\"confirm\":\"$slug\"}" >/tmp/del_code
set -e
# Stderr from curl (-sS shows dial errors etc.) goes to runner log.
http_code=$(cat /tmp/del_code 2>/dev/null || echo "000")
if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
deleted=$((deleted+1))
echo " deleted: $slug"
else
failed=$((failed+1))
echo " FAILED ($http_code): $slug — $(cat /tmp/del_resp 2>/dev/null | head -c 200)"
fi
done < stale_slugs.txt
echo ""
echo "Sweep summary: deleted=$deleted failed=$failed"
# Don't fail the workflow on per-org delete errors — the
# sweeper is best-effort. Next hourly tick re-attempts. We
# only fail loud at the safety-cap gate above.
- name: Sweep orphan tunnels
# Stale-org cleanup deletes the org (which cascades to tunnel
# delete inside the CP). But when that cascade fails partway —
# CP transient 5xx after the org row is deleted but before the
# CF tunnel delete completes — the tunnel persists with no
# matching org row. The reconciler in internal/sweep flags this
# as `cf_tunnel kind=orphan`, but nothing automatically reaps it.
#
# `/cp/admin/orphan-tunnels/cleanup` is the operator-triggered
# reaper. Calling it here at the end of every sweep tick
# converges the staging CF account to clean even when CP
# cascades half-fail.
#
# PR #492 made the underlying DeleteTunnel actually check
# status — pre-fix it silent-succeeded on CF code 1022
# ("active connections"), so this step would have been a no-op
# against stuck connectors. Post-fix the cleanup invokes
# CleanupTunnelConnections + retry, which actually clears the
# 1022 case. (#2987)
#
# Best-effort. Failure here doesn't fail the workflow — next
# tick re-attempts. Errors flow to step output for ops review.
if: env.DRY_RUN != 'true'
run: |
set +e
curl -sS -o /tmp/cleanup_resp -w "%{http_code}" \
--max-time 60 \
-X POST "$MOLECULE_CP_URL/cp/admin/orphan-tunnels/cleanup" \
-H "Authorization: Bearer $ADMIN_TOKEN" >/tmp/cleanup_code
set -e
http_code=$(cat /tmp/cleanup_code 2>/dev/null || echo "000")
body=$(cat /tmp/cleanup_resp 2>/dev/null | head -c 500)
if [ "$http_code" = "200" ]; then
count=$(echo "$body" | python3 -c "import sys,json; d=json.loads(sys.stdin.read() or '{}'); print(d.get('deleted_count', 0))" 2>/dev/null || echo "0")
failed_n=$(echo "$body" | python3 -c "import sys,json; d=json.loads(sys.stdin.read() or '{}'); print(len(d.get('failed') or {}))" 2>/dev/null || echo "0")
echo "Orphan-tunnel sweep: deleted=$count failed=$failed_n"
else
echo "::warning::orphan-tunnels cleanup returned HTTP $http_code — body: $body"
fi
- name: Dry-run summary
if: env.DRY_RUN == 'true'
run: |
echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s) AND triggered orphan-tunnels cleanup. Re-run with dry_run=false to actually delete."
- name: Notify on sweep failure
# Fail-loud companion to dropping `continue-on-error: true`.
# If any prior step failed (missing token, CP 5xx, safety-cap
# tripped, etc.) emit a clearly-tagged ::error:: line so the
# Gitea runs UI + any log-tail consumer (Loki SOPRefireRule)
# flags this. Without this step, an early `exit 2` shows as a
# red run but the message can scroll past in busy log windows;
# the explicit tag here is greppable from the orchestrator
# triage loop.
if: failure()
run: |
echo "::error::sweep-stale-e2e-orgs FAILED — staging tenants are LEAKING. See prior step logs. Common causes: (a) CP_STAGING_ADMIN_API_TOKEN secret missing/rotated, (b) staging-api.moleculesai.app 5xx, (c) safety-cap tripped (CP admin API returning malformed orgs). Manual cleanup of leaked EC2 + DNS may be required while this is broken."
exit 1

View File

@ -1,65 +0,0 @@
name: Ops Scripts Tests
# Ported from .github/workflows/test-ops-scripts.yml on 2026-05-11 per
# RFC internal#219 §1 sweep.
#
# Differences from the GitHub version:
# - Dropped `merge_group:` trigger (no Gitea merge queue).
# - on.paths references .gitea/workflows/test-ops-scripts.yml (this
# file) instead of the .github/ one.
# - Workflow-level env.GITHUB_SERVER_URL set.
# - `continue-on-error: true` on the job (RFC §1 contract).
#
# Runs the unittest suite for scripts/ on every PR + push that touches
# anything under scripts/. Kept separate from the main CI so a script-only
# change doesn't trigger the heavier Go/Canvas/Python pipelines.
#
# Discovery layout: tests sit alongside the code they test (see
# scripts/ops/test_sweep_cf_decide.py for the pattern; scripts/
# test_build_runtime_package.py for the rewriter coverage). The job
# below runs `unittest discover` TWICE — once from `scripts/`, once
# from `scripts/ops/` — because neither dir has an `__init__.py`, so
# a single discover from `scripts/` doesn't recurse into the ops
# subdir. Two passes is simpler than retrofitting namespace packages.
on:
push:
branches: [main, staging]
paths:
- 'scripts/**'
- '.gitea/workflows/test-ops-scripts.yml'
pull_request:
branches: [main, staging]
paths:
- 'scripts/**'
- '.gitea/workflows/test-ops-scripts.yml'
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
test:
name: Ops scripts (unittest)
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
continue-on-error: true
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.11'
- name: Run scripts/ unittests (build_runtime_package, ...)
# Top-level scripts/ tests live alongside their target file
# (e.g. scripts/test_build_runtime_package.py exercises
# scripts/build_runtime_package.py). discover from scripts/
# picks up only top-level test_*.py because scripts/ops/ has
# no __init__.py — that's intentional, so we run two passes.
working-directory: scripts
run: python -m unittest discover -t . -p 'test_*.py' -v
- name: Run scripts/ops/ unittests (sweep_cf_decide, ...)
working-directory: scripts/ops
run: python -m unittest discover -p 'test_*.py' -v

View File

@ -28,7 +28,7 @@ import sys
import urllib.request
from pathlib import Path
CANONICAL_FILE = Path(".gitea/workflows/secret-scan.yml")
CANONICAL_FILE = Path(".github/workflows/secret-scan.yml")
# Public consumer mirrors. Each entry is (label, raw_url) — raw_url
# points at the file's RAW content on the consumer's default branch

.github/workflows/auto-tag-runtime.yml vendored Normal file
View File

@ -0,0 +1,138 @@
name: auto-tag-runtime
# Auto-tag runtime releases on every merge to main that touches workspace/.
# This is the entry point of the runtime CD chain:
#
# merge PR → auto-tag-runtime (this) → publish-runtime → cascade → template
# image rebuilds → repull on hosts.
#
# Default bump is patch. Override via PR label `release:minor` or
# `release:major` BEFORE merging — the label is read off the merged PR
# associated with the push commit.
#
# Skips when:
# - The push isn't to main (other branches don't auto-release).
# - The merge commit message contains `[skip-release]` (escape hatch
# for cleanup PRs that touch workspace/ but shouldn't ship).
on:
push:
branches: [main]
paths:
- "workspace/**"
- "scripts/build_runtime_package.py"
- ".github/workflows/auto-tag-runtime.yml"
- ".github/workflows/publish-runtime.yml"
permissions:
contents: write # to push the new tag
pull-requests: read # to read labels off the merged PR
concurrency:
# Serialize tag bumps so two near-simultaneous merges can't both think
# they're 0.1.6 and race to push the same tag.
group: auto-tag-runtime
cancel-in-progress: false
jobs:
tag:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0 # need full tag history for `git describe` / sort
- name: Skip when commit asks
id: skip
run: |
MSG=$(git log -1 --format=%B "${{ github.sha }}")
if echo "$MSG" | grep -qiE '\[skip-release\]|\[no-release\]'; then
echo "skip=true" >> "$GITHUB_OUTPUT"
echo "Commit message contains [skip-release] — no tag will be created."
else
echo "skip=false" >> "$GITHUB_OUTPUT"
fi
- name: Determine bump kind from PR label
id: bump
if: steps.skip.outputs.skip != 'true'
env:
# Gitea-shape token (act_runner forwards GITHUB_TOKEN as a
# short-lived per-run secret with read access to this repo).
# We hit `/api/v1/repos/.../pulls?state=closed` directly
# because `gh pr list` calls Gitea's GraphQL endpoint, which
# returns HTTP 405 (issue #75 / post-#66 sweep).
GITEA_TOKEN: ${{ github.token }}
REPO: ${{ github.repository }}
GITEA_API_URL: ${{ github.server_url }}/api/v1
PUSH_SHA: ${{ github.sha }}
run: |
# Find the merged PR whose merge_commit_sha matches this push.
# Gitea's `/repos/{owner}/{repo}/pulls?state=closed` returns
# PRs sorted newest-first; we paginate up to 50 and jq-filter
# on `merge_commit_sha == PUSH_SHA`. Bounded — auto-tag fires
# per push to main, so the matching PR is always among the
# most recent closures. 50 is comfortably more than the
# ~10-20 staging→main promotes that close in any reasonable
# window.
set -euo pipefail
PRS_JSON=$(curl --fail-with-body -sS \
-H "Authorization: token ${GITEA_TOKEN}" \
-H "Accept: application/json" \
"${GITEA_API_URL}/repos/${REPO}/pulls?state=closed&sort=newest&limit=50" \
2>/dev/null || echo "[]")
PR=$(printf '%s' "$PRS_JSON" \
| jq -c --arg sha "$PUSH_SHA" \
'[.[] | select(.merged_at != null and .merge_commit_sha == $sha)] | .[0] // empty')
if [ -z "$PR" ] || [ "$PR" = "null" ]; then
echo "No merged PR found for ${PUSH_SHA} — defaulting to patch bump."
echo "kind=patch" >> "$GITHUB_OUTPUT"
exit 0
fi
# Gitea returns labels under `.labels[].name`, same shape as
# GitHub's REST. The previous `gh pr list --json number,labels`
# output was identical; jq filter unchanged.
LABELS=$(printf '%s' "$PR" | jq -r '.labels[]?.name // empty')
if echo "$LABELS" | grep -qx 'release:major'; then
echo "kind=major" >> "$GITHUB_OUTPUT"
elif echo "$LABELS" | grep -qx 'release:minor'; then
echo "kind=minor" >> "$GITHUB_OUTPUT"
else
echo "kind=patch" >> "$GITHUB_OUTPUT"
fi
- name: Compute next version from latest runtime-v* tag
id: version
if: steps.skip.outputs.skip != 'true'
run: |
# Find the highest runtime-vX.Y.Z tag. `sort -V` handles semver
# ordering; `grep` filters to the right tag prefix.
LATEST=$(git tag --list 'runtime-v*' | sort -V | tail -1)
if [ -z "$LATEST" ]; then
# No prior tag: seed at 0.0.0 so the first (patch) bump tags runtime-v0.0.1.
CURRENT="0.0.0"
else
CURRENT="${LATEST#runtime-v}"
fi
MAJOR=$(echo "$CURRENT" | cut -d. -f1)
MINOR=$(echo "$CURRENT" | cut -d. -f2)
PATCH=$(echo "$CURRENT" | cut -d. -f3)
case "${{ steps.bump.outputs.kind }}" in
major) MAJOR=$((MAJOR+1)); MINOR=0; PATCH=0;;
minor) MINOR=$((MINOR+1)); PATCH=0;;
patch) PATCH=$((PATCH+1));;
esac
NEW="$MAJOR.$MINOR.$PATCH"
echo "current=$CURRENT" >> "$GITHUB_OUTPUT"
echo "new=$NEW" >> "$GITHUB_OUTPUT"
echo "Bumping runtime $CURRENT → $NEW (${{ steps.bump.outputs.kind }})"
- name: Push new tag
if: steps.skip.outputs.skip != 'true'
run: |
NEW_TAG="runtime-v${{ steps.version.outputs.new }}"
git config user.name "github-actions[bot]"
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
git tag -a "$NEW_TAG" -m "runtime $NEW_TAG (auto-bump from ${{ steps.bump.outputs.kind }})"
git push origin "$NEW_TAG"
echo "Pushed $NEW_TAG — publish-runtime workflow will fire on the tag."

View File

@ -0,0 +1,111 @@
name: branch-protection drift check
# Catches out-of-band edits to branch protection (UI clicks, manual gh
# api PATCH from a one-off ops session) by comparing live state against
# tools/branch-protection/apply.sh's desired state every day. Fails the
# workflow when they drift; the failure is the signal.
#
# When it fails: re-run apply.sh to put the live state back to the
# script's intent, OR update apply.sh to encode the new intent and
# commit. Either way the script is the source of truth.
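#
# The core comparison is shaped like this (a sketch; desired_contexts
# is a hypothetical stand-in for however apply.sh renders its desired
# payload):
#
#   gh api "repos/${REPO}/branches/staging/protection/required_status_checks" \
#     --jq '.contexts[]' | sort > live.txt
#   desired_contexts | sort > desired.txt
#   diff -u desired.txt live.txt \
#     || { echo "::error::branch-protection drift on staging"; exit 1; }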
on:
schedule:
# 14:00 UTC daily. Off-hours for most teams; gives a fresh signal
# at the start of every working day.
- cron: '0 14 * * *'
workflow_dispatch:
pull_request:
branches: [staging, main]
paths:
- 'tools/branch-protection/**'
- '.github/workflows/**'
- '.github/workflows/branch-protection-drift.yml'
permissions:
contents: read
jobs:
drift:
name: Branch protection drift
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
# Token strategy by trigger:
#
# - schedule (daily canary): hard-fail when the admin token is
# missing. This is the *only* trigger where silent soft-skip is
# dangerous — a missing secret on the cron run means the drift
# gate has effectively disappeared with no human in the loop to
# notice. Per feedback_schedule_vs_dispatch_secrets_hardening.md
# the rule is "schedule/automated triggers must hard-fail".
#
# - pull_request (touching tools/branch-protection/**): soft-skip
# with a prominent warning. A PR cannot retroactively drift the
# live state — drift happens *between* PRs (UI clicks, manual
# gh api PATCH) and is the schedule's job to catch. The PR-time
# gate would only catch typos in apply.sh, which the apply.sh
# *_payload unit tests catch better. A human is reviewing the
# PR and will see the warning in the workflow log.
#
# - workflow_dispatch (operator one-off): soft-skip with warning,
# so an operator can run a diagnostic without configuring the
# secret first.
- name: Verify admin token present (hard-fail on schedule only)
env:
GH_TOKEN_FOR_ADMIN_API: ${{ secrets.GH_TOKEN_FOR_ADMIN_API }}
run: |
if [[ -n "$GH_TOKEN_FOR_ADMIN_API" ]]; then
echo "GH_TOKEN_FOR_ADMIN_API present — drift_check will run with admin scope."
exit 0
fi
if [[ "${{ github.event_name }}" == "schedule" ]]; then
echo "::error::GH_TOKEN_FOR_ADMIN_API secret missing on the daily canary." >&2
echo "" >&2
echo "The schedule run is the SoT for branch-protection drift detection." >&2
echo "Without admin scope it silently passes, hiding any out-of-band edits." >&2
echo "Set GH_TOKEN_FOR_ADMIN_API at Settings → Secrets and variables → Actions." >&2
exit 1
fi
echo "::warning::GH_TOKEN_FOR_ADMIN_API secret missing — drift_check will be SKIPPED."
echo "::warning::PR drift checks need repo-admin scope to read /branches/:b/protection."
echo "::warning::This is non-fatal: the daily schedule run is the canonical drift gate."
echo "SKIP_DRIFT_CHECK=1" >> "$GITHUB_ENV"
- name: Run drift check
if: env.SKIP_DRIFT_CHECK != '1'
env:
# Repo-admin scope, needed for /branches/:b/protection.
GH_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADMIN_API }}
run: bash tools/branch-protection/drift_check.sh
# Self-test the parity script before running it on the real
# workflows — pins the script's classification logic against
# synthetic safe/unsafe/missing/unsafe-mix/matrix fixtures so a
# regression in the script can't false-pass on the production
# workflow audit. Cheap (~0.5s); always runs.
- name: Self-test check-name parity script
run: bash tools/branch-protection/test_check_name_parity.sh
# Check-name parity gate (#144 / saved memory
# feedback_branch_protection_check_name_parity).
#
# drift_check.sh asserts the live branch protection matches what
# apply.sh would set; check_name_parity.sh closes the orthogonal
# gap: it asserts every required check name in apply.sh maps to a
# workflow job whose "always emits this status" shape is intact.
#
# The two checks fail in different scenarios:
#
# - drift_check fails → live state was rewritten out-of-band
# (UI click, manual PATCH).
# - check_name_parity fails → an apply.sh required name has no
# emitter, OR the emitting workflow has a top-level paths:
# filter without per-step if-gates (the silent-block shape).
#
# Cheap (~1s); runs without the admin token because it only reads
# apply.sh + .github/workflows/ from the checkout.
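#
# The silent-block shape it flags looks like this (sketch):
#
#   on:
#     pull_request:
#       paths: ['go/**']    # top-level filter, no always-run emitter
#   jobs:
#     build:
#       name: Go Build      # <- the protected check name
#
# On a docs-only PR the job never runs, so the protected context is
# never emitted and branch protection waits on it forever.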
- name: Run check-name parity gate
run: bash tools/branch-protection/check_name_parity.sh

View File

@ -1,34 +1,19 @@
name: canary-verify
# Runs the canary smoke suite against the staging canary tenant fleet
# after a new :staging-<sha> image lands in ECR. On green, calls the
# CP redeploy-fleet endpoint to promote :staging-<sha> → :latest so
# the prod tenant fleet's 5-minute auto-updater picks up the verified
# digest. On red, :latest stays on the prior known-good digest and
# prod is untouched.
#
# Registry note (2026-05-10): This workflow previously used GHCR
# (ghcr.io/molecule-ai/platform-tenant) — that registry was retired
# during the 2026-05-06 Gitea suspension migration when publish-
# workspace-server-image.yml switched to the operator's ECR org
# (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/
# platform-tenant). The GHCR → ECR migration was never applied to
# this file, so canary-verify was silently smoke-testing the stale
# GHCR image while the actual staging/prod tenants ran the ECR image.
# Result: smoke tests could not catch a broken ECR build. Fix:
# - Wait step: reads SHA from running canary /health (tenant-
# agnostic, works regardless of registry).
# - Promote step: calls CP redeploy-fleet endpoint with target_tag=
# staging-<sha>, same mechanism as redeploy-tenants-on-main.yml.
# No longer attempts GHCR crane ops.
# after a new :staging-<sha> image lands in GHCR. On green, promotes
# :staging-<sha> → :latest so the prod tenant fleet's 5-minute
# auto-updater picks up the verified digest. On red, :latest stays
# on the prior known-good digest and prod is untouched.
#
# Dependencies:
# - publish-workspace-server-image.yml publishes :staging-<sha>
# to ECR on staging and main merges.
# - Canary tenants are configured to pull :staging-<sha> from ECR
# (TENANT_IMAGE env set to the ECR :staging-<sha> tag).
# (NOT :latest) on main merge
# - canary tenants are configured to pull :staging-<sha> as their
# tenant image (set TENANT_IMAGE=ghcr.io/…:staging-<sha> on the
# canary provisioner code path OR rotate via an admin endpoint)
# - Repo secrets CANARY_TENANT_URLS / CANARY_ADMIN_TOKENS /
# CANARY_CP_SHARED_SECRET are populated.
# CANARY_CP_SHARED_SECRET are populated
on:
workflow_run:
@ -42,12 +27,8 @@ permissions:
actions: read
env:
# ECR registry (post-2026-05-06 SSOT for tenant images).
# publish-workspace-server-image.yml pushes here.
IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
# CP endpoint for redeploy-fleet (used in promote step below).
CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
IMAGE_NAME: ghcr.io/molecule-ai/platform
TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant
jobs:
canary-smoke:
@ -71,12 +52,6 @@ jobs:
# the new SHA (~2-3 min typical vs 6 min fixed). Falls back to
# proceeding after 7 min even if not all canaries responded —
# the smoke suite will catch any that didn't update.
#
# NOTE: The SHA is read from the running tenant's /health response,
# NOT from a registry lookup. This is registry-agnostic and works
# regardless of whether the tenant pulls from ECR, GHCR, or any
# other registry — the canary is telling us what it's actually
# running, which is the ground truth for smoke testing.
env:
CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
EXPECTED_SHA: ${{ steps.compute.outputs.sha }}
@ -158,98 +133,42 @@ jobs:
} >> "$GITHUB_STEP_SUMMARY"
promote-to-latest:
# On green, calls the CP redeploy-fleet endpoint with target_tag=
# staging-<sha> to promote the verified ECR image. This is the same
# mechanism as redeploy-tenants-on-main.yml — no GHCR crane ops.
#
# Pre-fix history: the old GHCR promote step used `crane tag` against
# ghcr.io/molecule-ai/platform-tenant, but publish-workspace-server-
# image.yml had already migrated to ECR on 2026-05-07 (commit
# 10e510f5). The GHCR tags were never updated, so this step was
# silently promoting a stale GHCR image while actual prod tenants
# pulled from ECR. Canary smoke tests were GHCR-targeted and could
# not catch a broken ECR build.
# On green, retag :staging-<sha> → :latest for BOTH images.
# crane is a lightweight registry client (no Docker daemon needed on
# the runner) that can retag remotely with a single API call each.
# Gated on smoke_ran=true — without a real canary fleet the smoke
# step no-ops with success, and we don't want that to silently
# auto-promote every main merge.
needs: canary-smoke
if: ${{ needs.canary-smoke.result == 'success' && needs.canary-smoke.outputs.smoke_ran == 'true' }}
runs-on: ubuntu-latest
env:
SHA: ${{ needs.canary-smoke.outputs.sha }}
CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}
# CP_ADMIN_API_TOKEN gates write access to the redeploy endpoint.
# Stored at the repo level so all workflows pick it up automatically.
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
# canary_slug pin: deploy the verified :staging-<sha> to the canary
# first (soak 120s), then fan out to the rest of the fleet.
CANARY_SLUG: ${{ vars.CANARY_PROMOTE_SLUG || '' }}
SOAK_SECONDS: ${{ vars.CANARY_PROMOTE_SOAK || '120' }}
BATCH_SIZE: ${{ vars.CANARY_PROMOTE_BATCH || '3' }}
steps:
- name: Check CP credentials
- uses: imjasonh/setup-crane@6da1ae018866400525525ce74ff892880c099987 # v0.5
- name: GHCR login
run: |
if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
echo "::error::CP_ADMIN_API_TOKEN secret is not set — promote step cannot call redeploy-fleet."
echo "::error::Set it at: repo Settings → Actions → Variables and Secrets → New Secret."
exit 1
fi
echo "${{ secrets.GITHUB_TOKEN }}" | \
crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin
- name: Promote verified ECR image to :latest
- name: Retag platform :staging-<sha> → :latest
run: |
set -euo pipefail
crane tag \
"${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \
latest
TARGET_TAG="staging-${SHA}"
BODY=$(jq -nc \
--arg tag "$TARGET_TAG" \
--argjson soak "${SOAK_SECONDS:-120}" \
--argjson batch "${BATCH_SIZE:-3}" \
--argjson dry false \
'{
target_tag: $tag,
soak_seconds: $soak,
batch_size: $batch,
dry_run: $dry
}')
if [ -n "${CANARY_SLUG:-}" ]; then
BODY=$(jq '. * {canary_slug: $slug}' --arg slug "$CANARY_SLUG" <<<"$BODY")
fi
echo "Calling: POST $CP_URL/cp/admin/tenants/redeploy-fleet"
echo " target_tag: $TARGET_TAG"
echo " body: $BODY"
HTTP_RESPONSE=$(mktemp)
HTTP_CODE_FILE=$(mktemp)
set +e
curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
-m 1200 \
-H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
-H "Content-Type: application/json" \
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
-d "$BODY" >"$HTTP_CODE_FILE"
CURL_EXIT=$?
set -e
HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
[ -z "$HTTP_CODE" ] && HTTP_CODE="000"
echo "HTTP $HTTP_CODE (curl exit $CURL_EXIT)"
cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
if [ "$HTTP_CODE" -ge 400 ]; then
echo "::error::CP redeploy-fleet returned HTTP $HTTP_CODE — refusing to proceed."
exit 1
fi
- name: Retag tenant :staging-<sha> → :latest
run: |
crane tag \
"${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \
latest
- name: Summary
run: |
{
echo "## Canary verified — :latest promoted via CP redeploy-fleet"
echo ""
echo "- **Target tag:** \`staging-${{ needs.canary-smoke.outputs.sha }}\`"
echo "- **Registry:** ECR (\`${TENANT_IMAGE_NAME}\`)"
echo "- **Canary slug:** \`${CANARY_SLUG:-<none>}\` (soak ${SOAK_SECONDS}s)"
echo "- **Batch size:** ${BATCH_SIZE:-3}"
echo ""
echo "CP redeploy-fleet is rolling out the verified image across the prod fleet."
echo "The fleet's 5-minute health-check loop will pick up the update automatically."
echo "## Canary verified — :latest promoted"
echo
echo "- \`${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${IMAGE_NAME}:latest\`"
echo "- \`${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${TENANT_IMAGE_NAME}:latest\`"
echo
echo "Prod tenant fleet will pick up the new digest on its next 5-min auto-update cycle."
} >> "$GITHUB_STEP_SUMMARY"

View File

@ -0,0 +1,123 @@
name: Check merge_group trigger on required workflows
# Pre-merge guard against the deadlock pattern where a workflow whose
# check is in `required_status_checks` lacks a `merge_group:` trigger.
# Without it, GitHub merge queue stalls forever in AWAITING_CHECKS
# because the required check can't fire on `gh-readonly-queue/...` refs.
#
# This workflow:
# 1. Lists required status checks on the branch protection rule for `staging`
# 2. For each required check, finds the workflow that produces it (by job
# name match)
# 3. Fails if any such workflow lacks `merge_group:` in its triggers
#
# Reasoning for staging-only: main has its own CI gating model (PR review),
# but staging is what the merge queue runs on, so it's the trigger that
# matters.
on:
pull_request:
paths:
- '.github/workflows/**.yml'
- '.github/workflows/**.yaml'
push:
branches: [staging, main]
paths:
- '.github/workflows/**.yml'
- '.github/workflows/**.yaml'
# Self-listen on merge_group so the linter passes its own queue run.
merge_group:
types: [checks_requested]
jobs:
check:
name: Required workflows have merge_group trigger
runs-on: ubuntu-latest
permissions:
contents: read
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify merge_group trigger on required-check workflows
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
shell: bash
run: |
set -euo pipefail
# Branch we care about — the one merge queue runs on.
BRANCH=staging
# Pull the list of required status check contexts. If the branch
# has no protection or no required checks, exit clean — nothing
# to lint.
REQUIRED=$(gh api "repos/${REPO}/branches/${BRANCH}/protection/required_status_checks" \
--jq '.contexts[]' 2>/dev/null || true)
if [ -z "$REQUIRED" ]; then
echo "No required status checks on ${BRANCH} — nothing to verify."
exit 0
fi
echo "Required checks on ${BRANCH}:"
echo "${REQUIRED}" | sed 's/^/ - /'
echo
# Build a map: workflow file -> set of names declared in it.
# yq would be cleaner, but it isn't in the default runner image as
# of 2026-04, so we grep/awk the `name:` lines under `jobs:` with
# tools the runner image always ships.
declare -A workflow_jobs
shopt -s nullglob
for wf in .github/workflows/*.yml .github/workflows/*.yaml; do
[ -f "$wf" ] || continue
# Extract the workflow name (the `name:` at file root).
wf_name=$(awk '/^name:[[:space:]]/ {sub(/^name:[[:space:]]+/,""); gsub(/^"|"$/,""); print; exit}' "$wf")
# Extract job display names from the `jobs:` block. We keep two
# kinds of name per file:
# - the workflow-level `name:` (file root, extracted above)
# - each job's `name:` field (4-space indent)
# We collect both because required_status_checks contexts can
# match either, depending on how the workflow was authored.
jobs_block=$(awk '/^jobs:/{flag=1; next} flag' "$wf")
job_names=$(echo "$jobs_block" | awk '/^[[:space:]]{4}name:[[:space:]]/ {sub(/^[[:space:]]+name:[[:space:]]+/,""); gsub(/^["'"'"']|["'"'"']$/,""); print}')
workflow_jobs["$wf"]="${wf_name}"$'\n'"${job_names}"
done
# For each required check, find the workflow that produces it.
# Then verify that workflow lists merge_group as a trigger.
FAILED=0
while IFS= read -r check; do
[ -z "$check" ] && continue
owning_wf=""
for wf in "${!workflow_jobs[@]}"; do
if echo "${workflow_jobs[$wf]}" | grep -Fxq "$check"; then
owning_wf="$wf"
break
fi
done
if [ -z "$owning_wf" ]; then
echo "::warning::Required check '${check}' has no matching workflow in this repo. Skipping (may be from an external app)."
continue
fi
# Does the workflow's trigger list include merge_group?
# Match either bare `merge_group:` line or merge_group with
# subsequent indented config (types: [checks_requested]).
if grep -qE '^[[:space:]]*merge_group:' "$owning_wf"; then
echo "OK: '${check}' (in $owning_wf) — has merge_group trigger"
else
echo "::error file=${owning_wf}::Required check '${check}' is produced by ${owning_wf}, but the workflow does not declare a 'merge_group:' trigger. With merge queue enabled on ${BRANCH}, this will deadlock the queue (every PR sits AWAITING_CHECKS forever). Add this to the workflow's 'on:' block:"
echo "::error file=${owning_wf}:: merge_group:"
echo "::error file=${owning_wf}:: types: [checks_requested]"
FAILED=1
fi
done <<< "$REQUIRED"
if [ "$FAILED" -ne 0 ]; then
echo
echo "::error::Block. See errors above. Reference: $(grep -l 'reference_merge_queue' /dev/null 2>/dev/null || echo 'memory: reference_merge_queue_enablement.md')."
exit 1
fi
echo
echo "All required workflows on ${BRANCH} declare merge_group triggers."

View File

@ -304,9 +304,13 @@ jobs:
needs: [changes, canvas-build]
# Only fires on direct pushes to main (i.e. after staging→main promotion).
if: needs.changes.outputs.canvas == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main'
permissions:
# Required to post commit comments via the GitHub API.
contents: write
steps:
- name: Write deploy reminder to step summary
- name: Post deploy reminder as commit comment
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COMMIT_SHA: ${{ github.sha }}
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
run: |
@ -333,13 +337,10 @@ jobs:
printf '\n> Posted automatically by CI · commit `%s` · [build log](%s)\n' \
"$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-reminder.md
# Gitea has no commit-comments API (no equivalent of
# POST /repos/{owner}/{repo}/commits/{commit_sha}/comments).
# Write to GITHUB_STEP_SUMMARY instead — both GitHub Actions and
# Gitea Actions render this as the workflow run's summary page,
# which is where operators look for post-deploy action items.
# (#75 / PR-D)
cat /tmp/deploy-reminder.md >> "$GITHUB_STEP_SUMMARY"
gh api \
--method POST \
"repos/${{ github.repository }}/commits/${{ github.sha }}/comments" \
--field "body=@/tmp/deploy-reminder.md"
# Python Lint & Test — required check, always runs. See platform-build
# for the rationale.
@ -365,7 +366,7 @@ jobs:
cache: pip
cache-dependency-path: workspace/requirements.txt
- if: needs.changes.outputs.python == 'true'
run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov 'sqlalchemy>=2.0.0'
run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov
# Coverage flags + fail-under floor moved into workspace/pytest.ini
# (issue #1817) so local `pytest` and CI use identical config.
- if: needs.changes.outputs.python == 'true'

.github/workflows/codeql.yml vendored Normal file
View File

@ -0,0 +1,136 @@
name: CodeQL
# Stub workflow — CodeQL Action is structurally incompatible with Gitea
# Actions (post-2026-05-06 SCM migration off GitHub).
#
# Why this is a stub, not a real CodeQL run:
#
# 1. github/codeql-action/init@v4 hits api.github.com endpoints
# (CodeQL CLI bundle download + query-pack registry + telemetry)
# that Gitea 1.22.x does NOT proxy. The act_runner has
# GITHUB_SERVER_URL=https://git.moleculesai.app correctly set
# (per saved memory feedback_act_runner_github_server_url and
# /config.yaml on the operator host), but the Gitea API surface
# simply does not implement the codeql-action bundle endpoints.
# Observed in run 1d/3101 (2026-05-07): "::error::404 page not
# found" inside the Initialize CodeQL step, before any analysis.
#
# 2. PR #35 attempted to mark `continue-on-error: true` at the JOB
# level (correct YAML structure). Gitea 1.22.6 does NOT propagate
# job-level continue-on-error to the commit-status API — every
# matrix leg still posts `failure` to the status surface, which
# keeps OVERALL=failure on every push to main + staging and
# blocks visual auto-promote signals (#156).
#
# 3. Hongming policy decision (2026-05-07, task #156): CodeQL is
# ADVISORY, not blocking, on Gitea Actions. We do not block PR
# merge or staging→main promotion on CodeQL findings until we
# have a Gitea-compatible static-analysis pipeline.
#
# What this stub preserves:
#
# - Workflow name `CodeQL` (referenced by auto-promote-staging.yml
# line 67 as a workflow_run gate — must stay stable).
# - Job name template `Analyze (${{ matrix.language }})` and the
# 3-leg matrix (go, javascript-typescript, python). Branch
# protection / required-check parity (#144) keys on these
# exact context names.
# - merge_group + push + pull_request + schedule triggers, so the
# merge-queue check name still resolves (per saved memory
# feedback_branch_protection_check_name_parity).
#
# Re-enabling real analysis (future work):
#
# - Option A: self-hosted Semgrep / OpenGrep via a custom action
# that doesn't hit api.github.com. Tracked behind #156 follow-up.
# - Option B: Sonatype Nexus IQ or similar, called from a step
# that uses the Gitea-issued token only.
# - Option C: re-host this workflow on a small GitHub mirror used
# ONLY for SAST (push-mirrored from Gitea). Acceptable trade-off
# if/when payment is restored on a non-suspended GitHub org —
# but per saved memory feedback_no_single_source_of_truth, we
# should design for multi-vendor backup, not GitHub-only SAST.
#
# Until one of those lands, this stub keeps commit-status green so
# the auto-promote chain isn't permanently red on a tool we cannot
# actually run.
#
# Security policy: ADVISORY. We accept the residual risk of un-scanned
# pushes during this window. Compensating controls in place:
# - secret-scan.yml runs on every push (active, blocks on hits)
# - block-internal-paths.yml blocks forbidden file paths
# - lint-curl-status-capture.yml catches one specific class of bug
# - branch-protection-drift.yml + the merge_group required-checks
# parity keep the gate surface stable
# These are not equivalent to CodeQL coverage. Status of the
# replacement plan is tracked in #156.
on:
push:
branches: [main, staging]
pull_request:
branches: [main, staging]
# Required so the matrix legs emit a real result on the queued
# commit instead of a false-green when merge queue is enabled.
# Per saved memory feedback_branch_protection_check_name_parity:
# path-filtered / matrix workflows MUST emit the protected name
# via a job that always runs.
merge_group:
types: [checks_requested]
schedule:
# Weekly heartbeat. Cheap on a stub (the no-op job is ~5s) but
# keeps the workflow visible in Gitea's Actions UI so the next
# operator notices it's a stub instead of a missing surface.
- cron: '30 1 * * 0'
# Workflow-level concurrency: only one stub run per branch/PR at a
# time. cancel-in-progress: false because a quick follow-up push
# shouldn't kill an in-flight run — even though the stub is fast,
# the contract should match a real CodeQL run for when we re-enable.
concurrency:
group: codeql-${{ github.ref }}
cancel-in-progress: false
permissions:
actions: read
contents: read
# No security-events: write — we don't call the upload API anyway,
# GHAS isn't on Gitea.
jobs:
analyze:
# Job NAME shape is load-bearing — auto-promote-staging.yml +
# branch protection both key on `Analyze (${{ matrix.language }})`.
# Do NOT rename without coordinating both surfaces.
name: Analyze (${{ matrix.language }})
runs-on: ubuntu-latest
timeout-minutes: 5
strategy:
fail-fast: false
matrix:
language: [go, javascript-typescript, python]
steps:
# Single-step stub: log the policy decision + emit success.
# Exit 0 explicitly so the commit-status API records `success`
# for each of the three matrix legs.
- name: CodeQL stub (advisory, non-blocking on Gitea)
shell: bash
run: |
set -euo pipefail
cat <<EOF
CodeQL is currently ADVISORY on Gitea Actions (post-2026-05-06).
Language matrix leg: ${{ matrix.language }}
Reason: github/codeql-action/init@v4 calls api.github.com
bundle endpoints that Gitea 1.22.x does not implement.
Observed: "::error::404 page not found" in the Init
CodeQL step on every prior run.
Policy: per Hongming decision 2026-05-07 (#156), CodeQL is
non-blocking until a Gitea-compatible SAST pipeline
lands. See workflow file header for replacement
options + compensating controls.
Status: emitting success so auto-promote isn't permanently
red on a tool we cannot actually run today.
EOF
echo "::notice::CodeQL ${{ matrix.language }} — advisory stub, success."

View File

@ -51,7 +51,7 @@ name: E2E API Smoke Test
# * Pre-pull `alpine:latest` so the platform-server's provisioner
# (`internal/handlers/container_files.go`) can stand up its
# ephemeral token-write helper without a docker.io round-trip.
# * Create `molecule-core-net` bridge network if missing so the
# * Create `molecule-monorepo-net` bridge network if missing so the
# provisioner's container.HostConfig {NetworkMode: ...} attach
# succeeds.
# Item #1 (timeouts) — evidence on recent runs (77/3191, ae/4270, 0e/
@ -163,12 +163,12 @@ jobs:
# when the image is already present.
docker pull alpine:latest >/dev/null
# Provisioner attaches workspace containers to
# molecule-core-net (workspace-server/internal/provisioner/
# molecule-monorepo-net (workspace-server/internal/provisioner/
# provisioner.go::DefaultNetwork). The bridge already exists on
# the operator host's docker daemon — `network create` is
# idempotent via `|| true`.
docker network create molecule-core-net >/dev/null 2>&1 || true
echo "alpine:latest pre-pulled; molecule-core-net ensured."
docker network create molecule-monorepo-net >/dev/null 2>&1 || true
echo "alpine:latest pre-pulled; molecule-monorepo-net ensured."
- name: Start Postgres (docker)
if: needs.detect-changes.outputs.api == 'true'
run: |

View File

@ -34,7 +34,7 @@ name: Handlers Postgres Integration
# So we sidestep `services:` entirely. The job container still uses
# host-net (inherited from runner config; required for cache server
# discovery on the bridge IP 172.18.0.17:42631). We launch a sibling
# postgres on the existing `molecule-core-net` bridge with a
# postgres on the existing `molecule-monorepo-net` bridge with a
# UNIQUE name per run — `pg-handlers-${RUN_ID}-${RUN_ATTEMPT}` — and
# read its bridge IP via `docker inspect`. A host-net job container
# can reach a bridge-net container directly via the bridge IP (verified
@ -44,7 +44,7 @@ name: Handlers Postgres Integration
# + No host-port collision; N parallel runs share the bridge cleanly
# + `if: always()` cleanup runs even on test-step failure
# - One more step in the workflow (+~3 lines)
# - Requires `molecule-core-net` to exist on the operator host
# - Requires `molecule-monorepo-net` to exist on the operator host
# (it does; declared in docker-compose.yml + docker-compose.infra.yml)
#
# Class B Hongming-owned CICD red sweep, 2026-05-08.
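#
# The launch + discover pair described above is roughly (a sketch;
# the real steps live further down this file, and the postgres image
# tag is an assumption):
#
#   docker run -d --name "$PG_NAME" --network "$PG_NETWORK" \
#     -e POSTGRES_PASSWORD=postgres postgres:16
#   PG_IP=$(docker inspect "$PG_NAME" --format \
#     "{{(index .NetworkSettings.Networks \"$PG_NETWORK\").IPAddress}}")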
@ -96,7 +96,7 @@ jobs:
PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
# Bridge network already exists on the operator host (declared
# in docker-compose.yml + docker-compose.infra.yml).
PG_NETWORK: molecule-core-net
PG_NETWORK: molecule-monorepo-net
defaults:
run:
working-directory: workspace-server

View File

@ -56,40 +56,21 @@ jobs:
run: ${{ steps.decide.outputs.run }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
id: filter
with:
filters: |
run:
- 'workspace-server/**'
- 'canvas/**'
- 'tests/harness/**'
- '.github/workflows/harness-replays.yml'
- id: decide
run: |
# workflow_dispatch: always run (manual trigger)
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "run=true" >> "$GITHUB_OUTPUT"
echo "debug=manual-trigger" >> "$GITHUB_OUTPUT"
exit 0
fi
# Determine the base commit to diff against.
# For pull_request: use base.sha (the merge-base with main/staging).
# For push: use github.event.before (the previous tip of the branch).
# Fallback for new branches (all-zeros SHA): run everything.
if [ "${{ github.event_name }}" = "pull_request" ] && \
[ -n "${{ github.event.pull_request.base.sha }}" ]; then
BASE="${{ github.event.pull_request.base.sha }}"
elif [ -n "${{ github.event.before }}" ] && \
! echo "${{ github.event.before }}" | grep -qE '^0+$'; then
BASE="${{ github.event.before }}"
else
# New branch or github.event.before unavailable — run everything.
echo "run=true" >> "$GITHUB_OUTPUT"
echo "debug=new-branch-fallback" >> "$GITHUB_OUTPUT"
exit 0
fi
# GitHub Actions and Gitea Actions both expose github.sha for HEAD.
DIFF=$(git diff --name-only "$BASE" "${{ github.sha }}" 2>/dev/null)
echo "debug=diff-base=$BASE diff-files=$DIFF" >> "$GITHUB_OUTPUT"
if echo "$DIFF" | grep -qE '^workspace-server/|^canvas/|^tests/harness/|^.github/workflows/harness-replays\.yml$'; then
echo "run=true" >> "$GITHUB_OUTPUT"
else
echo "run=false" >> "$GITHUB_OUTPUT"
echo "run=${{ steps.filter.outputs.run }}" >> "$GITHUB_OUTPUT"
fi
# ONE job that always runs. Real work is gated per-step on
@ -110,17 +91,10 @@ jobs:
run: |
echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."
echo "::notice::Debug: ${{ needs.detect-changes.outputs.debug }}"
- if: needs.detect-changes.outputs.run == 'true'
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
# Log what files were detected so future failures include the diff.
- name: Log detected changes
if: needs.detect-changes.outputs.run == 'true'
run: |
echo "::notice::detect-changes debug: ${{ needs.detect-changes.outputs.debug }}"
# github-app-auth sibling-checkout removed 2026-05-07 (#157):
# the plugin was dropped + Dockerfile.tenant no longer COPYs it.

.github/workflows/pr-guards.yml vendored Normal file
View File

@ -0,0 +1,63 @@
name: pr-guards
# PR-time guards. Today the only guard is "disable auto-merge when a
# new commit is pushed after auto-merge was enabled" — added 2026-04-27
# after PR #2174 auto-merged with only its first commit because the
# second commit was pushed after the merge queue had locked the PR's
# SHA.
#
# Why this is inlined (not delegated to molecule-ci's reusable
# workflow): the reusable workflow uses `gh pr merge --disable-auto`,
# which calls GitHub's GraphQL API. Gitea has no GraphQL endpoint and
# returns HTTP 405 on /api/graphql, so the job failed on every Gitea
# PR push since the 2026-05-06 migration. Gitea also has no `--auto`
# merge primitive that this job could be acting on, so the right
# behaviour on Gitea is "no-op + green status" — not a 405.
#
# Inlining (vs. an `if:` on the `uses:` line) keeps the job ALWAYS
# running, which matters for branch protection: required-check names
# need a job that emits SUCCESS terminal state, not SKIPPED. See
# `feedback_branch_protection_check_name_parity` and `feedback_pr_merge_safety_guards`.
#
# Issue #88 item 1.
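# For contrast, the rejected alternative would have been a single
# `if:`-gated reusable-workflow job; a minimal sketch (the workflow
# path and expression are illustrative, not actual molecule-ci names):
#
#   jobs:
#     disable-auto-merge-on-push:
#       if: ${{ !contains(github.server_url, 'moleculesai.app') }}
#       uses: molecule-ai/molecule-ci/.github/workflows/disable-auto-merge.yml@main
#
# On Gitea that job lands SKIPPED, so the required-check name never
# reaches SUCCESS, which is exactly the failure mode the inlined
# shape avoids.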
on:
pull_request:
types: [synchronize]
permissions:
pull-requests: write
jobs:
disable-auto-merge-on-push:
runs-on: ubuntu-latest
steps:
# Detect Gitea Actions. act_runner sets GITEA_ACTIONS=true in the
# step env on every job. Belt-and-suspenders: also check the repo
# URL's host, which is independent of any runner-side env config
# (covers a future Gitea host where the env var is forgotten).
- name: Detect runner host
id: host
run: |
if [[ "${GITEA_ACTIONS:-}" == "true" ]] || [[ "${{ github.server_url }}" == *moleculesai.app* ]] || [[ "${{ github.event.repository.html_url }}" == *moleculesai.app* ]]; then
echo "is_gitea=true" >> "$GITHUB_OUTPUT"
echo "::notice::Gitea Actions detected — auto-merge gating is not applicable here (Gitea has no --auto merge primitive). Job will no-op."
else
echo "is_gitea=false" >> "$GITHUB_OUTPUT"
fi
- name: Disable auto-merge (GitHub only)
if: steps.host.outputs.is_gitea != 'true'
env:
GH_TOKEN: ${{ github.token }}
PR: ${{ github.event.pull_request.number }}
REPO: ${{ github.repository }}
NEW_SHA: ${{ github.sha }}
run: |
set -eu
gh pr merge "$PR" --disable-auto -R "$REPO" || true
gh pr comment "$PR" -R "$REPO" --body "🔒 Auto-merge disabled — new commit (\`${NEW_SHA:0:7}\`) pushed after auto-merge was enabled. The merge queue locks SHAs at entry, so subsequent pushes can race. Verify the new commit and re-enable with \`gh pr merge --auto\`."
- name: Gitea no-op
if: steps.host.outputs.is_gitea == 'true'
run: echo "Gitea Actions — auto-merge gating not applicable; no-op (job intentionally green so branch protection's required-check name lands SUCCESS)."

85
.github/workflows/promote-latest.yml vendored Normal file
View File

@ -0,0 +1,85 @@
name: promote-latest
# Manually retag ghcr.io/molecule-ai/platform:staging-<sha> → :latest
# (and the same for the tenant image). Use this to:
#
# 1. Promote a :staging-<sha> to prod before the canary fleet is live
# (one-off during the initial rollout).
# 2. Roll back :latest to a prior known-good digest after a bad
# promotion slipped past canary (use scripts/rollback-latest.sh
# for a local / emergency path; this workflow is for scheduled
# or from-browser promotions).
#
# Running this workflow needs no extra secrets — GitHub's default
# GITHUB_TOKEN has write:packages for repo-owned GHCR images, which
# is all we need for a remote retag via `crane tag`.
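# For a local / emergency retag, the same flow is two crane commands
# (a minimal sketch; the image name and sha are illustrative, and
# scripts/rollback-latest.sh remains the supported path):
#
#   echo "$GHCR_TOKEN" | crane auth login ghcr.io -u "$USER" --password-stdin
#   crane tag ghcr.io/molecule-ai/platform:staging-4c1d56e latest
#   crane digest ghcr.io/molecule-ai/platform:latest   # verify the retag landed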
on:
workflow_dispatch:
inputs:
sha:
description: 'Short sha to promote (e.g. 4c1d56e). Must match an existing :staging-<sha> tag.'
required: true
type: string
permissions:
contents: read
packages: write
env:
IMAGE_NAME: ghcr.io/molecule-ai/platform
TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant
jobs:
promote:
runs-on: ubuntu-latest
steps:
- uses: imjasonh/setup-crane@6da1ae018866400525525ce74ff892880c099987 # v0.5
- name: GHCR login
run: |
echo "${{ secrets.GITHUB_TOKEN }}" \
| crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin
- name: Retag platform image
run: |
set -eu
SRC="${IMAGE_NAME}:staging-${{ inputs.sha }}"
if ! crane digest "$SRC" >/dev/null 2>&1; then
echo "::error::$SRC not found in registry — double-check the sha."
exit 1
fi
EXPECTED=$(crane digest "$SRC")
crane tag "$SRC" latest
ACTUAL=$(crane digest "${IMAGE_NAME}:latest")
if [ "$ACTUAL" != "$EXPECTED" ]; then
echo "::error::retag digest mismatch (expected $EXPECTED, got $ACTUAL)"
exit 1
fi
echo "OK ${IMAGE_NAME}:latest → $ACTUAL"
- name: Retag tenant image
run: |
set -eu
SRC="${TENANT_IMAGE_NAME}:staging-${{ inputs.sha }}"
if ! crane digest "$SRC" >/dev/null 2>&1; then
echo "::error::$SRC not found — tenant image may not have built for this sha."
exit 1
fi
EXPECTED=$(crane digest "$SRC")
crane tag "$SRC" latest
ACTUAL=$(crane digest "${TENANT_IMAGE_NAME}:latest")
if [ "$ACTUAL" != "$EXPECTED" ]; then
echo "::error::tenant retag digest mismatch"
exit 1
fi
echo "OK ${TENANT_IMAGE_NAME}:latest → $ACTUAL"
- name: Summary
run: |
{
echo "## :latest promoted to staging-${{ inputs.sha }}"
echo
echo "Both platform + tenant images retagged. Prod tenants"
echo "will auto-pull within their 5-min update cycle."
} >> "$GITHUB_STEP_SUMMARY"

View File

@ -54,22 +54,6 @@ jobs:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
# Health check: verify Docker daemon is accessible before attempting any
# build steps. This fails loudly at step 1 when the runner's docker.sock
# is inaccessible rather than silently continuing to the build step
# where docker build fails deep in ECR auth with a cryptic error.
- name: Verify Docker daemon access
run: |
set -euo pipefail
echo "::group::Docker daemon health check"
docker info 2>&1 | head -5 || {
echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
echo "::error::Check: (1) daemon running, (2) runner user in docker group, (3) sock perms 660+"
exit 1
}
echo "Docker daemon OK"
echo "::endgroup::"
- name: Compute tags
id: tags
shell: bash

436
.github/workflows/publish-runtime.yml vendored Normal file
View File

@ -0,0 +1,436 @@
name: publish-runtime
# Publishes molecule-ai-workspace-runtime to PyPI from monorepo workspace/.
# Monorepo workspace/ is the only source-of-truth for runtime code; this
# workflow is the bridge from monorepo edits to the PyPI artifact that
# the 8 workspace-template-* repos depend on.
#
# Triggered by:
# - Pushing a tag matching `runtime-vX.Y.Z` (the version is derived from
# the tag — `runtime-v0.1.6` publishes `0.1.6`).
# - Manual workflow_dispatch with an explicit `version` input (useful for
# dev/test releases without tagging the repo).
# - Auto: any push to `staging` that touches `workspace/**`. The version
# is derived by querying PyPI for the current latest and bumping the
# patch component. This closes the human-in-loop gap that caused the
# 2026-04-27 RuntimeCapabilities ImportError outage — adapter symbol
# additions in workspace/adapters/base.py used to require an operator
# to remember to publish; now the merge itself triggers the publish.
#
# The workflow:
# 1. Runs scripts/build_runtime_package.py to copy workspace/ →
# build/molecule_runtime/ with imports rewritten (`a2a_client` →
# `molecule_runtime.a2a_client`).
# 2. Builds wheel + sdist with `python -m build`.
# 3. Publishes to PyPI via the PyPA Trusted Publisher action (OIDC).
# No static API token is stored — PyPI verifies the workflow's
# OIDC claim against the trusted-publisher config registered for
# molecule-ai-workspace-runtime (molecule-ai/molecule-core,
# publish-runtime.yml, environment pypi-publish).
#
# After publish: the 8 template repos pick up the new version on their
# next image rebuild (their requirements.txt pin
# `molecule-ai-workspace-runtime>=0.1.0`, so any new release is eligible).
# To force-pull immediately, bump the pin in each template repo's
# requirements.txt and merge — that triggers their own publish-image.yml.
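# The import rewrite in step 1 is, at its core, a prefix rewrite over
# the copied tree. Conceptually (illustrative only; the real logic in
# scripts/build_runtime_package.py also handles relative imports and
# the package manifest):
#
#   find build/molecule_runtime -name '*.py' -exec \
#     sed -i -E 's/^(from|import) a2a_client/\1 molecule_runtime.a2a_client/' {} +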
on:
push:
tags:
- "runtime-v*"
branches:
- staging
paths:
# Auto-publish when staging gets changes that affect what gets
# published. Path filter ONLY applies to branch pushes — tag pushes
# still fire regardless.
#
# workspace/** is the source-of-truth for runtime code.
# scripts/build_runtime_package.py is the build script — changes to
# it (e.g. a fix to the import rewriter or a manifest emit) directly
# affect what ships in the wheel even if no workspace/ file changes.
# The 2026-04-27 lib/ subpackage incident missed an auto-publish for
# exactly this reason — PR #2174 only changed scripts/ and the
# operator had to remember a manual dispatch.
- "workspace/**"
- "scripts/build_runtime_package.py"
workflow_dispatch:
inputs:
version:
description: "Version to publish (e.g. 0.1.6). Required for manual dispatch."
required: true
type: string
permissions:
contents: read
# Serialize publishes so two staging merges landing seconds apart don't
# both compute "latest+1" and race on PyPI upload. The second one waits.
concurrency:
group: publish-runtime
cancel-in-progress: false
jobs:
publish:
runs-on: ubuntu-latest
environment: pypi-publish
permissions:
contents: read
id-token: write # PyPI Trusted Publisher (OIDC) — no PYPI_TOKEN needed
outputs:
version: ${{ steps.version.outputs.version }}
wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: "3.11"
cache: pip
- name: Derive version (tag, manual input, or PyPI auto-bump)
id: version
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
VERSION="${{ inputs.version }}"
elif echo "$GITHUB_REF_NAME" | grep -q "^runtime-v"; then
# Tag is `runtime-vX.Y.Z` — strip the prefix.
VERSION="${GITHUB_REF_NAME#runtime-v}"
else
# Auto-publish from staging push. Query PyPI for the current
# latest and bump the patch component. concurrency: group above
# serializes parallel staging merges so we don't race on the
# bump. If PyPI is unreachable, fail loud — better to skip a
# publish than to overwrite an existing version.
LATEST=$(curl -fsS --retry 3 https://pypi.org/pypi/molecule-ai-workspace-runtime/json \
| python -c "import sys,json; print(json.load(sys.stdin)['info']['version'])")
MAJOR=$(echo "$LATEST" | cut -d. -f1)
MINOR=$(echo "$LATEST" | cut -d. -f2)
PATCH=$(echo "$LATEST" | cut -d. -f3)
VERSION="${MAJOR}.${MINOR}.$((PATCH+1))"
echo "Auto-bumped from PyPI latest $LATEST -> $VERSION"
fi
if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+(\.dev[0-9]+|rc[0-9]+|a[0-9]+|b[0-9]+|\.post[0-9]+)?$'; then
echo "::error::version $VERSION does not match PEP 440"
exit 1
fi
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
echo "Publishing molecule-ai-workspace-runtime $VERSION"
- name: Install build tooling
run: pip install build twine
- name: Build package from workspace/
run: |
python scripts/build_runtime_package.py \
--version "${{ steps.version.outputs.version }}" \
--out "${{ runner.temp }}/runtime-build"
- name: Build wheel + sdist
working-directory: ${{ runner.temp }}/runtime-build
run: python -m build
- name: Capture wheel SHA256 for cascade content-verification
# Recorded BEFORE upload so the cascade probe can verify the
# bytes Fastly serves under the new version's URL match what
# we built. Closes a hole left by #2197: that probe verified
# pip can resolve the version (catches propagation lag) but
# not that the wheel content matches (would silently pass a
# Fastly stale-content scenario where the new version's URL
# serves an old wheel binary).
id: wheel_hash
working-directory: ${{ runner.temp }}/runtime-build
run: |
set -eu
WHEEL=$(ls dist/*.whl 2>/dev/null | head -1)
if [ -z "$WHEEL" ]; then
echo "::error::No .whl in dist/ — `python -m build` must have failed silently"
exit 1
fi
HASH=$(sha256sum "$WHEEL" | awk '{print $1}')
echo "wheel_sha256=${HASH}" >> "$GITHUB_OUTPUT"
echo "Local wheel SHA256 (pre-upload): ${HASH}"
echo "Wheel filename: $(basename "$WHEEL")"
- name: Verify package contents (sanity)
working-directory: ${{ runner.temp }}/runtime-build
# Smoke logic lives in scripts/wheel_smoke.py so the same gate runs
# at both PR-time (runtime-prbuild-compat.yml) and publish-time
# (here). Splitting the smoke across two heredocs let them drift
# apart historically — one script keeps them locked.
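# (For orientation, the smoke is essentially an import probe against
# the freshly installed wheel; a hypothetical sketch, not the
# script's actual contents:
#   /tmp/smoke/bin/python -c "import molecule_runtime"
# wheel_smoke.py is the source of truth.)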
run: |
python -m twine check dist/*
python -m venv /tmp/smoke
/tmp/smoke/bin/pip install --quiet dist/*.whl
/tmp/smoke/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
- name: Publish to PyPI (Trusted Publisher / OIDC)
# PyPI side is configured: project molecule-ai-workspace-runtime →
# publisher molecule-ai/molecule-core, workflow publish-runtime.yml,
# environment pypi-publish. The action mints a short-lived OIDC
# token and exchanges it for a PyPI upload credential — no static
# API token in this repo's secrets.
uses: pypa/gh-action-pypi-publish@release/v1
with:
packages-dir: ${{ runner.temp }}/runtime-build/dist/
cascade:
# After PyPI accepts the upload, fan out a repository_dispatch to each
# template repo so they rebuild their image against the new runtime.
# Each template's `runtime-published.yml` receiver picks up the event,
# pulls the new PyPI version (their requirements.txt pin is `>=`), and
# republishes ghcr.io/molecule-ai/workspace-template-<runtime>:latest.
#
# Soft-fail per repo: if one template's dispatch fails (perms missing,
# repo archived, etc.) we still try the others and surface the failures
# in the workflow summary instead of aborting the whole cascade.
needs: publish
runs-on: ubuntu-latest
steps:
- name: Wait for PyPI to propagate the new version
# PyPI accepts the upload, then takes a few seconds to make the
# new version visible across all THREE surfaces pip touches:
# 1. /pypi/<pkg>/<ver>/json — metadata endpoint
# 2. /simple/<pkg>/ — pip's primary download index
# 3. files.pythonhosted.org — CDN-fronted wheel binary
# Each has its own cache. The previous check polled only (1)
# and would let the cascade fire while (2) or (3) still served
# the previous version, so downstream `pip install` resolved
# to the old wheel. Docker layer cache then locked that stale
# resolution in for subsequent rebuilds (the cache trap that
# bit us five times in one night).
#
# Two-stage probe per poll:
# (a) `pip install --no-cache-dir PACKAGE==VERSION` — succeeds
# only when the version is resolvable. Catches surface (1)
# and (2) propagation lag.
# (b) `pip download` of the same wheel + SHA256 compare against
# the just-built dist's hash. Catches surface (3) lag AND
# Fastly serving stale content under the new version's URL
# (a separate Fastly-corruption mode that pip-install alone
# can't see, since pip install resolves+unpacks against
# whatever bytes Fastly returns and never inspects them).
# Both must pass before the cascade fans out.
#
# The venv is reused across polls; only `pip install`/`pip
# download` run in the loop, with --force-reinstall +
# --no-cache-dir so the previous poll's cached state doesn't
# mask propagation lag.
env:
RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
EXPECTED_SHA256: ${{ needs.publish.outputs.wheel_sha256 }}
run: |
set -eu
if [ -z "$EXPECTED_SHA256" ]; then
echo "::error::publish job did not expose wheel_sha256 — cannot verify wheel content. Refusing to fan out cascade."
exit 1
fi
python -m venv /tmp/propagation-probe
PROBE=/tmp/propagation-probe/bin
$PROBE/pip install --upgrade --quiet pip
# Poll budget: 30 attempts × (~3-5s pip install + ~3s pip
# download + 4s sleep) ≈ 5-6 min wall on a slow GH runner.
# Generous vs PyPI's typical few-seconds propagation;
# failures past this are signal of a real PyPI / Fastly
# issue, not just lag.
for i in $(seq 1 30); do
# Stage (a): can pip resolve and install the version?
if $PROBE/pip install \
--quiet \
--no-cache-dir \
--force-reinstall \
--no-deps \
"molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
>/dev/null 2>&1; then
INSTALLED=$($PROBE/pip show molecule-ai-workspace-runtime 2>/dev/null \
| awk -F': ' '/^Version:/{print $2}')
if [ "$INSTALLED" = "$RUNTIME_VERSION" ]; then
# Stage (b): does Fastly serve the bytes we uploaded?
# `pip download` writes the actual .whl file to disk so
# we can sha256sum it (vs `pip install` which unpacks
# and discards).
rm -rf /tmp/probe-dl
mkdir -p /tmp/probe-dl
if $PROBE/pip download \
--quiet \
--no-cache-dir \
--no-deps \
--dest /tmp/probe-dl \
"molecule-ai-workspace-runtime==${RUNTIME_VERSION}" \
>/dev/null 2>&1; then
WHEEL=$(ls /tmp/probe-dl/*.whl 2>/dev/null | head -1)
if [ -n "$WHEEL" ]; then
ACTUAL=$(sha256sum "$WHEEL" | awk '{print $1}')
if [ "$ACTUAL" = "$EXPECTED_SHA256" ]; then
echo "::notice::✓ pip resolves AND wheel content matches after ${i} poll(s) (sha256=${EXPECTED_SHA256})"
exit 0
fi
# Hash mismatch: PyPI accepted our upload but Fastly
# is serving different bytes under the version's URL.
# Most often this is propagation lag of the BINARY
# surface — the version is resolvable but the wheel
# cache hasn't caught up. Retry.
echo "::warning::poll ${i}: wheel content mismatch (got ${ACTUAL:0:12}…, want ${EXPECTED_SHA256:0:12}…) — Fastly likely still serving stale binary, retrying"
fi
fi
fi
fi
sleep 4
done
echo "::error::pip never resolved molecule-ai-workspace-runtime==${RUNTIME_VERSION} with matching wheel content within ~5 min."
echo "::error::Expected wheel SHA256: ${EXPECTED_SHA256}"
echo "::error::Refusing to fan out cascade against stale or corrupt PyPI surfaces."
exit 1
- name: Fan out via push to .runtime-version
env:
# Gitea PAT with write:repository scope on the 8 cascade-active
# template repos. Used here for `git push` (NOT for an API
# dispatch — Gitea 1.22.6 has no repository_dispatch endpoint;
# empirically verified across 6 candidate paths in molecule-
# core#20 issuecomment-913). The push trips each template's
# existing `on: push: branches: [main]` trigger on
# publish-image.yml, which then reads the updated
# .runtime-version via its resolve-version job.
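# Receiving side, for orientation: each template's publish-image.yml
# reacts to the push and reads the pin. A minimal sketch assembled
# from the description above, not copied from a template repo:
#
#   on:
#     push:
#       branches: [main]
#   jobs:
#     resolve-version:
#       runs-on: ubuntu-latest
#       outputs:
#         version: ${{ steps.pin.outputs.version }}
#       steps:
#         - uses: actions/checkout@v6
#         - id: pin
#           run: echo "version=$(cat .runtime-version)" >> "$GITHUB_OUTPUT"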
DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
run: |
set +e # don't abort on a single repo failure — collect them all
# Soft-skip on workflow_dispatch when the token is missing
# (operator ad-hoc test); hard-fail on push so unattended
# publishes can't silently skip the cascade. Same shape as
# the original v1, intentional split per the schedule-vs-
# dispatch hardening 2026-04-28.
if [ -z "$DISPATCH_TOKEN" ]; then
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "::warning::DISPATCH_TOKEN secret not set — skipping cascade."
echo "::warning::set it at Settings → Secrets and Variables → Actions, then rerun. Templates will stay on the prior runtime version until either this token is set or each template is rebuilt manually."
exit 0
fi
echo "::error::DISPATCH_TOKEN secret missing — cascade cannot fan out."
echo "::error::PyPI was published, but the 8 template repos will NOT pick up the new version until this token is restored and a republish dispatches the cascade."
echo "::error::set it at Settings → Secrets and Variables → Actions; then re-trigger publish-runtime via workflow_dispatch."
exit 1
fi
VERSION="$RUNTIME_VERSION"
if [ -z "$VERSION" ]; then
echo "::error::publish job did not expose a version output — cascade cannot fan out"
exit 1
fi
# All 9 workspace templates declared in manifest.json. The list
# MUST stay aligned with manifest.json's workspace_templates —
# cascade-list-drift-gate.yml enforces this in CI per the
# codex-stuck-on-stale-runtime invariant from PR #2556.
# Long-term goal: derive this list from manifest.json so it
# can't drift even on a manifest edit (RFC #388 Phase-1).
#
# Per-template publish-image.yml presence is checked at
# cascade-time below: codex doesn't ship one today, so the
# cascade soft-skips it with an informational message rather
# than dropping it from this list (which would re-introduce
# the drift the gate exists to catch).
GITEA_URL="${GITEA_URL:-https://git.moleculesai.app}"
TEMPLATES="claude-code hermes openclaw codex langgraph crewai autogen deepagents gemini-cli"
FAILED=""
SKIPPED=""
# Configure git identity once. The persona owning DISPATCH_TOKEN
# is the same identity that authored this commit on each
# template; using a generic "publish-runtime cascade" co-author
# trailer in the message keeps the audit trail honest about the
# workflow-driven origin.
git config --global user.name "publish-runtime cascade"
git config --global user.email "publish-runtime@moleculesai.app"
WORKDIR="$(mktemp -d)"
for tpl in $TEMPLATES; do
REPO="molecule-ai/molecule-ai-workspace-template-$tpl"
CLONE="$WORKDIR/$tpl"
# Pre-check: skip templates without a publish-image.yml.
# The cascade's job is to trip the template's on-push
# rebuild — if there's no rebuild workflow, pushing a
# .runtime-version commit is just noise on the target
# repo. Use the Gitea contents API (no clone required for
# the probe). 200 = present; 404 = absent.
HTTP=$(curl -sS -o /dev/null -w "%{http_code}" \
-H "Authorization: token $DISPATCH_TOKEN" \
"$GITEA_URL/api/v1/repos/$REPO/contents/.github/workflows/publish-image.yml")
if [ "$HTTP" = "404" ]; then
echo "↷ $tpl has no publish-image.yml — soft-skip (informational; manifest still tracks it)"
SKIPPED="$SKIPPED $tpl"
continue
fi
if [ "$HTTP" != "200" ]; then
echo "::warning::$tpl publish-image.yml probe returned HTTP $HTTP — proceeding anyway, push will surface the real failure if any"
fi
# Use a per-template attempt loop so a transient race (e.g.
# human pushing to the same template at the same instant)
# doesn't lose the cascade. Bounded retries (3) — beyond
# that we surface the failure and let the operator retry.
attempt=0
success=false
while [ $attempt -lt 3 ]; do
attempt=$((attempt + 1))
rm -rf "$CLONE"
if ! git clone --depth=1 \
"https://x-access-token:${DISPATCH_TOKEN}@${GITEA_URL#https://}/$REPO.git" \
"$CLONE" >/tmp/clone.log 2>&1; then
echo "::warning::clone $tpl attempt $attempt failed: $(tail -n3 /tmp/clone.log)"
sleep 2
continue
fi
cd "$CLONE"
echo "$VERSION" > .runtime-version
# Idempotency guard: if the file already matches, this
# publish is a re-run for a version already cascaded.
# Don't push a no-op commit (would spuriously re-trip the
# template's on-push and rebuild for nothing).
if git diff --quiet -- .runtime-version; then
echo "✓ $tpl already at $VERSION — no commit needed (idempotent)"
success=true
cd - >/dev/null
break
fi
git add .runtime-version
git commit -m "chore: pin runtime to $VERSION (publish-runtime cascade)" \
-m "Co-Authored-By: publish-runtime cascade <publish-runtime@moleculesai.app>" \
>/dev/null
if git push origin HEAD:main >/tmp/push.log 2>&1; then
echo "✓ $tpl pushed $VERSION on attempt $attempt"
success=true
cd - >/dev/null
break
fi
# Likely a non-fast-forward — pull-rebase and retry.
# Don't force-push: that would silently overwrite a racing
# human/cascade commit.
echo "::warning::push $tpl attempt $attempt failed, pull-rebasing: $(tail -n3 /tmp/push.log)"
git pull --rebase origin main >/tmp/rebase.log 2>&1 || true
cd - >/dev/null
done
if [ "$success" != "true" ]; then
FAILED="$FAILED $tpl"
fi
done
rm -rf "$WORKDIR"
if [ -n "$FAILED" ]; then
echo "::error::Cascade incomplete after 3 retries each. Failed templates:$FAILED"
echo "::error::PyPI publish succeeded; failed templates lag the new version. Re-run this workflow_dispatch with the same version to retry only the laggers (idempotent — already-cascaded templates skip)."
exit 1
fi
if [ -n "$SKIPPED" ]; then
echo "Cascade complete: pinned $VERSION on cascade-active templates. Soft-skipped (no publish-image.yml):$SKIPPED"
else
echo "Cascade complete: $VERSION pinned across all manifest workspace_templates."
fi

View File

@ -0,0 +1,262 @@
name: publish-workspace-server-image
# Builds and pushes Docker images to ECR on staging or main pushes.
# EC2 tenant instances pull the tenant image from ECR.
#
# Branch / tag policy (see Compute tags step for the per-branch logic):
#
# staging push → builds image, tags :staging-<sha> + :staging-latest.
# staging-CP pins TENANT_IMAGE=:staging-latest, so it
# picks up staging-branch code automatically. This is
# what makes staging-CP actually test staging-branch
# code instead of "yesterday's main" — pre-fix, this
# workflow only ran on main, so staging tenants
# silently served stale code (#2308 fix RFC #2312
# landed on staging but never reached tenants because
# staging→main was wedged on path-filter parity bugs).
#
# main push → builds image, tags :staging-<sha> + :staging-latest
# (same as before). canary-verify.yml retags
# :staging-<sha> → :latest after canary tenants
# green-light the digest. The :staging-latest retag
# on main push is intentional: when main lands AFTER a
# staging push, staging-CP gets the post-promote code
# (which equals what it had + any merge resolution),
# so the canary-on-staging-CP step still runs against
# the prod-bound digest.
#
# In the steady state both branches refresh :staging-latest; the
# semantic is "most recent staging-or-main build of tenant code."
# Drift between the two is bounded by the staging→main auto-promote
# cadence and is corrected on the next staging push.
on:
push:
branches: [staging, main]
paths:
- 'workspace-server/**'
- 'canvas/**'
- 'manifest.json'
- 'scripts/**'
- '.github/workflows/publish-workspace-server-image.yml'
workflow_dispatch:
# Serialize per-branch so two rapid staging pushes don't race the same
# :staging-latest tag retag. Allow staging and main to run in parallel
# (different github.ref → different concurrency group) since they
# produce different :staging-<sha> tags and last-write-wins on
# :staging-latest is acceptable across branches (the post-promote
# main code equals current staging code in a healthy flow).
#
# cancel-in-progress: false → in-flight builds finish; the next push's
# build queues. This avoids a partially-pushed image and keeps the
# canary fleet pin (:staging-<sha>) consistent with what was actually
# tested at canary-verify time.
concurrency:
group: publish-workspace-server-image-${{ github.ref }}
cancel-in-progress: false
permissions:
contents: read
packages: write
env:
IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
jobs:
build-and-push:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
# github-app-auth sibling-checkout removed 2026-05-07 (#157):
# plugin was dropped + workspace-server/Dockerfile no longer
# COPYs it.
# ECR auth + buildx setup are now inline in each build step
# below (Task #173, 2026-05-07).
#
# Why moved inline: aws-actions/configure-aws-credentials@v4 +
# aws-actions/amazon-ecr-login@v2 + docker/setup-buildx-action
# all left auth state in places that the actual `docker push`
# couldn't see on Gitea Actions:
# - The actions wrote to a step-scoped DOCKER_CONFIG path
# that didn't survive into subsequent shell steps.
# - Buildx couldn't bridge the runner container ↔
# operator-host docker daemon auth gap (401 on the
# docker-container driver, "no basic auth credentials"
# with the action-driven login).
#
# Doing AWS+ECR auth inline (`aws ecr get-login-password |
# docker login`) in the same shell step as `docker build` +
# `docker push` is the operator-host manual approach, mapped
# 1:1 into CI. Auth state is guaranteed to live in the env that
# `docker push` actually runs from.
#
# Post-suspension target is the operator's ECR org
# (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/*),
# which already hosts platform-tenant + workspace-template-* +
# runner-base images. AWS creds come from the
# AWS_ACCESS_KEY_ID/SECRET secrets bound to the molecule-cp
# IAM user. Closes #161.
- name: Compute tags
id: tags
run: |
echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
# Pre-clone manifest deps before docker build (Task #173 fix).
#
# Why pre-clone: post-2026-05-06, every workspace-template-* repo on
# Gitea (codex, crewai, deepagents, gemini-cli, langgraph) plus all
# 7 org-template-* repos are private. The pre-fix Dockerfile.tenant
# ran `git clone` inside an in-image stage, which had no auth path
# — every CI build failed with "fatal: could not read Username for
# https://git.moleculesai.app". For weeks, every workspace-server
# rebuild required a manual operator-host push. Now we clone in the
# trusted CI context (where AUTO_SYNC_TOKEN is naturally available)
# and Dockerfile.tenant just COPYs from .tenant-bundle-deps/.
#
# Token shape: AUTO_SYNC_TOKEN is the devops-engineer persona PAT
# (see /etc/molecule-bootstrap/agent-secrets.env). Per saved memory
# `feedback_per_agent_gitea_identity_default`, every CI surface uses
# a per-persona token, never the founder PAT. clone-manifest.sh
# embeds it as basic-auth (oauth2:<token>) for the duration of the
# clones, then strips .git directories — the token never enters
# the resulting image.
#
# Idempotent: if a re-run finds populated dirs, clone-manifest.sh
# skips them; safe to retrigger via path-filter or workflow_dispatch.
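# The auth shape inside clone-manifest.sh is the standard Gitea
# basic-auth embed; a minimal sketch of one clone (the real script
# iterates manifest.json and handles the skip-if-populated case):
#
#   git clone --depth=1 \
#     "https://oauth2:${MOLECULE_GITEA_TOKEN}@git.moleculesai.app/<owner>/<repo>.git" "$dest"
#   rm -rf "$dest/.git"   # strip the embedded token before docker build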
- name: Pre-clone manifest deps
env:
MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
run: |
set -euo pipefail
if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
exit 1
fi
mkdir -p .tenant-bundle-deps
bash scripts/clone-manifest.sh \
manifest.json \
.tenant-bundle-deps/workspace-configs-templates \
.tenant-bundle-deps/org-templates \
.tenant-bundle-deps/plugins
# Sanity-check counts so a silent partial clone fails fast
# instead of producing a half-empty image.
ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
# Counts are derived from manifest.json (9 ws / 7 org / 21
# plugins as of 2026-05-07). If manifest.json grows but the
# clone step regresses silently, the find above caps at the
# actual disk state — but clone-manifest.sh's own EXPECTED vs
# CLONED check (line ~95) is the authoritative fail-fast.
# Canary-gated release flow:
# - This step always publishes :staging-<sha> + :staging-latest.
# - On staging push, staging-CP picks up :staging-latest immediately
# (its TENANT_IMAGE pin is :staging-latest) — so staging-branch
# code reaches staging tenants without waiting for main.
# - On main push, canary-verify.yml runs smoke tests against
# canary tenants (which pin :staging-<sha>), and on green retags
# :staging-<sha> → :latest. Prod tenants pull :latest.
# - On red, :latest stays on the prior good digest — prod is safe.
#
# Why :staging-latest is retagged on main push too: when main lands
# after a staging promote, staging-CP gets the post-promote code so
# the canary-on-staging-CP step still runs against the prod-bound
# digest. In a healthy flow the post-promote main code == the
# current staging code, so this is effectively a no-op except for
# the canary fleet pin handoff.
#
# Pre-fix history: this workflow used to only trigger on main. That
# meant staging-CP served "yesterday's main" indefinitely whenever
# staging→main was wedged. The 2026-04-30 dogfooding session
# surfaced this when RFC #2312 (chat upload HTTP-forward) landed on
# staging but staging tenants kept failing chat upload because they
# were running pre-RFC code. Adding the staging trigger above closes
# that gap. Earlier 2026-04-24 incident: a static :staging-<sha> pin
# drifted 10 days behind staging — same class of bug, different
# mechanism. ECR repo molecule-ai/platform created 2026-05-07.
# Build + push platform image with plain `docker` (no buildx).
# GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
# returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
# The OCI revision label below carries the same value for registry
# tooling; the duplication is intentional.
- name: Build & push platform image to ECR (staging-<sha> + staging-latest)
env:
IMAGE_NAME: ${{ env.IMAGE_NAME }}
TAG_SHA: staging-${{ steps.tags.outputs.sha }}
TAG_LATEST: staging-latest
GIT_SHA: ${{ github.sha }}
REPO: ${{ github.repository }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-2
run: |
set -euo pipefail
# ECR auth in-step so config.json is populated in the same
# shell env that runs `docker push`. ECR get-login-password
# tokens last 12h, plenty for a single-step build+push.
ECR_REGISTRY="${IMAGE_NAME%%/*}"
aws ecr get-login-password --region us-east-2 | \
docker login --username AWS --password-stdin "${ECR_REGISTRY}"
docker build \
--file ./workspace-server/Dockerfile \
--build-arg GIT_SHA="${GIT_SHA}" \
--label "org.opencontainers.image.source=https://github.com/${REPO}" \
--label "org.opencontainers.image.revision=${GIT_SHA}" \
--label "org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify" \
--tag "${IMAGE_NAME}:${TAG_SHA}" \
--tag "${IMAGE_NAME}:${TAG_LATEST}" \
.
docker push "${IMAGE_NAME}:${TAG_SHA}"
docker push "${IMAGE_NAME}:${TAG_LATEST}"
# Canvas uses same-origin fetches. The tenant Go platform
# reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL
# env; the tenant's /canvas/viewport, /approvals/pending,
# /org/templates etc. live on the tenant platform itself.
# Both legs share one origin (the tenant subdomain) so
# PLATFORM_URL="" forces canvas to fetch paths as relative,
# which land same-origin.
#
# Self-hosted / private-label deployments override this at
# build time with a specific backend (e.g. local dev:
# NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
- name: Build & push tenant image to ECR (staging-<sha> + staging-latest)
env:
TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
TAG_SHA: staging-${{ steps.tags.outputs.sha }}
TAG_LATEST: staging-latest
GIT_SHA: ${{ github.sha }}
REPO: ${{ github.repository }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-2
run: |
set -euo pipefail
# Re-login: the platform-image step's docker login wrote to
# the same config.json, so this is technically redundant — but
# making each push step self-contained keeps the workflow
# robust to step reordering / future extraction.
ECR_REGISTRY="${TENANT_IMAGE_NAME%%/*}"
aws ecr get-login-password --region us-east-2 | \
docker login --username AWS --password-stdin "${ECR_REGISTRY}"
docker build \
--file ./workspace-server/Dockerfile.tenant \
--build-arg NEXT_PUBLIC_PLATFORM_URL= \
--build-arg GIT_SHA="${GIT_SHA}" \
--label "org.opencontainers.image.source=https://github.com/${REPO}" \
--label "org.opencontainers.image.revision=${GIT_SHA}" \
--label "org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify" \
--tag "${TENANT_IMAGE_NAME}:${TAG_SHA}" \
--tag "${TENANT_IMAGE_NAME}:${TAG_LATEST}" \
.
docker push "${TENANT_IMAGE_NAME}:${TAG_SHA}"
docker push "${TENANT_IMAGE_NAME}:${TAG_LATEST}"

View File

@ -3,9 +3,9 @@ name: redeploy-tenants-on-main
# Auto-refresh prod tenant EC2s after every main merge.
#
# Why this workflow exists: publish-workspace-server-image builds and
# pushes a new platform-tenant :<sha> to ECR on every merge to main,
# but running tenants pulled their image once at boot and never re-pull.
# Users see stale code indefinitely.
# pushes a new platform-tenant:latest + :<sha> to GHCR on every merge
# to main, but running tenants pulled their image once at boot and
# never re-pull. Users see stale code indefinitely.
#
# This workflow closes the gap by calling the control-plane admin
# endpoint that performs a canary-first, batched, health-gated rolling
@ -13,18 +13,12 @@ name: redeploy-tenants-on-main
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
# (feat/tenant-auto-redeploy, landing alongside this workflow).
#
# Registry: ECR (153263036946.dkr.ecr.us-east-2.amazonaws.com/
# molecule-ai/platform-tenant). GHCR was retired 2026-05-07 during the
# Gitea suspension migration. The canary-verify.yml promote step now
# uses the same redeploy-fleet endpoint (fixes the silent-GHCR gap).
#
# Runtime ordering:
# 1. publish-workspace-server-image completes → new :staging-<sha> in ECR.
# 2. This workflow fires via workflow_run, calls redeploy-fleet with
# target_tag=staging-<sha>. No CDN propagation wait needed —
# ECR image manifest is consistent immediately after push.
# 3. Calls redeploy-fleet with canary_slug (if set) and a soak
# period. Canary proves the image boots; batches follow.
# 1. publish-workspace-server-image completes → new :latest in GHCR.
# 2. This workflow fires via workflow_run, waits 30s for GHCR's
# CDN to propagate the new tag to the region the tenants pull from.
# 3. Calls redeploy-fleet with canary_slug=hongming and a 60s
# soak. Canary proves the image boots; batches follow.
# 4. Any failure aborts the rollout and leaves older tenants on the
# prior image — safer default than half-and-half state.
#
@ -114,11 +108,13 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 25
steps:
- name: Note on ECR propagation
# ECR image manifests are consistent immediately after push — no
# CDN cache to wait for. The old GHCR-based workflow had a 30s
# sleep to avoid race conditions; ECR makes that unnecessary.
run: echo "ECR image available immediately after push — proceeding."
- name: Wait for GHCR tag propagation
# GHCR's edge cache takes ~15-30s to consistently serve the new
# manifest after the registry accepts the push. Without this
# sleep, the first tenant's docker pull sometimes races and
# fetches the previous digest; sleeping is the cheapest way to
# reduce that without polling GHCR for the new digest.
run: sleep 30
- name: Compute target tag
id: tag

View File

@ -48,7 +48,7 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: actions/checkout@v6
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:

214
.github/workflows/secret-scan.yml vendored Normal file
View File

@ -0,0 +1,214 @@
name: Secret scan
# Hard CI gate. Refuses any PR / push whose diff additions contain a
# recognisable credential. Defense-in-depth for the #2090-class incident
# (2026-04-24): GitHub's hosted Copilot Coding Agent leaked a ghs_*
# installation token into tenant-proxy/package.json via `npm init`
# slurping the URL from a token-embedded origin remote. We can't fix
# upstream's clone hygiene, so we gate here.
#
# Also the canonical reusable workflow for the rest of the org. Other
# Molecule-AI repos enroll with a single 3-line workflow:
#
# jobs:
# secret-scan:
# uses: molecule-ai/molecule-core/.github/workflows/secret-scan.yml@staging
#
# Pin to @staging not @main — staging is the active default branch,
# main lags via the staging-promotion workflow. Updates ride along
# automatically on the next consumer workflow run.
#
# Same regex set as the runtime's bundled pre-commit hook
# (molecule-ai-workspace-runtime: molecule_runtime/scripts/pre-commit-checks.sh).
# Keep the two sides aligned when adding patterns.
on:
pull_request:
types: [opened, synchronize, reopened]
push:
branches: [main, staging]
# Required for GitHub merge queue: the queue's pre-merge CI run on
# `gh-readonly-queue/...` refs needs this check to fire so the queue
# gets a real result instead of stalling forever AWAITING_CHECKS.
merge_group:
types: [checks_requested]
# Reusable workflow entry point for other Molecule-AI repos.
workflow_call:
jobs:
scan:
name: Scan diff for credential-shaped strings
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 2 # need previous commit to diff against on push events
# For pull_request events the diff base may be many commits behind
# HEAD and absent from the shallow clone. Fetch it explicitly.
- name: Fetch PR base SHA (pull_request events only)
if: github.event_name == 'pull_request'
run: git fetch --depth=1 origin ${{ github.event.pull_request.base.sha }}
# For merge_group events the queue's pre-merge ref is a commit on
# `gh-readonly-queue/...` whose parent is the queue's base_sha.
# That parent isn't part of the queue branch's shallow clone, so
# we fetch it explicitly. Without this the diff falls through to
# "no BASE → scan entire tree" mode and false-positives on legit
# test fixtures (e.g. canvas/src/lib/validation/__tests__/secret-formats.test.ts).
- name: Fetch merge_group base SHA (merge_group events only)
if: github.event_name == 'merge_group'
run: git fetch --depth=1 origin ${{ github.event.merge_group.base_sha }}
- name: Refuse if credential-shaped strings appear in diff additions
env:
# Plumb event-specific SHAs through env so the script doesn't
# need conditional `${{ ... }}` interpolation per event type.
# github.event.before/after only exist on push events;
# merge_group has its own base_sha/head_sha; pull_request has
# pull_request.base.sha / pull_request.head.sha.
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
MG_BASE_SHA: ${{ github.event.merge_group.base_sha }}
MG_HEAD_SHA: ${{ github.event.merge_group.head_sha }}
PUSH_BEFORE: ${{ github.event.before }}
PUSH_AFTER: ${{ github.event.after }}
run: |
# Pattern set covers GitHub family (the actual #2090 vector),
# Anthropic / OpenAI / Slack / AWS. Anchored on prefixes with low
# false-positive rates against agent-generated content. Mirror of
# molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh
# — keep aligned.
SECRET_PATTERNS=(
'ghp_[A-Za-z0-9]{36,}' # GitHub PAT (classic)
'ghs_[A-Za-z0-9]{36,}' # GitHub App installation token
'gho_[A-Za-z0-9]{36,}' # GitHub OAuth user-to-server
'ghu_[A-Za-z0-9]{36,}' # GitHub OAuth user
'ghr_[A-Za-z0-9]{36,}' # GitHub OAuth refresh
'github_pat_[A-Za-z0-9_]{82,}' # GitHub fine-grained PAT
'sk-ant-[A-Za-z0-9_-]{40,}' # Anthropic API key
'sk-proj-[A-Za-z0-9_-]{40,}' # OpenAI project key
'sk-svcacct-[A-Za-z0-9_-]{40,}' # OpenAI service-account key
'sk-cp-[A-Za-z0-9_-]{60,}' # MiniMax API key (F1088 vector — caught only after the fact)
'xox[baprs]-[A-Za-z0-9-]{20,}' # Slack tokens
'AKIA[0-9A-Z]{16}' # AWS access key ID
'ASIA[0-9A-Z]{16}' # AWS STS temp access key ID
)
# Determine the diff base. Each event type stores its SHAs in
# a different place — see the env block above.
case "${{ github.event_name }}" in
pull_request)
BASE="$PR_BASE_SHA"
HEAD="$PR_HEAD_SHA"
;;
merge_group)
BASE="$MG_BASE_SHA"
HEAD="$MG_HEAD_SHA"
;;
*)
BASE="$PUSH_BEFORE"
HEAD="$PUSH_AFTER"
;;
esac
# On push events with shallow clones, BASE may be present in
# the event payload but absent from the local object DB
# (fetch-depth=2 doesn't always reach the previous commit
# across true merges). Try fetching it on demand. If the
# fetch fails — e.g. the SHA was force-overwritten — we fall
# through to the empty-BASE branch below, which scans the
# entire tree as if every file were new. Correct, just slow.
if [ -n "$BASE" ] && ! echo "$BASE" | grep -qE '^0+$'; then
if ! git cat-file -e "$BASE" 2>/dev/null; then
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
fi
fi
# Files added or modified in this change.
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$' || ! git cat-file -e "$BASE" 2>/dev/null; then
# New branch / no previous SHA / BASE unreachable — check the
# entire tree as added content. Slower, but correct on first
# push.
CHANGED=$(git ls-tree -r --name-only HEAD)
DIFF_RANGE=""
else
CHANGED=$(git diff --name-only --diff-filter=AM "$BASE" "$HEAD")
DIFF_RANGE="$BASE $HEAD"
fi
if [ -z "$CHANGED" ]; then
echo "No changed files to inspect."
exit 0
fi
# Self-exclude: this workflow file legitimately contains the
# pattern strings as regex literals. Without an exclude it would
# block its own merge.
SELF=".github/workflows/secret-scan.yml"
OFFENDING=""
# `while IFS= read -r` (not `for f in $CHANGED`) so filenames
# containing whitespace don't word-split silently — a path
# with a space would otherwise produce two iterations on
# tokens that aren't real filenames, breaking the
# self-exclude + diff lookup.
while IFS= read -r f; do
[ -z "$f" ] && continue
[ "$f" = "$SELF" ] && continue
if [ -n "$DIFF_RANGE" ]; then
ADDED=$(git diff --no-color --unified=0 "$BASE" "$HEAD" -- "$f" 2>/dev/null | grep -E '^\+[^+]' || true)
else
# No diff range (new branch first push) — scan the full file
# contents as if every line were new.
ADDED=$(cat "$f" 2>/dev/null || true)
fi
[ -z "$ADDED" ] && continue
for pattern in "${SECRET_PATTERNS[@]}"; do
if echo "$ADDED" | grep -qE "$pattern"; then
OFFENDING="${OFFENDING}${f} (matched: ${pattern})\n"
break
fi
done
done <<< "$CHANGED"
if [ -n "$OFFENDING" ]; then
echo "::error::Credential-shaped strings detected in diff additions:"
# `printf '%b' "$OFFENDING"` interprets backslash escapes
# (the literal `\n` we appended above becomes a newline)
# WITHOUT treating OFFENDING as a format string. Plain
# `printf "$OFFENDING"` is a format-string sink: a filename
# containing `%` would be interpreted as a conversion
# specifier, corrupting the error message (or printing
# `%(missing)` artifacts).
printf '%b' "$OFFENDING"
echo ""
echo "The actual matched values are NOT echoed here, deliberately —"
echo "round-tripping a leaked credential into CI logs widens the blast"
echo "radius (logs are searchable + retained)."
echo ""
echo "Recovery:"
echo " 1. Remove the secret from the file. Replace with an env var"
echo " reference (e.g. \${{ secrets.GITHUB_TOKEN }} in workflows,"
echo " process.env.X in code)."
echo " 2. If the credential was already pushed (this PR's commit"
echo " history reaches a public ref), treat it as compromised —"
echo " ROTATE it immediately, do not just remove it. The token"
echo " remains valid in git history forever and may be in any"
echo " log/cache that consumed this branch."
echo " 3. Force-push the cleaned commit (or stack a revert) and"
echo " re-run CI."
echo ""
echo "If the match is a false positive (test fixture, docs example,"
echo "or this workflow's own regex literals): use a clearly-fake"
echo "placeholder like ghs_EXAMPLE_DO_NOT_USE that doesn't satisfy"
echo "the length suffix, OR add the file path to the SELF exclude"
echo "list in this workflow with a short reason."
echo ""
echo "Mirror of the regex set lives in the runtime's bundled"
echo "pre-commit hook (molecule-ai-workspace-runtime:"
echo "molecule_runtime/scripts/pre-commit-checks.sh) — keep aligned."
exit 1
fi
echo "✓ No credential-shaped strings in this change."

View File

@ -284,7 +284,7 @@ cp .env.example .env
./infra/scripts/setup.sh
# Boots Postgres (:5432), Redis (:6379), Langfuse (:3001),
# and Temporal (:7233 gRPC, :8233 UI) on the shared
# `molecule-core-net` Docker network. Temporal runs with
# `molecule-monorepo-net` Docker network. Temporal runs with
# no auth on localhost — dev-only; production must gate it.
#
# Also populates the template/plugin registry by cloning every repo

View File

@ -283,7 +283,7 @@ cp .env.example .env
./infra/scripts/setup.sh
# Boots Postgres (:5432), Redis (:6379), Langfuse (:3001),
# and Temporal (:7233 gRPC, :8233 UI), all on the shared
# `molecule-core-net` Docker network. Temporal has no auth by default,
# `molecule-monorepo-net` Docker network. Temporal has no auth by default,
# local-dev only; production must add mTLS / an API key.
#
# It also pulls every template/plugin repo from manifest.json into

View File

@ -1,4 +1,4 @@
FROM node:22-alpine@sha256:cb15fca92530d7ac113467696cf1001208dac49c3c64355fd1348c11a88ddf8f AS builder
FROM node:22-alpine AS builder
WORKDIR /app
COPY package.json package-lock.json* ./
# `npm ci` (not `install`) for lockfile-exact reproducibility.
@ -15,7 +15,7 @@ ENV NEXT_PUBLIC_WS_URL=$NEXT_PUBLIC_WS_URL
ENV NEXT_PUBLIC_ADMIN_TOKEN=$NEXT_PUBLIC_ADMIN_TOKEN
RUN npm run build
FROM node:22-alpine@sha256:cb15fca92530d7ac113467696cf1001208dac49c3c64355fd1348c11a88ddf8f
FROM node:22-alpine
WORKDIR /app
COPY --from=builder /app/.next/standalone ./
COPY --from=builder /app/.next/static ./.next/static

View File

@ -119,7 +119,6 @@
"integrity": "sha512-9NhCeYjq9+3uxgdtp20LSiJXJvN0FeCtNGpJxuMFZ1Kv3cWUNb6DOhJwUvcVCzKGR66cw4njwM6hrJLqgOwbcw==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@babel/helper-validator-identifier": "^7.28.5",
"js-tokens": "^4.0.0",
@ -300,6 +299,7 @@
}
],
"license": "MIT",
"peer": true,
"engines": {
"node": ">=20.19.0"
},
@ -348,6 +348,7 @@
}
],
"license": "MIT",
"peer": true,
"engines": {
"node": ">=20.19.0"
}
@ -359,6 +360,7 @@
"dev": true,
"license": "MIT",
"optional": true,
"peer": true,
"dependencies": {
"@emnapi/wasi-threads": "1.2.1",
"tslib": "^2.4.0"
@ -370,6 +372,7 @@
"integrity": "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==",
"license": "MIT",
"optional": true,
"peer": true,
"dependencies": {
"tslib": "^2.4.0"
}
@ -1126,6 +1129,7 @@
"integrity": "sha512-PG6q63nQg5c9rIi4/Z5lR5IVF7yU5MqmKaPOe0HSc0O2cX1fPi96sUQu5j7eo4gKCkB2AnNGoWt7y4/Xx3Kcqg==",
"devOptional": true,
"license": "Apache-2.0",
"peer": true,
"dependencies": {
"playwright": "1.59.1"
},
@ -2406,8 +2410,7 @@
"resolved": "https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz",
"integrity": "sha512-rfT93uj5s0PRL7EzccGMs3brplhcrghnDoV26NqKhCAS1hVo+WdNsPvE/yb6ilfr5hi2MEk6d5EWJTKdxg8jVw==",
"dev": true,
"license": "MIT",
"peer": true
"license": "MIT"
},
"node_modules/@types/chai": {
"version": "5.2.3",
@ -2530,6 +2533,7 @@
"integrity": "sha512-+qIYRKdNYJwY3vRCZMdJbPLJAtGjQBudzZzdzwQYkEPQd+PJGixUL5QfvCLDaULoLv+RhT3LDkwEfKaAkgSmNQ==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"undici-types": "~7.19.0"
}
@ -2539,6 +2543,7 @@
"resolved": "https://registry.npmjs.org/@types/react/-/react-19.2.14.tgz",
"integrity": "sha512-ilcTH/UniCkMdtexkoCN0bI7pMcJDvmQFPvuPvmEaYA/NSfFTAgdUSLAoVjaRJm7+6PvcM+q1zYOwS4wTYMF9w==",
"license": "MIT",
"peer": true,
"dependencies": {
"csstype": "^3.2.2"
}
@ -2549,6 +2554,7 @@
"integrity": "sha512-jp2L/eY6fn+KgVVQAOqYItbF0VY/YApe5Mz2F0aykSO8gx31bYCZyvSeYxCHKvzHG5eZjc+zyaS5BrBWya2+kQ==",
"devOptional": true,
"license": "MIT",
"peer": true,
"peerDependencies": {
"@types/react": "^19.2.0"
}
@ -2597,6 +2603,7 @@
"integrity": "sha512-38C0/Ddb7HcRG0Z4/DUem8x57d2p9jYgp18mkaYswEOQBGsI1CG4f/hjm0ZCeaJfWhSZ4k7jgs29V1Zom7Ki9A==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@bcoe/v8-coverage": "^1.0.2",
"@vitest/utils": "4.1.5",
@ -2807,7 +2814,6 @@
"integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
"dev": true,
"license": "MIT",
"peer": true,
"engines": {
"node": ">=8"
}
@ -2818,7 +2824,6 @@
"integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==",
"dev": true,
"license": "MIT",
"peer": true,
"engines": {
"node": ">=10"
},
@ -3111,6 +3116,7 @@
"resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz",
"integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==",
"license": "ISC",
"peer": true,
"engines": {
"node": ">=12"
}
@ -3253,8 +3259,7 @@
"resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.5.16.tgz",
"integrity": "sha512-X7BJ2yElsnOJ30pZF4uIIDfBEVgF4XEBxL9Bxhy6dnrm5hkzqmsWHGTiHqRiITNhMyFLyAiWndIJP7Z1NTteDg==",
"dev": true,
"license": "MIT",
"peer": true
"license": "MIT"
},
"node_modules/enhanced-resolve": {
"version": "5.21.0",
@ -3600,8 +3605,7 @@
"resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
"integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
"dev": true,
"license": "MIT",
"peer": true
"license": "MIT"
},
"node_modules/jsdom": {
"version": "29.1.1",
@ -3609,6 +3613,7 @@
"integrity": "sha512-ECi4Fi2f7BdJtUKTflYRTiaMxIB0O6zfR1fX0GXpUrf6flp8QIYn1UT20YQqdSOfk2dfkCwS8LAFoJDEppNK5Q==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@asamuzakjp/css-color": "^5.1.11",
"@asamuzakjp/dom-selector": "^7.1.1",
@ -3931,7 +3936,6 @@
"integrity": "sha512-h5bgJWpxJNswbU7qCrV0tIKQCaS3blPDrqKWx+QxzuzL1zGUzij9XCWLrSLsJPu5t+eWA/ycetzYAO5IOMcWAQ==",
"dev": true,
"license": "MIT",
"peer": true,
"bin": {
"lz-string": "bin/bin.js"
}
@ -5006,6 +5010,7 @@
"integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==",
"dev": true,
"license": "MIT",
"peer": true,
"engines": {
"node": ">=12"
},
@ -5093,7 +5098,6 @@
"integrity": "sha512-Qb1gy5OrP5+zDf2Bvnzdl3jsTf1qXVMazbvCoKhtKqVs4/YK4ozX4gKQJJVyNe+cajNPn0KoC0MC3FUmaHWEmQ==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"ansi-regex": "^5.0.1",
"ansi-styles": "^5.0.0",
@ -5128,6 +5132,7 @@
"resolved": "https://registry.npmjs.org/react/-/react-19.2.5.tgz",
"integrity": "sha512-llUJLzz1zTUBrskt2pwZgLq59AemifIftw4aB7JxOqf1HY2FDaGDxgwpAPVzHU1kdWabH7FauP4i1oEeer2WCA==",
"license": "MIT",
"peer": true,
"engines": {
"node": ">=0.10.0"
}
@ -5137,6 +5142,7 @@
"resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.2.5.tgz",
"integrity": "sha512-J5bAZz+DXMMwW/wV3xzKke59Af6CHY7G4uYLN1OvBcKEsWOs4pQExj86BBKamxl/Ik5bx9whOrvBlSDfWzgSag==",
"license": "MIT",
"peer": true,
"dependencies": {
"scheduler": "^0.27.0"
},
@ -5149,8 +5155,7 @@
"resolved": "https://registry.npmjs.org/react-is/-/react-is-17.0.2.tgz",
"integrity": "sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w==",
"dev": true,
"license": "MIT",
"peer": true
"license": "MIT"
},
"node_modules/react-markdown": {
"version": "10.1.0",
@ -5598,7 +5603,8 @@
"version": "4.2.4",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.2.4.tgz",
"integrity": "sha512-HhKppgO81FQof5m6TEnuBWCZGgfRAWbaeOaGT00KOy/Pf/j6oUihdvBpA7ltCeAvZpFhW3j0PTclkxsd4IXYDA==",
"license": "MIT"
"license": "MIT",
"peer": true
},
"node_modules/tapable": {
"version": "2.3.3",
@ -5940,6 +5946,7 @@
"integrity": "sha512-rZuUu9j6J5uotLDs+cAA4O5H4K1SfPliUlQwqa6YEwSrWDZzP4rhm00oJR5snMewjxF5V/K3D4kctsUTsIU9Mw==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"lightningcss": "^1.32.0",
"picomatch": "^4.0.4",
@ -6033,6 +6040,7 @@
"integrity": "sha512-9Xx1v3/ih3m9hN+SbfkUyy0JAs72ap3r7joc87XL6jwF0jGg6mFBvQ1SrwaX+h8BlkX6Hz9shdd1uo6AF+ZGpg==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@vitest/expect": "4.1.5",
"@vitest/mocker": "4.1.5",

View File

@ -274,17 +274,4 @@ body {
.react-flow__node {
animation: none !important;
}
/* React Flow Controls toolbar buttons — WCAG 2.4.7 focus-visible */
.react-flow__controls button:focus-visible {
outline: 2px solid var(--accent, #3b5bdb);
outline-offset: 2px;
}
/* React Flow Minimap nodes — WCAG 2.4.7 focus-visible */
.react-flow__minimap:focus-visible,
.react-flow__minimap svg:focus-visible {
outline: 2px solid var(--accent, #3b5bdb);
outline-offset: 2px;
}
}

View File

@ -1,22 +1,6 @@
import type { Metadata } from "next";
import { Inter, JetBrains_Mono } from "next/font/google";
import { cookies, headers } from "next/headers";
import "./globals.css";
// Self-hosted at build time → CSP-safe (font-src 'self' covers them
// because Next.js serves the .woff2 from /_next/static). Exposed as
// CSS variables so the mobile palette can reference them without
// importing this module.
const interFont = Inter({
subsets: ["latin"],
display: "swap",
variable: "--font-inter",
});
const monoFont = JetBrains_Mono({
subsets: ["latin"],
display: "swap",
variable: "--font-jetbrains",
});
import { AuthGate } from "@/components/AuthGate";
import { CookieConsent } from "@/components/CookieConsent";
import { PurchaseSuccessModal } from "@/components/PurchaseSuccessModal";
@ -95,7 +79,7 @@ export default async function RootLayout({
dangerouslySetInnerHTML={{ __html: themeBootScript }}
/>
</head>
<body className={`bg-surface text-ink ${interFont.variable} ${monoFont.variable}`}>
<body className="bg-surface text-ink">
<ThemeProvider initialTheme={theme}>
{/* AuthGate is a client component; it checks the session on mount
and bounces anonymous users to the control plane's login page

View File

@ -354,7 +354,7 @@ function OrgCTA({ org }: { org: Org }) {
);
}
// provisioning / unknown — non-interactive
return <span className="text-sm text-ink-mid">{org.status}</span>;
return <span className="text-sm text-ink-soft">{org.status}</span>;
}
function EmptyState({ banner }: { banner?: React.ReactNode }) {
@ -420,7 +420,7 @@ function CreateOrgForm({ onCreated }: { onCreated: (slug: string) => void }) {
aria-describedby="org-slug-hint"
className="mt-1 w-full rounded border border-line bg-surface-card px-3 py-2 text-sm text-ink"
/>
<p id="org-slug-hint" className="mt-1 text-xs text-ink-mid">
<p id="org-slug-hint" className="mt-1 text-xs text-ink-soft">
Lowercase letters, numbers, and hyphens only. Cannot be changed later.
</p>
</div>

View File

@ -4,7 +4,6 @@ import { useEffect, useState } from "react";
import { Canvas } from "@/components/Canvas";
import { Legend } from "@/components/Legend";
import { CommunicationOverlay } from "@/components/CommunicationOverlay";
import { MobileApp } from "@/components/mobile/MobileApp";
import { Spinner } from "@/components/Spinner";
import { connectSocket, disconnectSocket } from "@/store/socket";
import { useCanvasStore } from "@/store/canvas";
@ -15,23 +14,6 @@ export default function Home() {
const hydrationError = useCanvasStore((s) => s.hydrationError);
const setHydrationError = useCanvasStore((s) => s.setHydrationError);
const [hydrating, setHydrating] = useState(true);
// < 640px viewport renders the dedicated mobile shell instead of the
// desktop canvas. Tri-state: `null` until matchMedia has resolved,
// then `true|false`. While null we keep the existing loading spinner
// up — that way mobile devices never flash the desktop tree (which
// they would if we defaulted to `false` and only flipped post-mount).
const [isMobile, setIsMobile] = useState<boolean | null>(null);
useEffect(() => {
if (typeof window === "undefined" || !window.matchMedia) {
setIsMobile(false);
return;
}
const mq = window.matchMedia("(max-width: 639px)");
const update = () => setIsMobile(mq.matches);
update();
mq.addEventListener("change", update);
return () => mq.removeEventListener("change", update);
}, []);
// Distinct from hydrationError: platform-down is its own UX path
// (different copy, different action — the user's next step is to
// check local services, not to retry the API call). Tracked
@ -69,15 +51,12 @@ export default function Home() {
};
}, []);
// Hold the spinner while data hydrates OR while the viewport
// resolution hasn't settled yet (avoids a desktop-tree flash on
// mobile devices between SSR-paint and matchMedia).
if (hydrating || isMobile === null) {
if (hydrating) {
return (
<div className="fixed inset-0 flex items-center justify-center bg-surface">
<div role="status" aria-live="polite" className="flex flex-col items-center gap-3">
<Spinner size="lg" />
<span className="text-xs text-ink-mid">Loading canvas...</span>
<span className="text-xs text-ink-soft">Loading canvas...</span>
</div>
</div>
);
@ -87,32 +66,6 @@ export default function Home() {
return <PlatformDownDiagnostic />;
}
if (isMobile) {
return (
<>
<MobileApp />
{hydrationError && (
<div
role="alert"
data-testid="hydration-error"
className="fixed inset-0 flex flex-col items-center justify-center bg-surface text-ink-mid gap-4 z-[9999] px-6"
>
<p className="text-ink-mid text-sm text-center">{hydrationError}</p>
<button
onClick={() => {
setHydrationError(null);
window.location.reload();
}}
className="px-4 py-2 bg-accent-strong hover:bg-accent text-white rounded-md text-sm"
>
Retry
</button>
</div>
)}
</>
);
}
return (
<>
<Canvas />
@ -166,11 +119,11 @@ function PlatformDownDiagnostic() {
Most common cause on a dev host: one of those services stopped.
</p>
<div className="bg-surface-sunken/80 border border-line/50 rounded-lg px-4 py-3 max-w-lg w-full">
<div className="text-[10px] uppercase tracking-wider text-ink-mid mb-2">Try first</div>
<div className="text-[10px] uppercase tracking-wider text-ink-soft mb-2">Try first</div>
<pre className="text-[12px] text-ink-mid font-mono whitespace-pre-wrap leading-relaxed">{`brew services start postgresql@14
brew services start redis`}</pre>
</div>
<p className="text-[11px] text-ink-mid max-w-lg text-center">
<p className="text-[11px] text-ink-soft max-w-lg text-center">
If both are running, check <code className="font-mono">/tmp/molecule-server.log</code> for
the underlying error. If you&apos;re on hosted SaaS, this is a platform incident; try again in a moment.
</p>

View File

@ -55,13 +55,13 @@ export default function PricingPage() {
</a>
.
</p>
<p className="mt-6 text-sm text-ink-mid">
<p className="mt-6 text-sm text-ink-soft">
Prices shown in USD. Flat-rate per org; no per-seat fees on any paid tier.
Enterprise / self-hosted licensing available; contact us.
</p>
</section>
<footer className="mx-auto mt-20 max-w-5xl border-t border-line px-6 py-6 text-center text-sm text-ink-mid">
<footer className="mx-auto mt-20 max-w-5xl border-t border-line px-6 py-6 text-center text-sm text-ink-soft">
<p>
© {new Date().getFullYear()} Molecule AI, Inc. ·{" "}
<a href="/legal/terms" className="hover:text-ink-mid">

View File

@ -127,7 +127,7 @@ export function AuditTrailPanel({ workspaceId }: Props) {
if (loading) {
return (
<div className="flex items-center justify-center h-32">
<span className="text-xs text-ink-mid">Loading audit trail</span>
<span className="text-xs text-ink-soft">Loading audit trail</span>
</div>
);
}
@ -142,10 +142,10 @@ export function AuditTrailPanel({ workspaceId }: Props) {
key={f.id}
onClick={() => setFilter(f.id)}
aria-pressed={filter === f.id}
className={`px-2 py-1 text-[10px] rounded-md font-medium transition-all shrink-0 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface ${
className={`px-2 py-1 text-[10px] rounded-md font-medium transition-all shrink-0 ${
filter === f.id
? "bg-surface-card text-ink ring-1 ring-zinc-600"
: "text-ink-mid hover:text-ink-mid hover:bg-surface-card/60"
: "text-ink-soft hover:text-ink-mid hover:bg-surface-card/60"
}`}
>
{f.label}
@ -155,7 +155,7 @@ export function AuditTrailPanel({ workspaceId }: Props) {
<button
type="button"
onClick={loadEntries}
className="px-2 py-1 text-[10px] bg-surface-card hover:bg-surface-card text-ink-mid rounded transition-colors shrink-0 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
className="px-2 py-1 text-[10px] bg-surface-card hover:bg-surface-card text-ink-mid rounded transition-colors shrink-0"
aria-label="Refresh audit trail"
>
@ -174,9 +174,9 @@ export function AuditTrailPanel({ workspaceId }: Props) {
{entries.length === 0 ? (
/* Empty state */
<div className="flex flex-col items-center justify-center py-16 gap-3 text-center">
<span className="text-4xl text-ink-mid" aria-hidden="true"></span>
<span className="text-4xl text-ink-soft" aria-hidden="true"></span>
<p className="text-sm font-medium text-ink-mid">No audit events yet</p>
<p className="text-[11px] text-ink-mid max-w-[200px] leading-relaxed">
<p className="text-[11px] text-ink-soft max-w-[200px] leading-relaxed">
Delegation, decision, gate, and human-in-the-loop events will appear here.
</p>
</div>
@ -195,7 +195,7 @@ export function AuditTrailPanel({ workspaceId }: Props) {
type="button"
onClick={loadMore}
disabled={loadingMore}
className="px-4 py-2 text-[11px] bg-surface-card hover:bg-surface-card disabled:opacity-50 disabled:cursor-not-allowed text-ink-mid rounded-lg transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
className="px-4 py-2 text-[11px] bg-surface-card hover:bg-surface-card disabled:opacity-50 disabled:cursor-not-allowed text-ink-mid rounded-lg transition-colors"
>
{loadingMore ? "Loading…" : "Load more"}
</button>
@ -203,7 +203,7 @@ export function AuditTrailPanel({ workspaceId }: Props) {
)}
{/* Entry count footer */}
<p className="mt-3 text-center text-[9px] text-ink-mid">
<p className="mt-3 text-center text-[9px] text-ink-soft">
{entries.length} event{entries.length !== 1 ? "s" : ""} loaded
{cursor ? " · more available" : " · all loaded"}
</p>
@ -265,7 +265,7 @@ export function AuditEntryRow({ entry, now }: AuditEntryRowProps) {
)}
{/* Relative timestamp */}
<span className="shrink-0 text-[9px] text-ink-mid">
<span className="shrink-0 text-[9px] text-ink-soft">
{formatAuditRelativeTime(entry.created_at, now)}
</span>
</div>

View File

@ -43,9 +43,7 @@ export function BundleDropZone() {
const handleDragOver = useCallback((e: React.DragEvent) => {
e.preventDefault();
e.stopPropagation();
// Guard against jsdom (no File API / dataTransfer.types) and other
// environments where dataTransfer may be null/undefined.
if (e.dataTransfer?.types?.includes("Files")) {
if (e.dataTransfer.types.includes("Files")) {
setIsDragging(true);
}
}, []);
@ -60,7 +58,6 @@ export function BundleDropZone() {
e.preventDefault();
e.stopPropagation();
setIsDragging(false);
if (!e.dataTransfer?.files?.length) return;
const file = Array.from(e.dataTransfer.files).find(
(f) => f.name.endsWith(".bundle.json")
);
@ -128,7 +125,7 @@ export function BundleDropZone() {
<div className="bg-surface-sunken/95 border border-accent/50 rounded-2xl px-8 py-6 shadow-2xl text-center">
<div className="text-3xl mb-2" aria-hidden="true">📦</div>
<div className="text-sm font-semibold text-ink">Drop Bundle to Import</div>
<div className="text-xs text-ink-mid mt-1">.bundle.json files only</div>
<div className="text-xs text-ink-soft mt-1">.bundle.json files only</div>
</div>
</div>
)}

View File

@ -1,6 +1,6 @@
"use client";
import { useCallback, useEffect, useMemo, useRef } from "react";
import { useCallback, useMemo } from "react";
import {
ReactFlow,
ReactFlowProvider,
@ -187,23 +187,6 @@ function CanvasInner() {
// Pan-to-node / zoom-to-team CustomEvent listeners + viewport save.
const { onMoveEnd } = useCanvasViewport();
// Screen-reader announcements — read liveAnnouncement from the store and
// immediately clear it so the same announcement doesn't re-fire on
// re-render. Using a ref avoids a setState loop while keeping the
// effect reactive to new announcement strings.
const liveAnnouncement = useCanvasStore((s) => s.liveAnnouncement);
const clearAnnouncement = useCanvasStore((s) => s.setLiveAnnouncement);
const prevAnnouncement = useRef("");
useEffect(() => {
if (liveAnnouncement && liveAnnouncement !== prevAnnouncement.current) {
prevAnnouncement.current = liveAnnouncement;
// Small delay so the DOM update lands before clearing, giving
// screen readers time to pick up the new text.
const timer = setTimeout(() => clearAnnouncement(""), 500);
return () => clearTimeout(timer);
}
}, [liveAnnouncement, clearAnnouncement]);
// Delete-confirmation lives in the store so the dialog survives ContextMenu
// unmounting — the prior local-in-ContextMenu state raced with the menu's
// outside-click handler.
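// A minimal sketch of that store slice (assumed shape; the real
// useCanvasStore fields aren't visible in this hunk):
//   pendingDeleteId: null as string | null,
//   requestDelete: (id: string | null) => set({ pendingDeleteId: id }),
// ContextMenu calls requestDelete(nodeId) and may unmount immediately;
// the confirm dialog reads pendingDeleteId from the store and stays mounted.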
@ -308,9 +291,7 @@ function CanvasInner() {
showInteractive={false}
/>
<MiniMap
// hidden < sm: minimap eats ~30% of a phone screen and
// overlaps with the New Workspace FAB at bottom-right.
className="!bg-surface-sunken/90 !border-line/50 !rounded-lg !shadow-xl !shadow-black/20 !hidden sm:!block"
className="!bg-surface-sunken/90 !border-line/50 !rounded-lg !shadow-xl !shadow-black/20"
// Mask dims off-viewport areas; tint matches the surface so
// the dimming doesn't show as a black bar in light mode.
maskColor={resolvedTheme === "dark" ? "rgba(0, 0, 0, 0.7)" : "rgba(232, 226, 211, 0.7)"}
@ -345,21 +326,11 @@ function CanvasInner() {
<DropTargetBadge />
</ReactFlow>
{/* Screen-reader live region announces workspace count on initial load and
live status updates from WebSocket events (online, offline, provisioning, etc.).
The liveAnnouncement text is cleared after the screen reader has had time
to read it so the same message doesn't re-announce on re-render. */}
<div
role="status"
aria-live="polite"
aria-atomic="true"
className="sr-only"
>
{liveAnnouncement || (
nodes.filter((n) => !n.parentId).length === 0
? "No workspaces on canvas"
: `${nodes.filter((n) => !n.parentId).length} workspace${nodes.filter((n) => !n.parentId).length !== 1 ? "s" : ""} on canvas`
)}
{/* Screen-reader live region: announces workspace count on canvas load or change */}
<div role="status" aria-live="polite" className="sr-only">
{nodes.filter((n) => !n.parentId).length === 0
? "No workspaces on canvas"
: `${nodes.filter((n) => !n.parentId).length} workspace${nodes.filter((n) => !n.parentId).length !== 1 ? "s" : ""} on canvas`}
</div>
{nodes.length === 0 && <EmptyState />}

View File

@ -209,7 +209,7 @@ export function CommunicationOverlay() {
type="button"
onClick={() => setVisible(true)}
aria-label="Show communications panel"
className="fixed top-16 right-4 z-30 px-3 py-1.5 bg-surface-sunken/90 border border-line/50 rounded-lg text-[10px] text-ink-mid hover:text-ink transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
className="fixed top-16 right-4 z-30 px-3 py-1.5 bg-surface-sunken/90 border border-line/50 rounded-lg text-[10px] text-ink-mid hover:text-ink transition-colors"
>
<span aria-hidden="true"> </span>{comms.length > 0 ? `${comms.length} comms` : "Communications"}
</button>
@ -226,7 +226,7 @@ export function CommunicationOverlay() {
type="button"
onClick={() => setVisible(false)}
aria-label="Close communications panel"
className="text-ink-mid hover:text-ink-mid text-xs focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
className="text-ink-soft hover:text-ink-mid text-xs"
>
<span aria-hidden="true"></span>
</button>
@ -268,7 +268,7 @@ export function CommunicationOverlay() {
</div>
</div>
{c.summary && (
<div className="text-ink-mid truncate mt-0.5 pl-4">{c.summary}</div>
<div className="text-ink-soft truncate mt-0.5 pl-4">{c.summary}</div>
)}
{c.durationMs && (
<div className="text-ink-mid pl-4">{c.durationMs}ms</div>

View File

@ -105,12 +105,8 @@ export function ConfirmDialog({
// (e.g. parents with transform, filter, will-change that break position:fixed).
return createPortal(
<div className="fixed inset-0 z-[9999] flex items-center justify-center">
{/* Backdrop — interactive dismiss area; accessible name for screen readers (WCAG 4.1.2) */}
<div
className="absolute inset-0 bg-black/60 backdrop-blur-sm cursor-pointer"
aria-label="Dismiss dialog"
onClick={onCancel}
/>
{/* Backdrop */}
<div className="absolute inset-0 bg-black/60 backdrop-blur-sm" onClick={onCancel} />
{/* Dialog — role="dialog" + aria-modal prevent interaction with background */}
<div

View File

@ -90,11 +90,7 @@ export function ConsoleModal({ workspaceId, workspaceName, open, onClose }: Prop
return createPortal(
<div className="fixed inset-0 z-[9999] flex items-center justify-center">
<div
className="absolute inset-0 bg-black/70 backdrop-blur-sm cursor-pointer"
onClick={onClose}
aria-label="Close terminal"
/>
<div aria-hidden="true" className="absolute inset-0 bg-black/70 backdrop-blur-sm" onClick={onClose} />
<div
role="dialog"
aria-modal="true"
@ -107,7 +103,7 @@ export function ConsoleModal({ workspaceId, workspaceName, open, onClose }: Prop
EC2 console output
</h3>
{workspaceName && (
<div className="text-[11px] text-ink-mid mt-0.5 truncate max-w-[600px]">
<div className="text-[11px] text-ink-soft mt-0.5 truncate max-w-[600px]">
{workspaceName}
</div>
)}
@ -128,7 +124,7 @@ export function ConsoleModal({ workspaceId, workspaceName, open, onClose }: Prop
<div className="flex-1 overflow-auto bg-black/80 p-4">
{loading && (
<div className="text-[12px] text-ink-mid" data-testid="console-loading">
<div className="text-[12px] text-ink-soft" data-testid="console-loading">
Loading console output
</div>
)}
@ -169,7 +165,7 @@ export function ConsoleModal({ workspaceId, workspaceName, open, onClose }: Prop
showToast("Copy requires HTTPS — please select and copy manually", "info");
}
}}
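// (Shape of the handler above, assuming the standard Clipboard API:
// navigator.clipboard is undefined outside secure contexts, so plain-HTTP
// dev hosts fall through to the toast rather than throwing.)
//   if (navigator.clipboard) await navigator.clipboard.writeText(text);
//   else showToast("Copy requires HTTPS — please select and copy manually", "info");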
className="px-3 py-1.5 text-[11px] text-ink-mid hover:text-ink bg-surface-card hover:bg-surface-elevated border border-line hover:border-line-soft rounded-lg transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="px-3 py-1.5 text-[11px] text-ink-mid hover:text-ink bg-surface-card hover:bg-surface-elevated border border-line hover:border-line-soft rounded-lg transition-colors focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/60 focus-visible:ring-offset-2 focus-visible:ring-offset-surface"
>
Copy
</button>

View File

@ -311,7 +311,7 @@ export function ContextMenu() {
aria-hidden="true"
className={`w-1.5 h-1.5 rounded-full ${statusDotClass(contextMenu.nodeData.status)}`}
/>
<span className="text-[10px] text-ink-mid">{contextMenu.nodeData.status}</span>
<span className="text-[10px] text-ink-soft">{contextMenu.nodeData.status}</span>
</div>
</div>

View File

@ -13,8 +13,7 @@ interface Props {
onClose: () => void;
}
/** Exported for unit testing — see ConversationTraceModal.test.ts */
export function extractMessageText(body: Record<string, unknown> | null): string {
function extractMessageText(body: Record<string, unknown> | null): string {
if (!body) return "";
try {
// Simple task format from MCP server: {task: "..."}
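// Usage sketch (hypothetical payloads; the {task} shape is the format named
// above, and null → "" is the guard on the first line):
//   extractMessageText(null)                       // ""
//   extractMessageText({ task: "triage inbox" })   // "triage inbox"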
@ -107,7 +106,7 @@ export function ConversationTraceModal({ open, workspaceId: _workspaceId, onClos
<Dialog.Title className="text-sm font-semibold text-ink">
Conversation Trace
</Dialog.Title>
<p className="text-[10px] text-ink-mid mt-0.5">
<p className="text-[10px] text-ink-soft mt-0.5">
{entries.length} events across all workspaces
</p>
</div>
@ -115,7 +114,7 @@ export function ConversationTraceModal({ open, workspaceId: _workspaceId, onClos
<button
type="button"
aria-label="Close conversation trace"
className="text-ink-mid hover:text-ink-mid text-lg px-2 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
className="text-ink-soft hover:text-ink-mid text-lg px-2"
>
</button>
@ -125,13 +124,13 @@ export function ConversationTraceModal({ open, workspaceId: _workspaceId, onClos
{/* Timeline */}
<div className="flex-1 overflow-y-auto px-5 py-4">
{loading && (
<div className="text-xs text-ink-mid text-center py-8">
<div className="text-xs text-ink-soft text-center py-8">
Loading trace from all workspaces...
</div>
)}
{!loading && entries.length === 0 && (
<div className="text-xs text-ink-mid text-center py-8">
<div className="text-xs text-ink-soft text-center py-8">
No activity found
</div>
)}
@ -251,7 +250,7 @@ export function ConversationTraceModal({ open, workspaceId: _workspaceId, onClos
{/* Message content — show request and/or response */}
{requestText && (
<div className="mt-1.5 bg-surface/60 border border-line/50 rounded-lg px-3 py-2 max-h-32 overflow-y-auto">
<div className="text-[8px] text-ink-mid uppercase mb-1">
<div className="text-[8px] text-ink-soft uppercase mb-1">
{isSend ? "Task" : "Request"}
</div>
<div className="text-[10px] text-ink-mid whitespace-pre-wrap break-words leading-relaxed">
@ -286,7 +285,7 @@ export function ConversationTraceModal({ open, workspaceId: _workspaceId, onClos
<Dialog.Close asChild>
<button
type="button"
className="px-4 py-1.5 text-[12px] bg-surface-card hover:bg-surface-card text-ink-mid rounded-lg transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
className="px-4 py-1.5 text-[12px] bg-surface-card hover:bg-surface-card text-ink-mid rounded-lg transition-colors"
>
Close
</button>

View File

@ -338,7 +338,7 @@ export function CreateWorkspaceButton() {
<Dialog.Title className="text-base font-semibold text-ink mb-1">
Create Workspace
</Dialog.Title>
<p className="text-xs text-ink-mid mb-5">
<p className="text-xs text-ink-soft mb-5">
Add a new workspace node to the canvas
</p>
@ -376,7 +376,7 @@ export function CreateWorkspaceButton() {
/>
<div className="text-xs">
<div className="text-ink font-medium">External agent (bring your own compute)</div>
<div className="text-ink-mid mt-0.5">
<div className="text-ink-soft mt-0.5">
Skip the container. We&apos;ll return a workspace_id + auth token + ready-to-paste snippet so an agent running on your laptop / server / CI can register via A2A.
</div>
</div>
@ -411,7 +411,7 @@ export function CreateWorkspaceButton() {
tabIndex={tier === t.value ? 0 : -1}
onClick={() => setTier(t.value)}
onKeyDown={(e) => handleRadioKeyDown(e, idx)}
className={`py-2 rounded-lg text-center transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 ${
className={`py-2 rounded-lg text-center transition-colors ${
tier === t.value
? "bg-accent-strong/20 border border-accent/50 text-accent"
: "bg-surface-card/60 border border-line/40 text-ink-mid hover:text-ink-mid hover:border-line"
@ -456,7 +456,7 @@ export function CreateWorkspaceButton() {
<p className="text-[11px] font-semibold text-violet-400 uppercase tracking-wide">
Hermes Provider
</p>
<p className="text-[11px] text-ink-mid -mt-1">
<p className="text-[11px] text-ink-soft -mt-1">
Choose the AI provider and paste your API key. The key is
stored as an encrypted workspace secret.
</p>
@ -534,7 +534,7 @@ export function CreateWorkspaceButton() {
(m) => <option key={m} value={m} />,
)}
</datalist>
<p className="text-[10px] text-ink-mid mt-1">
<p className="text-[10px] text-ink-soft mt-1">
Slug determines which provider hermes routes to at install time.
</p>
</div>
@ -626,7 +626,7 @@ function InputField({
className={`w-full bg-surface-card/60 border border-line/50 rounded-lg px-3 py-2 text-sm text-ink placeholder-ink-soft focus:outline-none focus:border-accent/60 focus:ring-1 focus:ring-accent/20 transition-colors ${mono ? "font-mono text-xs" : ""}`}
/>
{helper && (
<p className="mt-1 text-xs text-ink-mid">{helper}</p>
<p className="mt-1 text-xs text-ink-soft">{helper}</p>
)}
</div>
);

View File

@ -81,11 +81,7 @@ export function DeleteCascadeConfirmDialog({
return createPortal(
<div className="fixed inset-0 z-[9999] flex items-center justify-center">
{/* Backdrop */}
<div
className="absolute inset-0 bg-black/60 backdrop-blur-sm cursor-pointer"
onClick={onCancel}
aria-label="Dismiss dialog"
/>
<div aria-hidden="true" className="absolute inset-0 bg-black/60 backdrop-blur-sm" onClick={onCancel} />
{/* Dialog */}
<div

View File

@ -129,11 +129,11 @@ export function EmptyState() {
T{t.tier}
</span>
</div>
<p className="text-[11px] text-ink-mid line-clamp-2 leading-relaxed">
<p className="text-[11px] text-ink-soft line-clamp-2 leading-relaxed">
{t.description || "No description"}
</p>
{t.skill_count > 0 && (
<p className="text-[9px] text-ink-mid mt-1.5">
<p className="text-[9px] text-ink-soft mt-1.5">
{t.skill_count} skill{t.skill_count !== 1 ? "s" : ""}
{t.model ? ` · ${t.model}` : ""}
</p>
@ -174,10 +174,10 @@ export function EmptyState() {
<div className="mt-5 pt-4 border-t border-line/50">
<div className="flex items-center justify-center gap-6 text-[10px] text-ink-mid">
<span>Drag to nest workspaces into teams</span>
<span className="text-ink-mid">|</span>
<span className="text-ink-soft">|</span>
<span>Right-click for actions</span>
<span className="text-ink-mid">|</span>
<span>Press <kbd className="px-1 py-0.5 bg-surface-card rounded text-ink-mid font-mono">&#8984;K</kbd> to search</span>
<span className="text-ink-soft">|</span>
<span>Press <kbd className="px-1 py-0.5 bg-surface-card rounded text-ink-soft font-mono">&#8984;K</kbd> to search</span>
</div>
</div>
</div>

View File

@ -83,7 +83,7 @@ export class ErrorBoundary extends React.Component<
<button
type="button"
onClick={this.handleReload}
className="rounded-lg bg-accent-strong hover:bg-accent px-5 py-2 text-sm font-medium text-white transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-2 focus-visible:ring-offset-surface"
className="rounded-lg bg-accent-strong hover:bg-accent px-5 py-2 text-sm font-medium text-white transition-colors"
>
Reload
</button>
@ -93,7 +93,7 @@ export class ErrorBoundary extends React.Component<
e.preventDefault();
this.handleReport();
}}
className="rounded-lg border border-line hover:border-line px-5 py-2 text-sm font-medium text-ink-mid hover:text-ink transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-2 focus-visible:ring-offset-surface"
className="rounded-lg border border-line hover:border-line px-5 py-2 text-sm font-medium text-ink-mid hover:text-ink transition-colors"
>
Report
</a>

View File

@ -198,10 +198,10 @@ export function ExternalConnectModal({ info, onClose }: Props) {
role="tab"
aria-selected={tab === t}
onClick={() => setTab(t)}
className={`px-3 py-2 text-sm border-b-2 -mb-px transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface ${
className={`px-3 py-2 text-sm border-b-2 -mb-px transition-colors ${
tab === t
? "border-accent text-ink"
: "border-transparent text-ink-mid hover:text-ink-mid"
: "border-transparent text-ink-soft hover:text-ink-mid"
}`}
>
{t === "claude"
@ -309,7 +309,7 @@ export function ExternalConnectModal({ info, onClose }: Props) {
<button
type="button"
onClick={onClose}
className="px-4 py-2 text-sm rounded-lg bg-surface-card hover:bg-surface-card text-ink focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface"
className="px-4 py-2 text-sm rounded-lg bg-surface-card hover:bg-surface-card text-ink"
>
I&apos;ve saved it, close
</button>
@ -335,11 +335,11 @@ function SnippetBlock({
return (
<div>
<div className="flex items-center justify-between pb-1">
<span className="text-xs text-ink-mid">{label}</span>
<span className="text-xs text-ink-soft">{label}</span>
<button
type="button"
onClick={onCopy}
className="text-xs px-2 py-1 rounded bg-accent-strong/80 hover:bg-accent text-white focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="text-xs px-2 py-1 rounded bg-accent-strong/80 hover:bg-accent text-white"
>
{copied ? "Copied!" : "Copy"}
</button>
@ -366,7 +366,7 @@ function Field({
}) {
return (
<div className="flex items-center gap-2">
<span className="text-xs text-ink-mid w-36 shrink-0">{label}</span>
<span className="text-xs text-ink-soft w-36 shrink-0">{label}</span>
<code
className={`flex-1 text-xs bg-surface border border-line rounded px-2 py-1 text-ink break-all ${mono ? "font-mono" : ""}`}
>
@ -376,7 +376,7 @@ function Field({
type="button"
onClick={onCopy}
disabled={!value}
className="text-xs px-2 py-1 rounded bg-surface-card hover:bg-surface-card text-ink disabled:opacity-40 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="text-xs px-2 py-1 rounded bg-surface-card hover:bg-surface-card text-ink disabled:opacity-40"
>
{copied ? "Copied!" : "Copy"}
</button>

View File

@ -1,236 +0,0 @@
"use client";
import { useEffect, useRef, useState } from "react";
import { createPortal } from "react-dom";
interface ShortcutGroup {
title: string;
shortcuts: Array<{ keys: string[]; description: string }>;
}
const SHORTCUT_GROUPS: ShortcutGroup[] = [
{
title: "Canvas",
shortcuts: [
{
keys: ["Esc"],
description: "Close context menu, clear selection, or deselect",
},
{
keys: ["↑↓←→"],
description: "Nudge selected node 10px; hold Shift for 50px",
},
{
keys: ["Cmd", "↑↓←→"],
description: "Resize selected node (↑↓ height, ←→ width); hold Shift for fine control (2px)",
},
{
keys: ["Enter"],
description: "Descend into selected node's first child",
},
{
keys: ["Shift", "Enter"],
description: "Ascend to selected node's parent",
},
{
keys: ["Cmd", "]"],
description: "Bring selected node forward in z-order",
},
{
keys: ["Cmd", "["],
description: "Send selected node backward in z-order",
},
{
keys: ["Z"],
description: "Zoom to fit the selected team and its sub-workspaces",
},
],
},
{
title: "Navigation",
shortcuts: [
{
keys: ["⌘K"],
description: "Open workspace search",
},
{
keys: ["Palette"],
description: "Open the template palette to deploy a new workspace",
},
{
keys: ["Dbl-click"],
description: "Zoom canvas to fit a team node and all its sub-workspaces",
},
{
keys: ["Right-click"],
description: "Open the workspace context menu",
},
],
},
{
title: "Agent",
shortcuts: [
{
keys: ["Chat"],
description: "Send a message or resume a running task",
},
{
keys: ["Config"],
description: "Edit skills, model, secrets, and runtime settings",
},
{
keys: ["Audit"],
description: "View the activity ledger for the selected workspace",
},
],
},
];
interface Props {
open: boolean;
onClose: () => void;
}
export function KeyboardShortcutsDialog({ open, onClose }: Props) {
const dialogRef = useRef<HTMLDivElement>(null);
const [mounted, setMounted] = useState(false);
useEffect(() => {
setMounted(true);
}, []);
// Move focus into the dialog when it opens (WCAG 2.1 SC 2.4.3)
useEffect(() => {
if (!open || !mounted) return;
const raf = requestAnimationFrame(() => {
dialogRef.current?.querySelector<HTMLElement>("button")?.focus();
});
return () => cancelAnimationFrame(raf);
}, [open, mounted]);
// Keyboard: Escape closes, Tab is trapped
useEffect(() => {
if (!open) return;
const handler = (e: KeyboardEvent) => {
if (e.key === "Escape") {
onClose();
return;
}
if (e.key === "Tab" && dialogRef.current) {
const focusable = Array.from(
dialogRef.current.querySelectorAll<HTMLElement>(
'button, [href], input, select, textarea, [tabindex]:not([tabindex="-1"])'
)
).filter((el) => !el.hasAttribute("disabled"));
if (focusable.length === 0) {
e.preventDefault();
return;
}
const first = focusable[0];
const last = focusable[focusable.length - 1];
if (e.shiftKey) {
if (document.activeElement === first) {
e.preventDefault();
last.focus();
}
} else {
if (document.activeElement === last) {
e.preventDefault();
first.focus();
}
}
}
};
window.addEventListener("keydown", handler);
return () => window.removeEventListener("keydown", handler);
}, [open, onClose]);
if (!open || !mounted) return null;
return createPortal(
<div className="fixed inset-0 z-[9999] flex items-center justify-center">
{/* Backdrop */}
<div
className="absolute inset-0 bg-black/60 backdrop-blur-sm cursor-pointer"
onClick={onClose}
aria-label="Close keyboard shortcuts dialog"
/>
{/* Dialog */}
<div
ref={dialogRef}
role="dialog"
aria-modal="true"
aria-labelledby="keyboard-shortcuts-title"
className="relative bg-surface border border-line rounded-xl shadow-2xl shadow-black/60 max-w-[480px] w-full mx-4 overflow-hidden max-h-[80vh] flex flex-col"
>
{/* Header */}
<div className="flex items-center justify-between px-5 py-4 border-b border-line shrink-0">
<h2
id="keyboard-shortcuts-title"
className="text-sm font-semibold text-ink"
>
Keyboard Shortcuts
</h2>
<button
type="button"
onClick={onClose}
aria-label="Close keyboard shortcuts"
className="w-7 h-7 flex items-center justify-center rounded-lg text-ink-mid hover:text-ink hover:bg-surface-sunken transition-colors focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/40"
>
×
</button>
</div>
{/* Content */}
<div className="overflow-y-auto p-5 space-y-5">
{SHORTCUT_GROUPS.map((group) => (
<div key={group.title}>
<h3 className="text-[10px] font-semibold uppercase tracking-[0.2em] text-ink-mid mb-2.5">
{group.title}
</h3>
<div className="space-y-2">
{group.shortcuts.map((shortcut, i) => (
<div
key={i}
className="flex items-center justify-between gap-4"
>
<span className="text-[13px] text-ink-mid">
{shortcut.description}
</span>
<kbd className="flex items-center gap-0.5 shrink-0">
{shortcut.keys.map((k, j) => (
<span key={j} className="flex items-center gap-0.5">
{j > 0 && (
<span className="text-[9px] text-ink-mid mx-0.5">
+
</span>
)}
<span className="inline-flex items-center rounded-md border border-line/70 bg-surface-sunken/70 px-2 py-0.5 text-[11px] font-medium text-ink tabular-nums font-mono">
{k}
</span>
</span>
))}
</kbd>
</div>
))}
</div>
</div>
))}
</div>
{/* Footer */}
<div className="px-5 py-3 border-t border-line bg-surface-sunken/30 shrink-0">
<p className="text-[10px] text-ink-mid text-center">
Press{" "}
<kbd className="inline-flex items-center rounded border border-line/70 bg-surface-sunken/70 px-1.5 py-0.5 text-[10px] font-medium text-ink font-mono">
Esc
</kbd>{" "}
to close
</p>
</div>
</div>
</div>,
document.body
);
}

View File

@ -77,7 +77,7 @@ export function Legend() {
onClick={openLegend}
aria-label="Show legend"
title="Show legend"
className={`fixed bottom-6 ${leftClass} z-30 flex items-center gap-1.5 rounded-full bg-surface-sunken/95 border border-line/50 px-3 py-1.5 text-[11px] font-semibold text-ink-mid uppercase tracking-wider shadow-xl shadow-black/30 backdrop-blur-sm hover:text-ink hover:border-line focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-2 focus-visible:ring-offset-surface transition-[left,colors] duration-200`}
className={`fixed bottom-6 ${leftClass} z-30 flex items-center gap-1.5 rounded-full bg-surface-sunken/95 border border-line/50 px-3 py-1.5 text-[11px] font-semibold text-ink-mid uppercase tracking-wider shadow-xl shadow-black/30 backdrop-blur-sm hover:text-ink hover:border-line focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/60 focus-visible:ring-offset-2 focus-visible:ring-offset-surface transition-[left,colors] duration-200`}
>
<span aria-hidden="true" className="text-[10px]"></span>
Legend
@ -86,10 +86,7 @@ export function Legend() {
}
return (
<div
data-testid="legend-panel"
className={`fixed bottom-6 ${leftClass} z-30 bg-surface-sunken/95 border border-line/50 rounded-xl px-4 py-3 shadow-xl shadow-black/30 backdrop-blur-sm max-w-[280px] transition-[left] duration-200`}
>
<div className={`fixed bottom-6 ${leftClass} z-30 bg-surface-sunken/95 border border-line/50 rounded-xl px-4 py-3 shadow-xl shadow-black/30 backdrop-blur-sm max-w-[280px] transition-[left] duration-200`}>
<div className="flex items-start justify-between mb-2">
<div className="text-[11px] font-semibold text-ink-mid uppercase tracking-wider">Legend</div>
<button
@ -100,7 +97,7 @@ export function Legend() {
// 24×24 touch target (was ~10×16, well under WCAG 2.5.5 min).
// Negative margin keeps the visual position the same as before
// — only the hit area + focus ring are larger.
className="-mt-1.5 -mr-1.5 w-6 h-6 inline-flex items-center justify-center rounded text-[14px] leading-none text-ink-mid hover:text-ink hover:bg-surface-card/40 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 transition-colors"
className="-mt-1.5 -mr-1.5 w-6 h-6 inline-flex items-center justify-center rounded text-[14px] leading-none text-ink-soft hover:text-ink hover:bg-surface-card/40 focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/60 transition-colors"
>
×
</button>
@ -108,7 +105,7 @@ export function Legend() {
{/* Status */}
<div className="mb-2">
<div className="text-[11px] text-ink-mid font-medium mb-1">Status</div>
<div className="text-[11px] text-ink-soft font-medium mb-1">Status</div>
<div className="flex flex-wrap gap-x-3 gap-y-1">
{LEGEND_STATUSES.map((s) => (
<StatusItem key={s} color={STATUS_CONFIG[s].dot} label={STATUS_CONFIG[s].label} />
@ -118,7 +115,7 @@ export function Legend() {
{/* Tiers */}
<div className="mb-2">
<div className="text-[11px] text-ink-mid font-medium mb-1">Tier</div>
<div className="text-[11px] text-ink-soft font-medium mb-1">Tier</div>
<div className="flex flex-wrap gap-x-3 gap-y-1">
{LEGEND_TIERS.map(({ tier, label }) => (
<TierItem key={tier} tier={tier} label={label} color={TIER_CONFIG[tier].border} />
@ -128,7 +125,7 @@ export function Legend() {
{/* Communication */}
<div>
<div className="text-[11px] text-ink-mid font-medium mb-1">Communication</div>
<div className="text-[11px] text-ink-soft font-medium mb-1">Communication</div>
<div className="flex flex-wrap gap-x-3 gap-y-1">
<CommItem icon="↗" color="text-cyan-400" label="A2A Out" />
<CommItem icon="↙" color="text-accent" label="A2A In" />

View File

@ -288,7 +288,7 @@ export function MemoryInspectorPanel({ workspaceId }: Props) {
if (loading && entries.length === 0 && !error && !pluginUnavailable) {
return (
<div className="flex items-center justify-center h-32">
<span className="text-xs text-ink-mid">Loading memories</span>
<span className="text-xs text-ink-soft">Loading memories</span>
</div>
);
}
@ -311,7 +311,7 @@ export function MemoryInspectorPanel({ workspaceId }: Props) {
{/* Namespace dropdown */}
<div className="px-4 pt-3 pb-2 border-b border-line/40 shrink-0 space-y-2">
<div className="flex items-center gap-2">
<label htmlFor="namespace-dropdown" className="text-[10px] text-ink-mid shrink-0">
<label htmlFor="namespace-dropdown" className="text-[10px] text-ink-soft shrink-0">
Namespace:
</label>
<select
@ -337,7 +337,7 @@ export function MemoryInspectorPanel({ workspaceId }: Props) {
height="12"
viewBox="0 0 16 16"
fill="none"
className="absolute left-2.5 text-ink-mid pointer-events-none shrink-0"
className="absolute left-2.5 text-ink-soft pointer-events-none shrink-0"
aria-hidden="true"
>
<circle cx="7" cy="7" r="4.5" stroke="currentColor" strokeWidth="1.5" />
@ -360,7 +360,7 @@ export function MemoryInspectorPanel({ workspaceId }: Props) {
setDebouncedQuery('');
}}
aria-label="Clear search"
className="absolute right-2 text-ink-mid hover:text-ink transition-colors text-sm leading-none focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="absolute right-2 text-ink-soft hover:text-ink transition-colors text-sm leading-none"
>
×
</button>
@ -370,7 +370,7 @@ export function MemoryInspectorPanel({ workspaceId }: Props) {
{/* Toolbar */}
<div className="px-4 py-2.5 border-b border-line/40 flex items-center justify-between shrink-0">
<span className="text-[11px] text-ink-mid">
<span className="text-[11px] text-ink-soft">
{debouncedQuery
? `${entries.length} result${entries.length !== 1 ? 's' : ''}`
: entries.length === 1
@ -381,7 +381,7 @@ export function MemoryInspectorPanel({ workspaceId }: Props) {
type="button"
onClick={loadEntries}
disabled={pluginUnavailable}
className="px-2 py-1 text-[11px] bg-surface-card hover:bg-surface-card text-ink-mid rounded transition-colors disabled:opacity-50 disabled:cursor-not-allowed focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="px-2 py-1 text-[11px] bg-surface-card hover:bg-surface-card text-ink-mid rounded transition-colors disabled:opacity-50 disabled:cursor-not-allowed"
aria-label="Refresh memories"
>
Refresh
@ -446,11 +446,11 @@ function EmptyState({
// mirror it so the operator sees both signals.
return (
<div className="flex flex-col items-center justify-center py-16 gap-3 text-center">
<span className="text-4xl text-ink-mid" aria-hidden="true">
<span className="text-4xl text-ink-soft" aria-hidden="true">
</span>
<p className="text-sm font-medium text-ink-mid">Memory plugin disabled</p>
<p className="text-[11px] text-ink-mid max-w-[220px] leading-relaxed">
<p className="text-[11px] text-ink-soft max-w-[220px] leading-relaxed">
See banner above for the operator-side fix.
</p>
</div>
@ -459,11 +459,11 @@ function EmptyState({
if (query) {
return (
<div className="flex flex-col items-center justify-center py-16 gap-3 text-center">
<span className="text-4xl text-ink-mid" aria-hidden="true">
<span className="text-4xl text-ink-soft" aria-hidden="true">
</span>
<p className="text-sm font-medium text-ink-mid">No memories match your search</p>
<p className="text-[11px] text-ink-mid max-w-[200px] leading-relaxed">
<p className="text-[11px] text-ink-soft max-w-[200px] leading-relaxed">
Try a different query or clear the search.
</p>
</div>
@ -471,11 +471,11 @@ function EmptyState({
}
return (
<div className="flex flex-col items-center justify-center py-16 gap-3 text-center">
<span className="text-4xl text-ink-mid" aria-hidden="true">
<span className="text-4xl text-ink-soft" aria-hidden="true">
</span>
<p className="text-sm font-medium text-ink-mid">No memories yet</p>
<p className="text-[11px] text-ink-mid max-w-[220px] leading-relaxed">
<p className="text-[11px] text-ink-soft max-w-[220px] leading-relaxed">
Agents commit memories via MCP tools (commit_memory, commit_summary). They
appear here once written.
</p>
@ -515,7 +515,7 @@ function MemoryEntryRow({ entry, onDelete }: MemoryEntryRowProps) {
{/* Header row */}
<button
type="button"
className="w-full flex items-center gap-2 px-3 py-2.5 text-left hover:bg-surface-card/30 transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="w-full flex items-center gap-2 px-3 py-2.5 text-left hover:bg-surface-card/30 transition-colors"
onClick={() => setExpanded((prev) => !prev)}
aria-expanded={expanded}
aria-controls={bodyId}
@ -558,7 +558,7 @@ function MemoryEntryRow({ entry, onDelete }: MemoryEntryRowProps) {
{/* Namespace tag */}
<span
className="text-[9px] shrink-0 font-mono text-ink-mid truncate max-w-[100px]"
className="text-[9px] shrink-0 font-mono text-ink-soft truncate max-w-[100px]"
title={entry.namespace}
>
{entry.namespace}
@ -598,10 +598,10 @@ function MemoryEntryRow({ entry, onDelete }: MemoryEntryRowProps) {
)}
<span className="text-[9px] text-ink-mid shrink-0">
<span className="text-[9px] text-ink-soft shrink-0">
{formatRelativeTime(entry.created_at)}
</span>
<span className="text-[9px] text-ink-mid shrink-0" aria-hidden="true">
<span className="text-[9px] text-ink-soft shrink-0" aria-hidden="true">
{expanded ? '▼' : '▶'}
</span>
</button>
@ -618,7 +618,7 @@ function MemoryEntryRow({ entry, onDelete }: MemoryEntryRowProps) {
{entry.content}
</pre>
<div className="flex items-center justify-between gap-2">
<span className="text-[9px] text-ink-mid">
<span className="text-[9px] text-ink-soft">
Created: {new Date(entry.created_at).toLocaleString()}
{entry.expires_at && ` · Expires: ${new Date(entry.expires_at).toLocaleString()}`}
</span>
@ -629,7 +629,7 @@ function MemoryEntryRow({ entry, onDelete }: MemoryEntryRowProps) {
onDelete();
}}
aria-label="Forget memory"
className="text-[10px] px-2 py-0.5 bg-red-950/40 hover:bg-red-900/50 border border-red-900/30 rounded text-bad transition-colors shrink-0 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-red-400 focus-visible:ring-offset-1"
className="text-[10px] px-2 py-0.5 bg-red-950/40 hover:bg-red-900/50 border border-red-900/30 rounded text-bad transition-colors shrink-0"
>
Forget
</button>

View File

@ -421,7 +421,7 @@ function ProviderPickerModal({
<div className="text-[11px] text-ink-mid font-medium">
{getKeyLabel(entry.key)}
</div>
<div className="text-[9px] font-mono text-ink-mid">{entry.key}</div>
<div className="text-[9px] font-mono text-ink-soft">{entry.key}</div>
</div>
{entry.saved && (
<span className="text-[9px] text-good bg-emerald-900/30 px-1.5 py-0.5 rounded flex items-center gap-1">
@ -675,7 +675,7 @@ function AllKeysModal({
<div className="text-[11px] text-ink-mid font-medium">
{getKeyLabel(entry.key)}
</div>
<div className="text-[9px] font-mono text-ink-mid">{entry.key}</div>
<div className="text-[9px] font-mono text-ink-soft">{entry.key}</div>
</div>
{entry.saved && (
<span className="text-[9px] text-good bg-emerald-900/30 px-1.5 py-0.5 rounded flex items-center gap-1">
@ -706,7 +706,7 @@ function AllKeysModal({
type="button"
onClick={() => handleSaveKey(index)}
disabled={!entry.value.trim() || entry.saving}
className="px-3 py-1.5 bg-accent-strong hover:bg-accent text-[11px] rounded text-white disabled:opacity-30 transition-colors shrink-0 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="px-3 py-1.5 bg-accent-strong hover:bg-accent text-[11px] rounded text-white disabled:opacity-30 transition-colors shrink-0"
>
{entry.saving ? "..." : "Save"}
</button>
@ -730,7 +730,7 @@ function AllKeysModal({
<button
type="button"
onClick={onOpenSettings}
className="text-[11px] text-accent hover:text-accent transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="text-[11px] text-accent hover:text-accent transition-colors"
>
Open Settings Panel
</button>
@ -740,7 +740,7 @@ function AllKeysModal({
<button
type="button"
onClick={onCancel}
className="px-3.5 py-1.5 text-[12px] text-ink-mid hover:text-ink bg-surface-card hover:bg-surface-card border border-line rounded-lg transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="px-3.5 py-1.5 text-[12px] text-ink-mid hover:text-ink bg-surface-card hover:bg-surface-card border border-line rounded-lg transition-colors"
>
Cancel Deploy
</button>
@ -748,7 +748,7 @@ function AllKeysModal({
type="button"
onClick={handleAddKeysAndDeploy}
disabled={!allSaved || anySaving}
className="px-3.5 py-1.5 text-[12px] bg-accent-strong hover:bg-accent text-white rounded-lg transition-colors disabled:opacity-40 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="px-3.5 py-1.5 text-[12px] bg-accent-strong hover:bg-accent text-white rounded-lg transition-colors disabled:opacity-40"
>
{anySaving ? "Saving..." : allSaved ? "Deploy" : "Add Keys"}
</button>

View File

@ -210,7 +210,7 @@ export function OnboardingWizard() {
// Was hover:bg-surface-card on top of bg-surface-card —
// silent no-op hover. Lift to surface-elevated, matching
// the Cancel pattern in ConfirmDialog.
className="px-3 py-1.5 bg-surface-card hover:bg-surface-elevated hover:text-ink rounded-lg text-[11px] text-ink-mid transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="px-3 py-1.5 bg-surface-card hover:bg-surface-elevated hover:text-ink rounded-lg text-[11px] text-ink-mid transition-colors focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/40 focus-visible:ring-offset-2 focus-visible:ring-offset-surface-sunken"
>
Next
</button>

View File

@ -247,7 +247,7 @@ export function OrgImportPreflightModal({
<h2 id="org-preflight-title" className="text-sm font-semibold text-ink">
Deploy {orgName}
</h2>
<p className="mt-0.5 text-[11px] text-ink-mid">
<p className="mt-0.5 text-[11px] text-ink-soft">
{workspaceCount} workspace{workspaceCount === 1 ? "" : "s"}.
Review the credentials needed before import.
</p>
@ -308,7 +308,7 @@ export function OrgImportPreflightModal({
type="button"
onClick={onProceed}
disabled={!canProceed}
className="px-4 py-1.5 text-[11px] font-semibold rounded bg-accent hover:bg-accent-strong text-white disabled:bg-surface-card disabled:text-white-soft disabled:cursor-not-allowed focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="px-4 py-1.5 text-[11px] font-semibold rounded bg-accent hover:bg-accent-strong text-white disabled:bg-surface-card disabled:text-white-soft disabled:cursor-not-allowed"
>
Import
</button>
@ -400,7 +400,7 @@ function StrictEnvRow({
<li className="flex items-center gap-2 rounded bg-surface-sunken/70 border border-line px-2 py-1.5">
<code
className={`text-[11px] font-mono flex-1 ${
configured ? "text-ink-mid line-through" : "text-ink"
configured ? "text-ink-soft line-through" : "text-ink"
}`}
>
{envKey}
@ -428,7 +428,7 @@ function StrictEnvRow({
type="button"
onClick={() => onSave(envKey)}
disabled={d?.saving || !d?.value.trim()}
className="px-2 py-1 text-[10px] rounded bg-accent hover:bg-accent-strong text-white disabled:opacity-40 disabled:cursor-not-allowed focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="px-2 py-1 text-[10px] rounded bg-accent hover:bg-accent-strong text-white disabled:opacity-40 disabled:cursor-not-allowed"
>
{d?.saving ? "…" : "Save"}
</button>
@ -492,7 +492,7 @@ function AnyOfEnvGroup({
>
<code
className={`text-[11px] font-mono flex-1 ${
isConfigured ? "text-ink-mid line-through" : "text-ink"
isConfigured ? "text-ink-soft line-through" : "text-ink"
}`}
>
{m}
@ -520,7 +520,7 @@ function AnyOfEnvGroup({
type="button"
onClick={() => onSave(m)}
disabled={d?.saving || !d?.value.trim()}
className="px-2 py-1 text-[10px] rounded bg-accent hover:bg-accent-strong text-white disabled:opacity-40 disabled:cursor-not-allowed focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="px-2 py-1 text-[10px] rounded bg-accent hover:bg-accent-strong text-white disabled:opacity-40 disabled:cursor-not-allowed"
>
{d?.saving ? "…" : "Save"}
</button>

View File

@ -128,9 +128,9 @@ function PlanCard({
type="button"
onClick={onSelect}
disabled={loading}
className={`mt-6 rounded-lg px-4 py-3 text-sm font-medium focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-2 focus-visible:ring-offset-surface ${
className={`mt-6 rounded-lg px-4 py-3 text-sm font-medium ${
plan.highlighted
? "bg-accent-strong text-white hover:bg-accent disabled:bg-zinc-700 disabled:text-zinc-500"
? "bg-accent-strong text-white hover:bg-accent disabled:bg-blue-900"
: "border border-line bg-surface-sunken text-ink hover:bg-surface-card disabled:opacity-50"
}`}
>

View File

@ -356,7 +356,7 @@ export function ProviderModelSelector({
<div>
<label
htmlFor={providerSelectId}
className="text-[10px] uppercase tracking-wide text-ink-mid font-semibold mb-1.5 block"
className="text-[10px] uppercase tracking-wide text-ink-soft font-semibold mb-1.5 block"
>
Provider <span aria-hidden="true" className="text-bad">*</span>
<span className="sr-only"> (required)</span>
@ -382,13 +382,13 @@ export function ProviderModelSelector({
{selected?.tooltip && (
<p
id={`${providerSelectId}-help`}
className="text-[9px] text-ink-mid mt-1 leading-relaxed"
className="text-[9px] text-ink-soft mt-1 leading-relaxed"
>
{selected.tooltip}
</p>
)}
{selected && selected.envVars.length > 0 && (
<p className="text-[9px] text-ink-mid mt-0.5 font-mono">
<p className="text-[9px] text-ink-soft mt-0.5 font-mono">
requires: {selected.envVars.join(", ")}
</p>
)}
@ -397,7 +397,7 @@ export function ProviderModelSelector({
<div>
<label
htmlFor={modelSelectId}
className="text-[10px] uppercase tracking-wide text-ink-mid font-semibold mb-1.5 block"
className="text-[10px] uppercase tracking-wide text-ink-soft font-semibold mb-1.5 block"
>
Model <span aria-hidden="true" className="text-bad">*</span>
<span className="sr-only"> (required)</span>
@ -422,7 +422,7 @@ export function ProviderModelSelector({
data-testid="model-input"
className="w-full bg-surface-sunken border border-line rounded px-2 py-1.5 text-[11px] text-ink font-mono focus:outline-none focus:border-accent focus:ring-1 focus:ring-accent/20 transition-colors disabled:opacity-50"
/>
<p className="text-[9px] text-ink-mid mt-1 leading-relaxed">
<p className="text-[9px] text-ink-soft mt-1 leading-relaxed">
{selected?.wildcard
? wildcardHelpText(selected)
: "Free-text model id. Make sure the provider can resolve it."}
@ -437,7 +437,7 @@ export function ProviderModelSelector({
handleModelChange(selected.models[0]?.id ?? "");
}
}}
className="text-[9px] text-accent hover:text-accent mt-0.5 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="text-[9px] text-accent hover:text-accent mt-0.5"
>
back to model list
</button>

View File

@ -321,7 +321,7 @@ export function ProvisioningTimeout({
onClick={() => handleDismiss(entry.workspaceId)}
aria-label="Dismiss provisioning timeout warning"
title="Dismiss — keep this workspace running without the warning"
className="shrink-0 text-warm/60 hover:text-amber-200 transition-colors -mr-1 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-amber-400 focus-visible:ring-offset-1 focus-visible:ring-offset-amber-950"
className="shrink-0 text-warm/60 hover:text-amber-200 transition-colors -mr-1"
>
<svg width="14" height="14" viewBox="0 0 16 16" fill="none" aria-hidden="true">
<path d="M4 4l8 8M12 4l-8 8" stroke="currentColor" strokeWidth="1.6" strokeLinecap="round" />
@ -341,7 +341,7 @@ export function ProvisioningTimeout({
type="button"
onClick={() => handleRetry(entry.workspaceId)}
disabled={isRetrying || isCancelling || retryCooldown.has(entry.workspaceId)}
className="px-3 py-1.5 bg-amber-600 hover:bg-amber-500 text-[11px] font-medium rounded-lg text-white disabled:opacity-40 transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-amber-400 focus-visible:ring-offset-1 focus-visible:ring-offset-amber-950"
className="px-3 py-1.5 bg-amber-600 hover:bg-amber-500 text-[11px] font-medium rounded-lg text-white disabled:opacity-40 transition-colors"
>
{isRetrying ? "Retrying..." : retryCooldown.has(entry.workspaceId) ? "Wait..." : "Retry"}
</button>
@ -349,14 +349,14 @@ export function ProvisioningTimeout({
type="button"
onClick={() => handleCancelRequest(entry.workspaceId)}
disabled={isRetrying || isCancelling}
className="px-3 py-1.5 bg-surface-card hover:bg-surface-card text-[11px] text-ink-mid rounded-lg border border-line disabled:opacity-40 transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-amber-950"
className="px-3 py-1.5 bg-surface-card hover:bg-surface-card text-[11px] text-ink-mid rounded-lg border border-line disabled:opacity-40 transition-colors"
>
{isCancelling ? "Cancelling..." : "Cancel"}
</button>
<button
type="button"
onClick={() => handleViewLogs(entry.workspaceId)}
className="px-3 py-1.5 text-[11px] text-warm hover:text-warm transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-amber-400 focus-visible:ring-offset-1 focus-visible:ring-offset-amber-950"
className="px-3 py-1.5 text-[11px] text-warm hover:text-warm transition-colors"
>
View Logs
</button>
@ -382,14 +382,14 @@ export function ProvisioningTimeout({
<button
type="button"
onClick={() => setConfirmingCancel(null)}
className="px-3.5 py-1.5 text-[12px] text-ink-mid hover:text-ink bg-surface-card hover:bg-surface-card border border-line rounded-lg transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="px-3.5 py-1.5 text-[12px] text-ink-mid hover:text-ink bg-surface-card hover:bg-surface-card border border-line rounded-lg transition-colors"
>
Keep
</button>
<button
type="button"
onClick={handleCancelConfirm}
className="px-3.5 py-1.5 text-[12px] bg-red-600 hover:bg-red-500 text-white rounded-lg transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-red-400 focus-visible:ring-offset-1"
className="px-3.5 py-1.5 text-[12px] bg-red-600 hover:bg-red-500 text-white rounded-lg transition-colors"
>
Remove Workspace
</button>

View File

@ -34,8 +34,6 @@ function readPurchaseParams(): { open: boolean; item: string | null } {
function stripPurchaseParams() {
if (typeof window === "undefined") return;
const url = new URL(window.location.href);
// Skip if there are no params to strip.
if (!url.searchParams.has("purchase_success") && !url.searchParams.has("item")) return;
url.searchParams.delete("purchase_success");
url.searchParams.delete("item");
// replaceState (not pushState) so back-button doesn't return to the
@ -159,7 +157,7 @@ export function PurchaseSuccessModal() {
</div>
<div className="flex items-center justify-between gap-3 px-6 py-3 border-t border-line bg-surface/50">
<span className="font-mono text-[10.5px] uppercase tracking-[0.12em] text-ink-mid">
<span className="font-mono text-[10.5px] uppercase tracking-[0.12em] text-ink-soft">
auto-dismiss · {AUTO_DISMISS_MS / 1000}s
</span>
<button

View File

@ -104,7 +104,7 @@ export function SearchDialog() {
>
{/* Search input */}
<div className="flex items-center gap-3 px-4 py-3 border-b border-line/40">
<svg width="16" height="16" viewBox="0 0 16 16" fill="none" className="shrink-0 text-ink-mid" aria-hidden="true">
<svg width="16" height="16" viewBox="0 0 16 16" fill="none" className="shrink-0 text-ink-soft" aria-hidden="true">
<circle cx="7" cy="7" r="5.5" stroke="currentColor" strokeWidth="1.5" />
<path d="M11 11l3.5 3.5" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" />
</svg>
@ -144,10 +144,8 @@ export function SearchDialog() {
id={`search-result-${node.id}`}
role="option"
aria-selected={index === focusedIndex}
tabIndex={index === focusedIndex ? 0 : -1}
onClick={() => handleSelect(node.id)}
onFocus={() => { setFocusedIndex(index); inputRef.current?.focus(); }}
className={`w-full px-4 py-2.5 flex items-center gap-3 text-left transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface ${
className={`w-full px-4 py-2.5 flex items-center gap-3 text-left transition-colors ${
index === focusedIndex ? "bg-surface-card/60" : "hover:bg-surface-card/40"
}`}
>
@ -158,7 +156,7 @@ export function SearchDialog() {
<div className="min-w-0 flex-1">
<div className="text-sm text-ink truncate">{node.data.name}</div>
{node.data.role && (
<div className="text-[10px] text-ink-mid truncate">{node.data.role}</div>
<div className="text-[10px] text-ink-soft truncate">{node.data.role}</div>
)}
</div>
<span

View File

@ -63,21 +63,9 @@ export function SidePanel() {
? parsed
: SIDEPANEL_DEFAULT_WIDTH;
});
// On mobile (< 640px viewport) the configured width exceeds the screen,
// so the panel renders off-canvas-left. Force full-viewport width and
// disable resize on small screens; restore configured width on desktop.
const [isMobile, setIsMobile] = useState(false);
useEffect(() => {
if (typeof window === "undefined" || !window.matchMedia) return;
const mq = window.matchMedia("(max-width: 639px)");
const update = () => setIsMobile(mq.matches);
update();
mq.addEventListener("change", update);
return () => mq.removeEventListener("change", update);
}, []);
useEffect(() => {
setSidePanelWidth(isMobile ? 0 : width);
}, [width, isMobile, setSidePanelWidth]);
setSidePanelWidth(width);
}, [width, setSidePanelWidth]);
const widthRef = useRef(width); // tracks live drag value for the mouseup handler
const dragging = useRef(false);
const startX = useRef(0);
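
The effect removed in this hunk is a standard matchMedia subscription: SSR guard, initial sync, change listener with cleanup. Restated as a hypothetical reusable hook (the hook name and signature are illustrative, not from this codebase):

import { useEffect, useState } from "react";

// Track a media query with an SSR guard and proper listener cleanup.
function useMediaQuery(query: string): boolean {
  const [matches, setMatches] = useState(false);
  useEffect(() => {
    if (typeof window === "undefined" || !window.matchMedia) return;
    const mq = window.matchMedia(query);
    const update = () => setMatches(mq.matches);
    update(); // sync the initial value; useState(false) may be wrong on mount
    mq.addEventListener("change", update);
    return () => mq.removeEventListener("change", update);
  }, [query]);
  return matches;
}

// Usage matching the removed logic:
// const isMobile = useMediaQuery("(max-width: 639px)");
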
@ -149,28 +137,24 @@ export function SidePanel() {
return (
<div
className={`fixed top-0 right-0 h-full bg-surface/95 backdrop-blur-xl border-line/50 flex flex-col z-50 shadow-2xl shadow-black/50 animate-in slide-in-from-right duration-200 ${
isMobile ? "left-0 w-screen" : "border-l"
}`}
style={isMobile ? undefined : { width }}
className="fixed top-0 right-0 h-full bg-surface/95 backdrop-blur-xl border-l border-line/50 flex flex-col z-50 shadow-2xl shadow-black/50 animate-in slide-in-from-right duration-200"
style={{ width }}
>
{/* Resize handle — desktop only (no point resizing a full-screen mobile panel) */}
{!isMobile && (
<div
role="separator"
aria-label="Resize workspace panel"
aria-valuenow={width}
aria-valuemin={SIDEPANEL_MIN_WIDTH}
aria-valuemax={SIDEPANEL_MAX_WIDTH}
aria-orientation="vertical"
tabIndex={0}
onMouseDown={onMouseDown}
onKeyDown={onResizeKeyDown}
className="absolute left-0 top-0 bottom-0 w-1.5 cursor-col-resize hover:bg-accent/30 active:bg-accent/50 transition-colors z-10 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-inset"
/>
)}
{/* Resize handle */}
<div
role="separator"
aria-label="Resize workspace panel"
aria-valuenow={width}
aria-valuemin={SIDEPANEL_MIN_WIDTH}
aria-valuemax={SIDEPANEL_MAX_WIDTH}
aria-orientation="vertical"
tabIndex={0}
onMouseDown={onMouseDown}
onKeyDown={onResizeKeyDown}
className="absolute left-0 top-0 bottom-0 w-1.5 cursor-col-resize hover:bg-accent/30 active:bg-accent/50 transition-colors z-10 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-inset"
/>
{/* Header */}
<div className="flex items-center justify-between px-4 sm:px-5 py-4 border-b border-line/40 bg-surface-sunken/30">
<div className="flex items-center justify-between px-5 py-4 border-b border-line/40 bg-surface-sunken/30">
<div className="flex items-center gap-3 min-w-0">
<div className="relative">
<StatusDot status={node.data.status} size="md" />
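
The handle follows the ARIA separator pattern: aria-valuenow/-min/-max advertise the current width and its bounds, and tabIndex plus onKeyDown make it keyboard-operable. The body of onResizeKeyDown sits outside this hunk, so the following is only a plausible sketch of such a handler (every name below is assumed), clamping arrow-key steps to the bounds:

import type { KeyboardEvent } from "react";

// Assumed shape; the real handler's body is not shown in this diff.
function makeResizeKeyDown(
  width: number,
  setWidth: (w: number) => void,
  min: number, // e.g. SIDEPANEL_MIN_WIDTH
  max: number, // e.g. SIDEPANEL_MAX_WIDTH
  step = 16,   // px per keypress; arbitrary choice
) {
  return (e: KeyboardEvent) => {
    // The panel is anchored to the right edge, so ArrowLeft widens it.
    const delta =
      e.key === "ArrowLeft" ? step : e.key === "ArrowRight" ? -step : 0;
    if (delta === 0) return;
    e.preventDefault();
    setWidth(Math.min(max, Math.max(min, width + delta)));
  };
}
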
@ -181,12 +165,12 @@ export function SidePanel() {
</h2>
<div className="flex items-center gap-2 mt-0.5">
{node.data.role && (
<span className="text-[10px] text-ink-mid truncate">
<span className="text-[10px] text-ink-soft truncate">
{node.data.role}
</span>
)}
<span className={`text-[9px] px-1.5 py-0.5 rounded-md font-mono ${
isOnline ? "text-good bg-emerald-950/30" : "text-ink-mid bg-surface-card/50"
isOnline ? "text-good bg-emerald-950/30" : "text-ink-soft bg-surface-card/50"
}`}>
T{node.data.tier}
</span>
@ -197,7 +181,7 @@ export function SidePanel() {
type="button"
onClick={() => selectNode(null)}
aria-label="Close workspace panel"
className="w-7 h-7 flex items-center justify-center rounded-lg text-ink-mid hover:text-ink hover:bg-surface-card/60 transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="w-7 h-7 flex items-center justify-center rounded-lg text-ink-soft hover:text-ink hover:bg-surface-card/60 transition-colors"
>
<svg width="12" height="12" viewBox="0 0 12 12" fill="none" aria-hidden="true">
<path d="M1 1l10 10M11 1L1 11" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" />
@ -206,7 +190,7 @@ export function SidePanel() {
</div>
{/* Capability summary */}
<div className="px-4 sm:px-5 py-3 border-b border-line/40 bg-surface-sunken/20">
<div className="px-5 py-3 border-b border-line/40 bg-surface-sunken/20">
<div className="flex flex-wrap gap-2">
<MetaPill label="Tier" value={`T${node.data.tier}`} />
<MetaPill label="Runtime" value={capability.runtime || "unknown"} />
@ -268,7 +252,7 @@ export function SidePanel() {
onClick={() => {
useCanvasStore.getState().restartWorkspace(selectedNodeId).catch(() => showToast("Restart failed", "error"));
}}
className="text-[11px] px-2 py-1 bg-sky-800/40 hover:bg-sky-700/50 text-sky-200 rounded transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="text-[11px] px-2 py-1 bg-sky-800/40 hover:bg-sky-700/50 text-sky-200 rounded transition-colors"
>
Restart Now
</button>
@ -311,8 +295,8 @@ export function SidePanel() {
</div>
{/* Footer — workspace ID */}
<div className="px-4 sm:px-5 py-2 border-t border-line/40 bg-surface-sunken/20">
<span className="text-[9px] font-mono text-ink-mid select-all block truncate">
<div className="px-5 py-2 border-t border-line/40 bg-surface-sunken/20">
<span className="text-[9px] font-mono text-ink-soft select-all">
{selectedNodeId}
</span>
</div>

View File

@ -236,7 +236,7 @@ export function OrgTemplatesSection() {
onClick={() => setExpanded((v) => !v)}
aria-expanded={expanded}
aria-controls="org-templates-body"
className="flex items-center gap-1.5 text-[10px] uppercase tracking-wide text-ink-mid hover:text-ink-mid font-semibold transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="flex items-center gap-1.5 text-[10px] uppercase tracking-wide text-ink-soft hover:text-ink-mid font-semibold transition-colors"
>
<span
aria-hidden="true"
@ -246,7 +246,7 @@ export function OrgTemplatesSection() {
</span>
Org Templates
{orgs.length > 0 && (
<span className="text-ink-mid normal-case tracking-normal">
<span className="text-ink-soft normal-case tracking-normal">
({orgs.length})
</span>
)}
@ -255,7 +255,7 @@ export function OrgTemplatesSection() {
type="button"
onClick={loadOrgs}
aria-label="Refresh org templates"
className="text-[10px] text-ink-mid hover:text-ink-mid focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="text-[10px] text-ink-soft hover:text-ink-mid"
>
</button>
@ -264,14 +264,14 @@ export function OrgTemplatesSection() {
{expanded && (
<div id="org-templates-body" className="space-y-2">
{loading && (
<div role="status" aria-live="polite" className="flex items-center gap-1.5 text-[10px] text-ink-mid">
<div role="status" aria-live="polite" className="flex items-center gap-1.5 text-[10px] text-ink-soft">
<Spinner size="sm" />
Loading
</div>
)}
{!loading && orgs.length === 0 && (
<div className="text-[10px] text-ink-mid">
<div className="text-[10px] text-ink-soft">
No org templates in <code>org-templates/</code>
</div>
)}
@ -298,7 +298,7 @@ export function OrgTemplatesSection() {
</span>
</div>
{o.description && (
<p className="text-[10px] text-ink-mid mb-2.5 line-clamp-2 leading-relaxed">
<p className="text-[10px] text-ink-soft mb-2.5 line-clamp-2 leading-relaxed">
{o.description}
</p>
)}
@ -306,7 +306,7 @@ export function OrgTemplatesSection() {
type="button"
onClick={() => handleImport(o)}
disabled={isImporting}
className="w-full px-2 py-1.5 bg-accent-strong/20 hover:bg-accent-strong/30 border border-accent/30 rounded-lg text-[10px] text-accent font-medium transition-colors disabled:opacity-50 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="w-full px-2 py-1.5 bg-accent-strong/20 hover:bg-accent-strong/30 border border-accent/30 rounded-lg text-[10px] text-accent font-medium transition-colors disabled:opacity-50"
>
{isImporting ? "Importing…" : "Import org"}
</button>
@ -411,7 +411,7 @@ function ImportAgentButton({ onImported }: { onImported: () => void }) {
type="button"
onClick={() => fileInputRef.current?.click()}
disabled={importing}
className="w-full px-3 py-2 bg-accent-strong/20 hover:bg-accent-strong/30 border border-accent/30 rounded-lg text-[11px] text-accent font-medium transition-colors disabled:opacity-50 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="w-full px-3 py-2 bg-accent-strong/20 hover:bg-accent-strong/30 border border-accent/30 rounded-lg text-[11px] text-accent font-medium transition-colors disabled:opacity-50"
>
{importing ? "Importing..." : "Import Agent Folder"}
</button>
@ -474,7 +474,7 @@ export function TemplatePalette() {
<button
type="button"
onClick={() => setOpen(!open)}
className={`fixed top-4 left-4 z-40 w-9 h-9 flex items-center justify-center rounded-lg transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 ${
className={`fixed top-4 left-4 z-40 w-9 h-9 flex items-center justify-center rounded-lg transition-colors ${
open
? "bg-accent-strong text-white"
: "bg-surface-sunken/90 border border-line/50 text-ink-mid hover:text-ink hover:border-line"
@ -499,7 +499,7 @@ export function TemplatePalette() {
<div className="fixed top-0 left-0 h-full w-[280px] bg-surface-sunken/95 backdrop-blur-md border-r border-line/60 z-30 flex flex-col shadow-2xl shadow-black/40">
<div className="px-4 pt-14 pb-3 border-b border-line/60">
<h2 className="text-sm font-semibold text-ink">Templates</h2>
<p className="text-[10px] text-ink-mid mt-0.5">Click to deploy a workspace</p>
<p className="text-[10px] text-ink-soft mt-0.5">Click to deploy a workspace</p>
</div>
<div className="flex-1 overflow-y-auto p-3 space-y-2">
@ -509,14 +509,14 @@ export function TemplatePalette() {
<OrgTemplatesSection />
{loading && (
<div role="status" aria-live="polite" className="flex items-center justify-center gap-2 text-xs text-ink-mid text-center py-8">
<div role="status" aria-live="polite" className="flex items-center justify-center gap-2 text-xs text-ink-soft text-center py-8">
<Spinner />
Loading
</div>
)}
{!loading && templates.length === 0 && (
<div role="status" aria-live="polite" className="text-xs text-ink-mid text-center py-8">
<div role="status" aria-live="polite" className="text-xs text-ink-soft text-center py-8">
No templates found in<br />workspace-configs-templates/
</div>
)}
@ -549,7 +549,7 @@ export function TemplatePalette() {
</div>
{t.description && (
<p className="text-[10px] text-ink-mid mb-2 line-clamp-2 leading-relaxed">
<p className="text-[10px] text-ink-soft mb-2 line-clamp-2 leading-relaxed">
{t.description}
</p>
)}
@ -562,7 +562,7 @@ export function TemplatePalette() {
</span>
))}
{t.skills.length > 3 && (
<span className="text-[8px] text-ink-mid">+{t.skills.length - 3}</span>
<span className="text-[8px] text-ink-soft">+{t.skills.length - 3}</span>
)}
</div>
)}
@ -580,7 +580,7 @@ export function TemplatePalette() {
<button
type="button"
onClick={loadTemplates}
className="text-[10px] text-ink-mid hover:text-ink-mid transition-colors block focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="text-[10px] text-ink-soft hover:text-ink-mid transition-colors block"
>
Refresh templates
</button>

View File

@ -124,7 +124,7 @@ export function TermsGate({ children }: { children: React.ReactNode }) {
</a>
. Click agree to continue.
</p>
<p className="mt-3 text-xs text-ink-mid">
<p className="mt-3 text-xs text-ink-soft">
By agreeing you acknowledge that workspace data is stored in AWS us-east-2 (Ohio, United States).
</p>
</div>
@ -138,7 +138,7 @@ export function TermsGate({ children }: { children: React.ReactNode }) {
// Hover goes DARKER, not lighter — emerald-500 on white
// text drops contrast below AA vs emerald-700. Same trap
// I fixed in ApprovalBanner + ConfirmDialog.
className="rounded bg-emerald-600 hover:bg-emerald-700 px-4 py-2 text-sm font-medium text-white disabled:opacity-50 transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-emerald-400 focus-visible:ring-offset-2 focus-visible:ring-offset-surface-sunken"
className="rounded bg-emerald-600 hover:bg-emerald-700 px-4 py-2 text-sm font-medium text-white disabled:opacity-50 transition-colors focus:outline-none focus-visible:ring-2 focus-visible:ring-emerald-400/70 focus-visible:ring-offset-2 focus-visible:ring-offset-surface-sunken"
>
{submitting ? "Saving…" : "I agree"}
</button>
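
The comment above asserts specific contrast facts; they can be spot-checked with the WCAG relative-luminance formula. A self-contained sketch, assuming Tailwind's default emerald hex values (#10b981 / #059669 / #047857, an assumption about this codebase's palette); the ratios in the comments are rounded arithmetic, not measured values:

function srgbToLinear(c: number): number {
  const s = c / 255;
  return s <= 0.03928 ? s / 12.92 : ((s + 0.055) / 1.055) ** 2.4;
}

function luminance(hex: string): number {
  // hex is "#rrggbb"; slice out each channel.
  const [r, g, b] = [1, 3, 5].map((i) =>
    srgbToLinear(parseInt(hex.slice(i, i + 2), 16)),
  );
  return 0.2126 * r + 0.7152 * g + 0.0722 * b;
}

function contrast(a: string, b: string): number {
  const [hi, lo] = [luminance(a), luminance(b)].sort((x, y) => y - x);
  return (hi + 0.05) / (lo + 0.05);
}

// White text over each candidate hover background:
console.log(contrast("#ffffff", "#10b981")); // emerald-500 ≈ 2.5:1, fails AA (4.5:1)
console.log(contrast("#ffffff", "#059669")); // emerald-600 ≈ 3.8:1, large text only
console.log(contrast("#ffffff", "#047857")); // emerald-700 ≈ 5.5:1, passes AA
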

View File

@ -1,7 +1,6 @@
"use client";
import { useTheme, type ThemePreference } from "@/lib/theme-provider";
import { useCallback } from "react";
const OPTIONS: { value: ThemePreference; label: string; icon: string }[] = [
// Sun: explicit light
@ -34,47 +33,17 @@ const OPTIONS: { value: ThemePreference; label: string; icon: string }[] = [
*
* Aligned with molecule-app/components/theme-toggle.tsx so the picker
* behaves identically across surfaces.
*
* WCAG 2.4.7: focus-visible rings on all three icon buttons.
* ARIA radiogroup pattern (2.1.1): Left/Right arrow keys move focus
* between options and update selection; Home/End jump to first/last.
*/
export function ThemeToggle({ className = "" }: { className?: string }) {
const { theme, setTheme } = useTheme();
const handleKeyDown = useCallback(
(e: React.KeyboardEvent<HTMLButtonElement>, index: number) => {
let next = index;
if (e.key === "ArrowRight" || e.key === "ArrowDown") {
e.preventDefault();
next = (index + 1) % OPTIONS.length;
} else if (e.key === "ArrowLeft" || e.key === "ArrowUp") {
e.preventDefault();
next = (index - 1 + OPTIONS.length) % OPTIONS.length;
} else if (e.key === "Home") {
e.preventDefault();
next = 0;
} else if (e.key === "End") {
e.preventDefault();
next = OPTIONS.length - 1;
} else {
return;
}
setTheme(OPTIONS[next].value);
// Move focus to the new button so arrow-key navigation is continuous
const btns = (e.currentTarget.closest("[role=radiogroup]") as HTMLElement)?.querySelectorAll<HTMLButtonElement>("[role=radio]");
btns?.[next]?.focus();
},
[]
);
return (
<div
role="radiogroup"
aria-label="Theme preference"
className={`inline-flex items-center gap-0.5 rounded-md border border-line bg-surface-sunken p-0.5 ${className}`}
>
{OPTIONS.map((opt, index) => {
{OPTIONS.map((opt) => {
const active = theme === opt.value;
return (
<button
@ -84,12 +53,11 @@ export function ThemeToggle({ className = "" }: { className?: string }) {
aria-checked={active}
aria-label={opt.label}
onClick={() => setTheme(opt.value)}
onKeyDown={(e) => handleKeyDown(e, index)}
className={
"flex h-6 w-6 items-center justify-center rounded transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1 focus-visible:ring-offset-surface-sunken " +
"flex h-6 w-6 items-center justify-center rounded transition-colors " +
(active
? "bg-surface-elevated text-ink shadow-sm"
: "text-ink-mid hover:text-ink")
: "text-ink-soft hover:text-ink-mid")
}
>
<svg

View File

@ -9,7 +9,6 @@ import { ConfirmDialog } from "@/components/ConfirmDialog";
import { showToast } from "@/components/Toaster";
import { ThemeToggle } from "@/components/ThemeToggle";
import { statusDotClass } from "@/lib/design-tokens";
import { KeyboardShortcutsDialog } from "@/components/KeyboardShortcutsDialog";
export function Toolbar() {
const nodes = useCanvasStore((s) => s.nodes);
@ -34,7 +33,6 @@ export function Toolbar() {
const [restartingAll, setRestartingAll] = useState(false);
const [restartConfirmOpen, setRestartConfirmOpen] = useState(false);
const [helpOpen, setHelpOpen] = useState(false);
const [shortcutsOpen, setShortcutsOpen] = useState(false);
const helpRef = useRef<HTMLDivElement>(null);
// Suppress toast on the very first connect at page load; only fire on reconnects.
@ -129,38 +127,15 @@ export function Toolbar() {
};
}, []);
// Global ? shortcut opens the shortcuts dialog (mirrors the help button).
// Skip when the user is typing in an input so ? in a text field doesn't
// steal focus. Also skip when a modal/dialog is already open.
useEffect(() => {
const handler = (e: KeyboardEvent) => {
if (e.key !== "?") return;
const tag = (e.target as HTMLElement).tagName;
const inInput =
tag === "INPUT" ||
tag === "TEXTAREA" ||
tag === "SELECT" ||
(e.target as HTMLElement).isContentEditable;
if (inInput) return;
// Don't fire when a modal/dialog is already mounted (canvas modals,
// side panel, etc. use z-50 or above).
if (document.querySelector('[role="dialog"][aria-modal="true"]')) return;
e.preventDefault();
setShortcutsOpen(true);
};
window.addEventListener("keydown", handler);
return () => window.removeEventListener("keydown", handler);
}, []);
return (
<div
className="fixed top-3 z-20 flex items-center gap-3 bg-surface-sunken/80 backdrop-blur-md border border-line/60 rounded-xl px-3 sm:px-4 py-2 shadow-xl shadow-black/20 transition-[margin-left] duration-200 left-2 right-2 translate-x-0 sm:left-1/2 sm:right-auto sm:-translate-x-1/2 overflow-x-auto sm:overflow-visible [&>*]:shrink-0"
className="fixed top-3 left-1/2 -translate-x-1/2 z-20 flex items-center gap-3 bg-surface-sunken/80 backdrop-blur-md border border-line/60 rounded-xl px-4 py-2 shadow-xl shadow-black/20 transition-[margin-left] duration-200"
style={toolbarOffsetStyle}
>
{/* Logo / Title — title text drops on mobile to reclaim space */}
<div className="flex items-center gap-2 sm:pr-3 sm:border-r sm:border-line/60">
{/* Logo / Title */}
<div className="flex items-center gap-2 pr-3 border-r border-line/60">
<img src="/molecule-icon.png" alt="Molecule AI" className="w-5 h-5" />
<span className="hidden sm:inline text-[11px] font-semibold text-ink-mid tracking-wide">Molecule AI</span>
<span className="text-[11px] font-semibold text-ink-mid tracking-wide">Molecule AI</span>
</div>
{/* Status pills + workspace total in one segment previously two
@ -179,15 +154,15 @@ export function Toolbar() {
{counts.failed > 0 && (
<StatusPill color={statusDotClass("failed")} count={counts.failed} label="failed" />
)}
<span className="hidden sm:inline text-ink-mid" aria-hidden="true">·</span>
<span className="hidden sm:inline text-[10px] text-ink-mid whitespace-nowrap">
<span className="text-ink-mid" aria-hidden="true">·</span>
<span className="text-[10px] text-ink-mid whitespace-nowrap">
{counts.roots} workspace{counts.roots !== 1 ? "s" : ""}
{counts.children > 0 && <span className="text-ink-mid"> + {counts.children} sub</span>}
</span>
</div>
{/* WebSocket connection status */}
<div className="sm:pl-3 sm:border-l sm:border-line/60">
<div className="pl-3 border-l border-line/60">
<WsStatusPill status={wsStatus} />
</div>
@ -280,7 +255,7 @@ export function Toolbar() {
}}
aria-label="Open audit trail for selected workspace"
title="Audit — view ledger for the selected workspace"
className="flex items-center justify-center w-7 h-7 bg-surface-card hover:bg-surface-card/70 border border-line rounded-lg transition-colors text-ink-mid hover:text-ink focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-accent focus-visible:ring-offset-1"
className="flex items-center justify-center w-7 h-7 bg-surface-card hover:bg-surface-card/70 border border-line rounded-lg transition-colors text-ink-mid hover:text-ink focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/40"
>
{/* Scroll / ledger icon */}
<svg
@ -317,7 +292,7 @@ export function Toolbar() {
onClick={() => setHelpOpen((open) => !open)}
className="flex items-center justify-center w-7 h-7 bg-surface-card hover:bg-surface-card/70 border border-line rounded-lg transition-colors text-ink-mid hover:text-ink focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/40"
aria-expanded={helpOpen}
aria-label="Open shortcuts and tips"
aria-label="Open quick help"
title="Help — shortcuts & quick start"
>
<svg width="14" height="14" viewBox="0 0 16 16" fill="none" aria-hidden="true">
@ -327,44 +302,25 @@ export function Toolbar() {
</button>
{helpOpen && (
<div
role="dialog"
aria-label="Shortcuts and tips"
aria-modal="false"
className="absolute right-0 top-full mt-2 w-80 rounded-xl border border-line/60 bg-surface/95 p-3 shadow-2xl shadow-black/50 backdrop-blur-md z-50"
>
<div className="mb-3 flex items-center justify-between">
<span className="text-[10px] font-semibold uppercase tracking-[0.24em] text-ink-mid">Shortcuts & tips</span>
<div className="absolute right-0 top-full mt-2 w-72 rounded-xl border border-line/60 bg-surface/95 p-3 shadow-2xl shadow-black/50 backdrop-blur-md">
<div className="mb-2 flex items-center justify-between">
<span className="text-[10px] font-semibold uppercase tracking-[0.24em] text-ink-mid">Quick start</span>
<button
type="button"
onClick={() => setHelpOpen(false)}
aria-label="Close help dialog"
className="text-[10px] text-ink-mid hover:text-ink transition-colors focus:outline-none focus-visible:underline"
>
Close
</button>
</div>
<div className="space-y-1.5">
<div className="space-y-2">
<HelpRow shortcut="⌘K" text="Search workspaces and jump straight into Details or Chat." />
<HelpRow shortcut="Esc" text="Clear selection, close menus, dismiss dialogs." />
<HelpRow shortcut="Enter" text="Zoom into selected team and select its first child node." />
<HelpRow shortcut="Shift+Enter" text="Select the parent of the selected node." />
<HelpRow shortcut="⌘]" text="Bring selected node forward in the z-order." />
<HelpRow shortcut="⌘[" text="Send selected node backward in the z-order." />
<HelpRow shortcut="Z" text="Zoom canvas to fit a team node and all its sub-workspaces." />
<HelpRow shortcut="Palette" text="Open the template palette to deploy a new workspace." />
<HelpRow shortcut="Right-click" text="Use node actions for duplicate, export, restart, or delete." />
<HelpRow shortcut="Dbl-click" text="On a team node: expand and zoom to show all sub-workspaces." />
<HelpRow shortcut="Shift+click" text="Multi-select: add or remove a node from the batch selection." />
<HelpRow shortcut="Chat" text="If a task is still running, the chat tab resumes that session automatically." />
<HelpRow shortcut="Config" text="Use the Config tab for skills, model, secrets, and runtime settings." />
<HelpRow shortcut="Dbl-click / Z" text="Zoom canvas to fit a team node and all its sub-workspaces." />
</div>
{/* Link to the full keyboard shortcuts dialog */}
<button
type="button"
onClick={() => { setHelpOpen(false); setShortcutsOpen(true); }}
className="mt-3 w-full text-center text-[10px] text-ink-mid hover:text-accent transition-colors focus:outline-none focus-visible:underline"
>
See all shortcuts
</button>
</div>
)}
</div>
@ -384,11 +340,6 @@ export function Toolbar() {
onConfirm={restartAll}
onCancel={() => setRestartConfirmOpen(false)}
/>
<KeyboardShortcutsDialog
open={shortcutsOpen}
onClose={() => setShortcutsOpen(false)}
/>
</div>
);
}
@ -405,30 +356,24 @@ function StatusPill({ color, count, label }: { color: string; count: number; lab
function WsStatusPill({ status }: { status: "connected" | "connecting" | "disconnected" }) {
if (status === "connected") {
return (
<div className="flex items-center gap-1.5" title="Real-time updates: connected">
{/* Decorative dot — not meaningful content for screen readers */}
<div className="flex items-center gap-1.5" title="Real-time updates: connected" aria-label="Real-time updates: connected">
<div className={`w-1.5 h-1.5 rounded-full ${statusDotClass("online")}`} aria-hidden="true" />
{/* Status text exposed to screen readers (aria-hidden removed) */}
<span className="text-[10px] text-ink-mid">Live</span>
<span className="text-[10px] text-ink-mid" aria-hidden="true">Live</span>
</div>
);
}
if (status === "connecting") {
return (
<div className="flex items-center gap-1.5" title="Real-time updates: reconnecting…">
{/* Decorative dot — not meaningful content for screen readers */}
<div className="flex items-center gap-1.5" title="Real-time updates: reconnecting…" aria-label="Real-time updates: reconnecting">
<div className="w-1.5 h-1.5 rounded-full bg-amber-400 motion-safe:animate-pulse" aria-hidden="true" />
{/* Status text exposed to screen readers (aria-hidden removed) */}
<span className="text-[10px] text-warm">Reconnecting</span>
<span className="text-[10px] text-warm" aria-hidden="true">Reconnecting</span>
</div>
);
}
return (
<div className="flex items-center gap-1.5" title="Real-time updates: disconnected">
{/* Decorative dot — not meaningful content for screen readers */}
<div className="flex items-center gap-1.5" title="Real-time updates: disconnected" aria-label="Real-time updates: disconnected">
<div className={`w-1.5 h-1.5 rounded-full ${statusDotClass("failed")}`} aria-hidden="true" />
{/* Status text exposed to screen readers (aria-hidden removed) */}
<span className="text-[10px] text-bad">Offline</span>
<span className="text-[10px] text-bad" aria-hidden="true">Offline</span>
</div>
);
}

Some files were not shown because too many files have changed in this diff.