Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f12c38b3f6 |
+5
-14
@@ -19,22 +19,13 @@ REDIS_URL=redis://localhost:6379
|
||||
# itself to 3000 in canvas/package.json, so sourcing this file before
|
||||
# `npm run dev` won't accidentally make Next.js try to bind 8080.
|
||||
PORT=8080
|
||||
# ---- Admin credential — REQUIRED in EVERY environment (auth is fail-closed) ----
|
||||
# Auth is fail-CLOSED everywhere now (harden/no-fail-open-auth): there is NO
|
||||
# dev-mode escape hatch. AdminAuth / WorkspaceAuth / discovery all require a
|
||||
# real credential. The canvas authenticates by sending this value as a bearer
|
||||
# (it reads NEXT_PUBLIC_ADMIN_TOKEN — set it to the SAME value).
|
||||
# ---- Admin credential — REQUIRED to close issue #684 (AdminAuth bearer bypass) ----
|
||||
# When ADMIN_TOKEN is set, only this value is accepted on /admin/* and /approvals/* routes.
|
||||
# (When unset, a fresh install 401s on admin routes and any valid workspace bearer
|
||||
# is the only deprecated fallback once tokens exist — set ADMIN_TOKEN to close #684.)
|
||||
# Generate: openssl rand -base64 32 (scripts/dev-start.sh provisions a fixed dev value)
|
||||
# Without it, any valid workspace bearer token can call admin endpoints (backward compat
|
||||
# fallback, still vulnerable). Set this in every environment, rotate when compromised.
|
||||
# Generate: openssl rand -base64 32
|
||||
# Store in fly secrets / deployment env — NEVER commit the actual value here.
|
||||
ADMIN_TOKEN=
|
||||
# NEXT_PUBLIC_ADMIN_TOKEN= # Canvas-side mirror of ADMIN_TOKEN. The canvas
|
||||
# bakes this into its bundle and sends it as the
|
||||
# bearer. MUST equal ADMIN_TOKEN (next.config.ts
|
||||
# warns if the pair is half-set). dev-start.sh
|
||||
# exports it for you.
|
||||
SECRETS_ENCRYPTION_KEY= # 32-byte key (raw or base64). Leave empty for plaintext (dev only).
|
||||
CONFIGS_DIR= # Path to workspace-configs-templates/ (auto-discovered if empty)
|
||||
PLUGINS_DIR= # Path to plugins/ directory (default: /plugins in container)
|
||||
@@ -43,7 +34,7 @@ PLUGINS_DIR= # Path to plugins/ directory (default: /plugins i
|
||||
# MOLECULE_MCP_ALLOW_SEND_MESSAGE= # Set to "true" to include send_message_to_user in the MCP bridge tool list (issue #810). Excluded by default to prevent unintended WebSocket pushes from CLI sessions.
|
||||
# MOLECULE_MCP_URL=http://localhost:8080 # Platform URL for opencode MCP config (opencode.json). Same as PLATFORM_URL; separate var so opencode configs can reference it without ambiguity.
|
||||
# WORKSPACE_DIR= # Optional global host path bind-mounted to /workspace in every container. Per-workspace workspace_dir column overrides this; if neither is set each workspace gets an isolated Docker named volume.
|
||||
MOLECULE_ENV=development # Environment label (development/staging/production). Used for log tagging and for NON-security local-dev conveniences (loopback HTTP bind, relaxed rate-limit bucket). It is NOT an auth lever — auth is fail-closed in every environment. SaaS deployments MUST set MOLECULE_ENV=production.
|
||||
MOLECULE_ENV=development # Environment label (development/staging/production). Used for log tagging and for the AdminAuth dev-mode escape hatch (lets the Canvas dashboard keep working after the first workspace is created, when ADMIN_TOKEN is unset). SaaS deployments MUST set MOLECULE_ENV=production.
|
||||
# MOLECULE_ENABLE_TEST_TOKENS= # Set to 1 to expose GET /admin/workspaces/:id/test-token (mints a fresh bearer token for E2E scripts). The route is auto-enabled when MOLECULE_ENV != production; this flag is the explicit override. Leave unset/0 in prod — the route 404s unless enabled.
|
||||
# MOLECULE_ORG_ID= # SaaS only: org UUID set by control plane on tenant machines. When set, workspace provisioning auto-routes through the control plane API instead of Docker.
|
||||
# CP_PROVISION_URL= # Override control plane URL for workspace provisioning (default: https://api.moleculesai.app). Only needed for testing against a non-production control plane.
|
||||
|
||||
@@ -8,8 +8,7 @@ pair diverges.
|
||||
Sources:
|
||||
A. `.gitea/workflows/ci.yml` jobs (CI source — the actual job set)
|
||||
B. `status_check_contexts` in branch_protections (the merge gate)
|
||||
C. `REQUIRED_CHECKS_JSON` (preferred) or `REQUIRED_CHECKS` (legacy)
|
||||
env in audit-force-merge.yml (the audit env)
|
||||
C. `REQUIRED_CHECKS` env in audit-force-merge.yml (the audit env)
|
||||
|
||||
Three failure classes:
|
||||
F1 Job in (A) is not under the sentinel's `needs:` — sentinel
|
||||
@@ -251,21 +250,13 @@ def sentinel_needs(ci_doc: dict) -> set[str]:
|
||||
return set(needs)
|
||||
|
||||
|
||||
def required_checks_env(audit_doc: dict, branch: str) -> set[str]:
|
||||
"""Pull the required-checks env value from audit-force-merge.yml.
|
||||
|
||||
def required_checks_env(audit_doc: dict) -> set[str]:
|
||||
"""Pull the REQUIRED_CHECKS env value from audit-force-merge.yml.
|
||||
Walks the YAML AST per `feedback_behavior_based_ast_gates`: we do
|
||||
NOT grep for env keys — that breaks under reformatting,
|
||||
NOT grep for `REQUIRED_CHECKS:` — that breaks under reformatting,
|
||||
multi-job workflows, or a future move of the env to a different
|
||||
step. Instead, look inside every job's every step's `env:` map.
|
||||
|
||||
Supports two variants:
|
||||
- REQUIRED_CHECKS_JSON (preferred): JSON dict keyed by branch name.
|
||||
We extract the array for the target branch.
|
||||
- REQUIRED_CHECKS (legacy): newline-separated list of context names.
|
||||
"""
|
||||
found_json: list[str] = []
|
||||
found_legacy: list[str] = []
|
||||
step. Instead, look inside every job's every step's `env:` map."""
|
||||
found: list[str] = []
|
||||
jobs = audit_doc.get("jobs", {})
|
||||
if not isinstance(jobs, dict):
|
||||
sys.stderr.write(f"::warning::{AUDIT_WORKFLOW_PATH} has no jobs: mapping\n")
|
||||
@@ -277,67 +268,27 @@ def required_checks_env(audit_doc: dict, branch: str) -> set[str]:
|
||||
if not isinstance(step, dict):
|
||||
continue
|
||||
step_env = step.get("env") or {}
|
||||
if isinstance(step_env, dict):
|
||||
if "REQUIRED_CHECKS_JSON" in step_env:
|
||||
v = step_env["REQUIRED_CHECKS_JSON"]
|
||||
if isinstance(v, str):
|
||||
found_json.append(v)
|
||||
if "REQUIRED_CHECKS" in step_env:
|
||||
v = step_env["REQUIRED_CHECKS"]
|
||||
if isinstance(v, str):
|
||||
found_legacy.append(v)
|
||||
|
||||
# JSON variant takes precedence.
|
||||
if found_json:
|
||||
if len(found_json) > 1:
|
||||
sys.stderr.write(
|
||||
f"::error::REQUIRED_CHECKS_JSON env present in {len(found_json)} steps; ambiguous\n"
|
||||
)
|
||||
sys.exit(3)
|
||||
try:
|
||||
parsed = json.loads(found_json[0])
|
||||
except json.JSONDecodeError as e:
|
||||
sys.stderr.write(
|
||||
f"::error::REQUIRED_CHECKS_JSON is not valid JSON: {e}\n"
|
||||
)
|
||||
sys.exit(3)
|
||||
if not isinstance(parsed, dict):
|
||||
sys.stderr.write(
|
||||
f"::error::REQUIRED_CHECKS_JSON parsed to {type(parsed).__name__}, expected dict\n"
|
||||
)
|
||||
sys.exit(3)
|
||||
branch_checks = parsed.get(branch)
|
||||
if branch_checks is None:
|
||||
sys.stderr.write(
|
||||
f"::error::REQUIRED_CHECKS_JSON has no entry for branch '{branch}'\n"
|
||||
)
|
||||
sys.exit(3)
|
||||
if not isinstance(branch_checks, list):
|
||||
sys.stderr.write(
|
||||
f"::error::REQUIRED_CHECKS_JSON['{branch}'] is {type(branch_checks).__name__}, expected list\n"
|
||||
)
|
||||
sys.exit(3)
|
||||
return {str(item).strip() for item in branch_checks if str(item).strip()}
|
||||
|
||||
# Legacy variant fallback.
|
||||
if found_legacy:
|
||||
if len(found_legacy) > 1:
|
||||
# Defensive: refuse to guess which one is canonical.
|
||||
sys.stderr.write(
|
||||
f"::error::REQUIRED_CHECKS env present in {len(found_legacy)} steps; ambiguous\n"
|
||||
)
|
||||
sys.exit(3)
|
||||
raw = found_legacy[0]
|
||||
# YAML block-scalars (`|`) leave a trailing newline + blanks; trim
|
||||
# consistently with audit-force-merge.sh's parser so both sides
|
||||
# produce identical sets.
|
||||
return {line.strip() for line in raw.splitlines() if line.strip()}
|
||||
|
||||
sys.stderr.write(
|
||||
f"::error::Neither REQUIRED_CHECKS_JSON nor REQUIRED_CHECKS env found in any step of "
|
||||
f"{AUDIT_WORKFLOW_PATH}\n"
|
||||
)
|
||||
sys.exit(3)
|
||||
if isinstance(step_env, dict) and "REQUIRED_CHECKS" in step_env:
|
||||
v = step_env["REQUIRED_CHECKS"]
|
||||
if isinstance(v, str):
|
||||
found.append(v)
|
||||
if not found:
|
||||
sys.stderr.write(
|
||||
f"::error::REQUIRED_CHECKS env not found in any step of "
|
||||
f"{AUDIT_WORKFLOW_PATH}\n"
|
||||
)
|
||||
sys.exit(3)
|
||||
if len(found) > 1:
|
||||
# Defensive: refuse to guess which one is canonical.
|
||||
sys.stderr.write(
|
||||
f"::error::REQUIRED_CHECKS env present in {len(found)} steps; ambiguous\n"
|
||||
)
|
||||
sys.exit(3)
|
||||
raw = found[0]
|
||||
# YAML block-scalars (`|`) leave a trailing newline + blanks; trim
|
||||
# consistently with audit-force-merge.sh's parser so both sides
|
||||
# produce identical sets.
|
||||
return {line.strip() for line in raw.splitlines() if line.strip()}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
@@ -379,7 +330,7 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
|
||||
jobs = ci_job_names(ci_doc)
|
||||
jobs_all = ci_jobs_all(ci_doc)
|
||||
needs = sentinel_needs(ci_doc)
|
||||
env_set = required_checks_env(audit_doc, branch)
|
||||
env_set = required_checks_env(audit_doc)
|
||||
|
||||
# Protection
|
||||
# api() raises ApiError on non-2xx. Transient 5xx should fail loud.
|
||||
@@ -573,7 +524,7 @@ def render_body(branch: str, findings: list[str], debug: dict) -> str:
|
||||
"- **F2**: rename the protection context to match an emitter, "
|
||||
"or remove it from `status_check_contexts` "
|
||||
"(PATCH `/api/v1/repos/{owner}/{repo}/branch_protections/{branch}`).",
|
||||
"- **F3a / F3b**: bring `REQUIRED_CHECKS_JSON` (or `REQUIRED_CHECKS` legacy) env in "
|
||||
"- **F3a / F3b**: bring `REQUIRED_CHECKS` env in "
|
||||
"`.gitea/workflows/audit-force-merge.yml` into set-equality with "
|
||||
"`status_check_contexts` (single PR, both files).",
|
||||
"",
|
||||
|
||||
@@ -26,10 +26,6 @@ PROFILES: dict[str, dict[str, str]] = {
|
||||
"handlers": (
|
||||
r"^workspace-server/internal/handlers/"
|
||||
r"|^workspace-server/internal/wsauth/"
|
||||
# #2149: the scheduler real-PG integration tests run in this same
|
||||
# workflow (they reuse its migrated Postgres), so changes to the
|
||||
# scheduler package must trigger the job too.
|
||||
r"|^workspace-server/internal/scheduler/"
|
||||
r"|^workspace-server/migrations/"
|
||||
r"|^\.gitea/workflows/handlers-postgres-integration\.yml$"
|
||||
),
|
||||
@@ -178,4 +174,3 @@ def main(argv: list[str]) -> int:
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv[1:]))
|
||||
|
||||
|
||||
@@ -466,40 +466,12 @@ def fetch_log(target_url: str) -> str | None:
|
||||
|
||||
def grep_fail_markers(log_text: str) -> list[str]:
|
||||
"""Return up to 5 sample matching lines for any FAIL_PATTERNS hit.
|
||||
Empty list = clean log.
|
||||
|
||||
Heuristic: skip lines where the marker appears inside script source
|
||||
(e.g. ``echo "::error::..."`` in a ``::group::Run`` block) rather
|
||||
than actual execution output. The Gitea Actions log prints the raw
|
||||
script before executing it; ``echo "::error::"`` lines in that
|
||||
display are false positives.
|
||||
"""
|
||||
Empty list = clean log."""
|
||||
matches: list[str] = []
|
||||
in_run_group = False
|
||||
group_depth = 0
|
||||
for line in log_text.splitlines():
|
||||
stripped = line.strip()
|
||||
# Track Gitea Actions group markers so we can skip the
|
||||
# ``::group::Run`` script-source display blocks.
|
||||
if stripped.startswith("::group::Run"):
|
||||
in_run_group = True
|
||||
group_depth = 1
|
||||
continue
|
||||
if stripped == "::endgroup::":
|
||||
if in_run_group:
|
||||
in_run_group = False
|
||||
group_depth = 0
|
||||
continue
|
||||
if in_run_group:
|
||||
continue
|
||||
for pat in FAIL_PATTERNS:
|
||||
if pat in line:
|
||||
# Additional false-positive guard: ``echo "::error::"``
|
||||
# is script source, not a runtime error emission.
|
||||
if pat == "::error::":
|
||||
prefix = line[: line.index(pat)].strip()
|
||||
if prefix.endswith('echo') or prefix.endswith("echo '") or prefix.endswith('echo "'):
|
||||
break
|
||||
# Truncate to keep error output bounded.
|
||||
matches.append(line.strip()[:240])
|
||||
break
|
||||
if len(matches) >= 5:
|
||||
|
||||
@@ -364,71 +364,6 @@ def _api_json_optional(url: str, token: str) -> tuple[int, dict | None]:
|
||||
return exc.code, None
|
||||
|
||||
|
||||
def current_branch_head(env: dict[str, str]) -> str | None:
|
||||
"""Return the SHA at the tip of the deploy branch (main) per Gitea, or None.
|
||||
|
||||
Used to detect a *superseded* deploy job (see `superseded_by`). Fail-safe:
|
||||
any read error / missing token returns None so the caller treats the job as
|
||||
NOT superseded and the strict /buildinfo verify still runs. We never let an
|
||||
unreadable head silently green a deploy.
|
||||
"""
|
||||
|
||||
token = env.get("GITEA_TOKEN", "").strip()
|
||||
if not token:
|
||||
return None
|
||||
host = env.get("GITEA_HOST", "git.moleculesai.app")
|
||||
repo = env.get("GITHUB_REPOSITORY", "molecule-ai/molecule-core")
|
||||
# Deploy lane is on: push:main; the branch is always main here, but read it
|
||||
# from the ref name when present so a future branch rename doesn't break us.
|
||||
branch = env.get("GITHUB_REF_NAME", "").strip() or "main"
|
||||
url = f"https://{host}/api/v1/repos/{repo}/branches/{quote(branch, safe='')}"
|
||||
status, body = _api_json_optional(url, token)
|
||||
if status != 200 or not isinstance(body, dict):
|
||||
return None
|
||||
commit = body.get("commit")
|
||||
if isinstance(commit, dict):
|
||||
head = commit.get("id") or commit.get("sha")
|
||||
if isinstance(head, str) and head.strip():
|
||||
return head.strip()
|
||||
return None
|
||||
|
||||
|
||||
def superseded_by(env: dict[str, str]) -> str | None:
|
||||
"""Return the newer head SHA if THIS deploy job has been superseded, else None.
|
||||
|
||||
This workflow runs with no `concurrency:` (intentional — Gitea 1.22.6 cancels
|
||||
queued runs, which is unacceptable for a prod deploy). When two main pushes
|
||||
land close together, BOTH deploy-production jobs run. The newer push rolls the
|
||||
fleet forward first; the OLDER job's strict /buildinfo verify then sees tenants
|
||||
on the NEWER SHA and false-reds with "$slug is stale" — even though the fleet
|
||||
is AHEAD, not behind. Git SHAs aren't ordered, so the verify can't tell ahead
|
||||
from behind on its own (and /buildinfo exposes only git_sha, no build time).
|
||||
|
||||
Resolve it at the source of truth for ordering — the branch ref: if main's
|
||||
current head is a DIFFERENT SHA than the one this job is deploying, a newer
|
||||
commit has landed and this job is superseded; the newest job's verify is the
|
||||
authoritative one. We return that head SHA so the caller can log it and exit
|
||||
success early, skipping the strict-equality verify for this stale job.
|
||||
|
||||
Fail-safe: returns None (NOT superseded) when the head can't be read or equals
|
||||
our SHA, so a genuinely-behind tenant under the LATEST deploy job still fails
|
||||
the strict verify loudly. This never suppresses a real-stale signal — it only
|
||||
excuses a job that is no longer the latest from asserting exact equality.
|
||||
"""
|
||||
|
||||
sha = env.get("GITHUB_SHA", "").strip()
|
||||
if not sha:
|
||||
return None
|
||||
head = current_branch_head(env)
|
||||
if not head:
|
||||
return None
|
||||
# SHA lengths can differ (short vs full); compare on the shorter prefix.
|
||||
n = min(len(head), len(sha))
|
||||
if head[:n].lower() == sha[:n].lower():
|
||||
return None
|
||||
return head
|
||||
|
||||
|
||||
def live_disable_flag(env: dict[str, str]) -> str:
|
||||
"""Return a live disable value from Gitea variables when readable.
|
||||
|
||||
@@ -507,14 +442,6 @@ def main() -> int:
|
||||
sub.add_parser("plan", help="print production deploy plan as JSON")
|
||||
sub.add_parser("assert-enabled", help="fail if production deploy is currently disabled")
|
||||
sub.add_parser("wait-ci", help="block until required CI context is green")
|
||||
sub.add_parser(
|
||||
"check-superseded",
|
||||
help=(
|
||||
"exit 0 if a newer commit has landed on the deploy branch (this job "
|
||||
"is superseded; prints the newer head SHA), exit 10 if this job is "
|
||||
"still the latest"
|
||||
),
|
||||
)
|
||||
rollout_parser = sub.add_parser("rollout", help="execute canary-first scoped production rollout")
|
||||
rollout_parser.add_argument("--plan", required=True, help="path to prod-auto-deploy plan JSON")
|
||||
rollout_parser.add_argument("--response", required=True, help="path to write aggregate response JSON")
|
||||
@@ -530,16 +457,6 @@ def main() -> int:
|
||||
if args.command == "wait-ci":
|
||||
wait_for_ci_context(dict(os.environ))
|
||||
return 0
|
||||
if args.command == "check-superseded":
|
||||
newer = superseded_by(dict(os.environ))
|
||||
if newer:
|
||||
print(newer)
|
||||
return 0
|
||||
# Exit 10 (not 0, not 1): "this job is still the latest". The
|
||||
# workflow treats only exit 0 as superseded; 10 means proceed to
|
||||
# the strict verify. A non-zero code here is informational, not a
|
||||
# failure — the workflow step swallows it.
|
||||
return 10
|
||||
if args.command == "rollout":
|
||||
rollout_from_plan_file(args.plan, args.response, dict(os.environ))
|
||||
return 0
|
||||
|
||||
@@ -1228,13 +1228,10 @@ def main(argv: list[str] | None = None) -> int:
|
||||
)
|
||||
|
||||
na_desc = ", ".join(sorted(na_descs)) if na_descs else "(none)"
|
||||
# internal#818: na-declarations is an informational context, not a merge
|
||||
# gate. An empty declaration list is a terminal success state — pending
|
||||
# here poisons the PR combined status.
|
||||
na_status_state = "success"
|
||||
na_status_state = "success" if na_descs else "pending"
|
||||
# review-check.sh reads the description to discover which gates are N/A.
|
||||
# Include the gate names so it can grep for them.
|
||||
na_description = f"N/A: {na_desc}"
|
||||
na_description = f"N/A: {na_desc}" if na_descs else "N/A: (none)"
|
||||
|
||||
if not args.dry_run:
|
||||
client.post_status(
|
||||
|
||||
@@ -114,19 +114,6 @@ if [ -z "$WHOAMI" ]; then
|
||||
fi
|
||||
echo "::notice::token resolves to user: $WHOAMI"
|
||||
|
||||
# 0.5 Read PR head SHA so we can reject stale approvals after head moves
|
||||
# (internal#816). Reviews carry the commit_id they were submitted against.
|
||||
HEAD_SHA=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}" | jq -r '.head.sha // ""') || true
|
||||
if [ -z "$HEAD_SHA" ]; then
|
||||
echo "::error::Failed to fetch PR head SHA — token may be invalid."
|
||||
if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
|
||||
echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
|
||||
exit 0
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
debug "pr-head-sha=$HEAD_SHA"
|
||||
|
||||
# 1. Read tier label. || true ensures set -euo pipefail does not abort the
|
||||
# script if curl or jq fails (e.g. 401 from empty token).
|
||||
LABELS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/issues/${PR_NUMBER}/labels" | jq -r '.[].name') || true
|
||||
@@ -278,7 +265,7 @@ if [ $_REVIEWS_EXIT -ne 0 ] || [ -z "$REVIEWS" ]; then
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
APPROVERS=$(echo "$REVIEWS" | jq -r --arg head_sha "$HEAD_SHA" '[.[] | select(.state=="APPROVED" and .commit_id == $head_sha) | .user.login] | unique | .[]') || true
|
||||
APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]') || true
|
||||
if [ -z "$APPROVERS" ]; then
|
||||
echo "::error::No approving reviews on this PR. Set SOP_DEBUG=1 and re-run for diagnostics."
|
||||
exit 1
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import importlib.util
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
@@ -37,76 +36,6 @@ def _make_audit_doc(required_checks: list[str]) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def _make_audit_doc_json(required_checks_json: dict) -> dict:
|
||||
return {
|
||||
"jobs": {
|
||||
"audit": {
|
||||
"steps": [
|
||||
{"env": {"REQUIRED_CHECKS_JSON": json.dumps(required_checks_json)}}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# required_checks_env — dual-variant parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_required_checks_env_prefers_json_over_legacy():
    """When both env variants sit on one step, the per-branch JSON entry wins."""
    per_branch = {"main": ["ctx-a"], "staging": ["ctx-b"]}
    step_env = {
        "REQUIRED_CHECKS_JSON": json.dumps(per_branch),
        "REQUIRED_CHECKS": "ctx-legacy\nctx-old",
    }
    doc = {"jobs": {"audit": {"steps": [{"env": step_env}]}}}

    for branch, expected in (("main", {"ctx-a"}), ("staging", {"ctx-b"})):
        assert drift.required_checks_env(doc, branch) == expected
|
||||
|
||||
|
||||
def test_required_checks_env_falls_back_to_legacy():
    """With only the legacy REQUIRED_CHECKS variant present, its entries are returned."""
    legacy_only = _make_audit_doc(["legacy-ctx"])
    resolved = drift.required_checks_env(legacy_only, "main")
    assert resolved == {"legacy-ctx"}
|
||||
|
||||
|
||||
def test_required_checks_env_json_missing_branch_fails():
    """A JSON map without an entry for the requested branch must exit with code 3."""
    doc = _make_audit_doc_json({"staging": ["ctx-b"]})

    exited = False
    exit_code = None
    try:
        drift.required_checks_env(doc, "main")
    except SystemExit as exc:
        exited = True
        exit_code = exc.code

    if not exited:
        raise AssertionError("expected SystemExit(3)")
    assert exit_code == 3
|
||||
|
||||
|
||||
def test_required_checks_env_json_malformed_fails():
    """A REQUIRED_CHECKS_JSON value that is not valid JSON must exit with code 3."""
    bad_step = {"env": {"REQUIRED_CHECKS_JSON": "not-json"}}
    doc = {"jobs": {"audit": {"steps": [bad_step]}}}

    exited = False
    exit_code = None
    try:
        drift.required_checks_env(doc, "main")
    except SystemExit as exc:
        exited = True
        exit_code = exc.code

    if not exited:
        raise AssertionError("expected SystemExit(3)")
    assert exit_code == 3
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# sentinel_needs
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -1,244 +0,0 @@
|
||||
"""Live-fire regression test for #2159 — gate auto-fire runtime verification.
|
||||
|
||||
Static tests (test_gate_review_auto_fire.py) validate that the workflow YAML
|
||||
is structurally correct. This test validates the *runtime* path: submitting an
|
||||
APPROVED review to a PR whose head contains the current gate workflows causes
|
||||
Gitea Actions to queue the qa-review + security-review workflows and POST the
|
||||
branch-protection-required (pull_request_target) contexts within a reasonable
|
||||
window.
|
||||
|
||||
Skipped when Gitea API credentials are not available. Intended for:
|
||||
- manual developer verification
|
||||
- CI jobs provisioned with a service-account token
|
||||
|
||||
Environment:
|
||||
GITEA_HOST — default: git.moleculesai.app
|
||||
GITEA_TOKEN — token with read:repository + write:issues (for review POST)
|
||||
REPO — default: molecule-ai/molecule-core
|
||||
LIVEFIRE_PR_NUMBER — optional; if omitted the test tries to find a
|
||||
suitable open PR automatically, or skips.
|
||||
LIVEFIRE_TIMEOUT_SEC — default: 120
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
import yaml
|
||||
|
||||
GITEA_HOST = os.environ.get("GITEA_HOST", "git.moleculesai.app")
|
||||
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
|
||||
REPO = os.environ.get("REPO", "molecule-ai/molecule-core")
|
||||
LIVEFIRE_PR_NUMBER = os.environ.get("LIVEFIRE_PR_NUMBER", "")
|
||||
LIVEFIRE_TIMEOUT_SEC = int(os.environ.get("LIVEFIRE_TIMEOUT_SEC", "120"))
|
||||
|
||||
REQUIRED_CONTEXTS = [
|
||||
"qa-review / approved (pull_request_target)",
|
||||
"security-review / approved (pull_request_target)",
|
||||
]
|
||||
|
||||
skip_no_token = pytest.mark.skipif(
|
||||
not GITEA_TOKEN,
|
||||
reason="GITEA_TOKEN not set — live-fire test requires API credentials",
|
||||
)
|
||||
|
||||
|
||||
def _api(method: str, path: str, body: dict | None = None) -> tuple[int, dict | list]:
    """Send one request to the Gitea REST API and return ``(status, payload)``.

    Args:
        method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
        path: API path appended to ``https://{GITEA_HOST}/api/v1``.
        body: Optional JSON-serialisable request body. Any value other than
            None is sent, including an empty dict.

    Returns:
        The HTTP status code and the decoded JSON payload. HTTP error
        statuses are returned, not raised. An empty response body yields
        ``{}``. List endpoints yield a list.

    Raises:
        ValueError: a *successful* response carried a body that is not JSON.
            Error responses with a non-JSON body (e.g. a proxy HTML page)
            are returned as ``{"message": <text>}`` so callers can still
            report the HTTP status.
    """
    url = f"https://{GITEA_HOST}/api/v1{path}"
    headers = {
        "Authorization": f"token {GITEA_TOKEN}",
        "Content-Type": "application/json",
    }
    # `is not None` (not truthiness) so an explicit `{}` body is still sent.
    data = json.dumps(body).encode() if body is not None else None
    req = urllib.request.Request(url, data=data, headers=headers, method=method)
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read()
            code = resp.status
    except urllib.error.HTTPError as exc:
        raw = exc.read()
        code = exc.code

    if not raw:
        return code, {}
    try:
        return code, json.loads(raw)
    except ValueError:
        # Covers JSONDecodeError and UnicodeDecodeError (both ValueError).
        # Keep the success path strict; only tolerate junk on error statuses.
        if code < 400:
            raise
        return code, {"message": raw.decode("utf-8", errors="replace")[:500]}
|
||||
|
||||
|
||||
def _get_pr(number: int) -> dict:
    """Fetch pull request *number* from REPO; fail the test on any non-200 reply."""
    status, payload = _api("GET", f"/repos/{REPO}/pulls/{number}")
    if status == 200:
        return payload
    # pytest.fail raises, so nothing is returned on this path.
    pytest.fail(f"GET /pulls/{number} returned HTTP {status}: {payload}")
|
||||
|
||||
|
||||
def _list_open_prs() -> list[dict]:
    """Return up to 50 open pull requests of REPO; fail the test on a non-200 reply."""
    status, payload = _api("GET", f"/repos/{REPO}/pulls?state=open&limit=50")
    if status == 200:
        return payload
    # pytest.fail raises, so nothing is returned on this path.
    pytest.fail(f"GET /pulls?state=open returned HTTP {status}: {payload}")
|
||||
|
||||
|
||||
def _pr_has_trigger_in_head(pr: dict) -> bool:
    """Return True if the PR head declares pull_request_review in both gate workflows.

    For each of ``qa-review.yml`` and ``security-review.yml`` the file is
    fetched at the PR's head SHA and its trigger section inspected.

    Returns False as soon as one file cannot be fetched (non-200), does not
    parse to a YAML mapping (e.g. an empty file), or does not list the
    ``pull_request_review`` trigger.
    """
    head_sha = pr["head"]["sha"]
    for wf_name in ("qa-review.yml", "security-review.yml"):
        path = f"/repos/{REPO}/contents/.gitea/workflows/{wf_name}?ref={head_sha}"
        code, payload = _api("GET", path)
        if code != 200:
            return False
        raw = base64.b64decode(payload.get("content", "")).decode("utf-8")
        wf = yaml.safe_load(raw)
        # An empty or scalar document parses to None/str, which has no
        # `.get`; such a file cannot declare the trigger.
        if not isinstance(wf, dict):
            return False
        # YAML 1.1 parses the bare key `on` as boolean True, so look the
        # trigger section up under both spellings.
        on = wf.get(True) or wf.get("on") or {}
        if isinstance(on, str):
            if on != "pull_request_review":
                return False
        elif "pull_request_review" not in on:
            return False
    return True
|
||||
|
||||
|
||||
def _find_suitable_pr() -> dict:
    """Pick the pull request the live-fire test should run against.

    When LIVEFIRE_PR_NUMBER is set, that PR is used and the test is skipped
    if it is not open. Otherwise the first open PR whose head contains the
    pull_request_review trigger is used; if none qualifies the test is
    skipped.
    """
    if not LIVEFIRE_PR_NUMBER:
        # Lazy scan: stop probing as soon as one PR qualifies.
        match = next(
            (cand for cand in _list_open_prs() if _pr_has_trigger_in_head(cand)),
            None,
        )
        if match is None:
            pytest.skip("No open PR found whose head contains the pull_request_review trigger")
        return match

    pinned = _get_pr(int(LIVEFIRE_PR_NUMBER))
    if pinned.get("state") != "open":
        pytest.skip(f"PR {LIVEFIRE_PR_NUMBER} is not open")
    return pinned
|
||||
|
||||
|
||||
def _submit_approved_review(pr_number: int) -> dict:
    """POST an APPROVED review on *pr_number* and return the API payload.

    Fails the test when the API answers with anything other than 200, 201
    or 422. The failure message includes the response payload so the cause
    (bad token, missing scope, ...) is visible in the test report.
    """
    code, review = _api(
        "POST",
        f"/repos/{REPO}/pulls/{pr_number}/reviews",
        {"body": "Live-fire test APPROVED review", "event": "APPROVED"},
    )
    # 200/201 = created; 422 is tolerated as "review already exists"
    # (idempotent enough for our purposes).
    if code not in (200, 201, 422):
        pytest.fail(f"POST /pulls/{pr_number}/reviews returned HTTP {code}: {review}")
    return review
|
||||
|
||||
|
||||
def _get_status_snapshot(sha: str) -> dict[str, dict]:
    """Snapshot the required-context commit statuses currently posted on *sha*.

    Returns a mapping ``context -> {"id", "updated_at", "target_url"}``
    restricted to REQUIRED_CONTEXTS. ``updated_at`` falls back to
    ``created_at`` and then to ``""``. When a context appears more than once
    in the listing, the last entry wins. A non-200 reply yields ``{}``.
    """
    code, entries = _api("GET", f"/repos/{REPO}/statuses/{sha}?limit=100")
    if code != 200:
        return {}
    return {
        entry.get("context", ""): {
            "id": entry.get("id"),
            "updated_at": entry.get("updated_at", entry.get("created_at", "")),
            "target_url": entry.get("target_url"),
        }
        for entry in entries
        if entry.get("context", "") in REQUIRED_CONTEXTS
    }
|
||||
|
||||
|
||||
def _extract_run_id(target_url: str | None) -> str | None:
|
||||
"""Extract the Actions run_id from a status target_url."""
|
||||
if not target_url:
|
||||
return None
|
||||
m = re.search(r"/actions/runs/(\d+)", target_url)
|
||||
return m.group(1) if m else None
|
||||
|
||||
|
||||
def _poll_fresh_statuses(
    sha: str,
    prior_snapshot: dict[str, dict],
    timeout_sec: int = LIVEFIRE_TIMEOUT_SEC,
) -> dict[str, dict]:
    """Poll the commit statuses of *sha* until every required context is fresh.

    A status counts as fresh when its context was absent from
    *prior_snapshot*, or when its timestamp, id or target_url differs from
    the snapshot entry.

    Args:
        sha: Commit whose statuses are polled.
        prior_snapshot: Result of `_get_status_snapshot` taken before the
            triggering action.
        timeout_sec: Polling budget in seconds; polls are 5 seconds apart.

    Returns:
        Mapping ``context -> {"state", "updated_at", "id", "target_url"}``
        for every fresh required context seen. It may be incomplete if the
        timeout elapsed first; the caller checks for missing contexts.
    """
    stop_at = time.monotonic() + timeout_sec
    fresh: dict[str, dict] = {}
    endpoint = f"/repos/{REPO}/statuses/{sha}?limit=100"

    while time.monotonic() < stop_at:
        code, entries = _api("GET", endpoint)
        if code == 200:
            for entry in entries:
                name = entry.get("context", "")
                if name not in REQUIRED_CONTEXTS:
                    continue
                stamp = entry.get("updated_at", entry.get("created_at", ""))
                ident = entry.get("id")
                link = entry.get("target_url")
                before = prior_snapshot.get(name, {})
                # Unchanged only if the context was already known and all
                # three identifying fields still match the snapshot.
                unchanged = (
                    name in prior_snapshot
                    and stamp == before.get("updated_at", "")
                    and ident == before.get("id")
                    and link == before.get("target_url")
                )
                if unchanged:
                    continue
                fresh[name] = {
                    "state": entry.get("state", entry.get("status", "")),
                    "updated_at": stamp,
                    "id": ident,
                    "target_url": link,
                }
            if all(required in fresh for required in REQUIRED_CONTEXTS):
                return fresh
        time.sleep(5)
    return fresh
|
||||
|
||||
|
||||
@skip_no_token
class TestGateAutoFireLive:
    """Live check that an APPROVED review makes the gate workflows post statuses."""

    def test_auto_fire_posts_required_contexts(self):
        """Submit APPROVED review; assert BP-required contexts appear fresh within timeout."""
        target = _find_suitable_pr()
        number, sha = target["number"], target["head"]["sha"]

        # Baseline taken BEFORE the review so stale statuses from an earlier
        # run cannot be mistaken for freshly posted ones.
        baseline = _get_status_snapshot(sha)
        baseline_ids = (_extract_run_id(entry["target_url"]) for entry in baseline.values())
        old_run_ids = {rid for rid in baseline_ids if rid}

        _submit_approved_review(number)

        fresh = _poll_fresh_statuses(sha, baseline)

        absent = [ctx for ctx in REQUIRED_CONTEXTS if ctx not in fresh]
        if absent:
            pytest.fail(
                f"After {LIVEFIRE_TIMEOUT_SEC}s, fresh contexts still missing: {absent}. "
                f"Found: {fresh}. Prior snapshot: {baseline}. "
                f"PR #{number} head={sha}. "
                f"This indicates the pull_request_review trigger did not fire at runtime."
            )

        # Fresh contexts are the proof of auto-fire. Success vs failure is
        # the evaluator's call; #2159 only requires that the workflows queue
        # and post at all.
        for ctx, info in fresh.items():
            state = info["state"]
            assert state in ("pending", "success", "failure"), (
                f"Unexpected state {state!r} for {ctx}"
            )

            # CR2 Finding 1: require a NEW workflow run, not an in-place
            # status update. Gitea 1.22.6 lacks REST /actions/runs/*
            # endpoints, so the run id embedded in target_url is the proxy.
            run_id = _extract_run_id(info.get("target_url"))
            if run_id and run_id in old_run_ids:
                pytest.fail(
                    f"Context {ctx!r} has target_url run_id {run_id} which existed "
                    f"BEFORE the review was submitted. This means the status was "
                    f"updated in-place by an existing run, not by a new workflow "
                    f"run triggered from the pull_request_review event."
                )
|
||||
@@ -1,145 +0,0 @@
|
||||
"""Stale-head diagnostic test for #2159.
|
||||
|
||||
Deterministically reports whether a PR's HEAD contains the pull_request_review
|
||||
trigger in qa-review.yml and security-review.yml. If the trigger is absent,
|
||||
auto-fire on APPROVED review is impossible for that PR.
|
||||
|
||||
This is used as a self-diagnostic for future stale-PR situations (PRs opened
|
||||
before #2157 merged, or branches cut from old bases).
|
||||
|
||||
Environment:
|
||||
GITEA_HOST — default: git.moleculesai.app
|
||||
GITEA_TOKEN — token with read:repository scope (optional; falls back to local files)
|
||||
REPO — default: molecule-ai/molecule-core
|
||||
PR_NUMBER — required when running against a real PR
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
import yaml
|
||||
|
||||
GITEA_HOST = os.environ.get("GITEA_HOST", "git.moleculesai.app")
|
||||
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
|
||||
REPO = os.environ.get("REPO", "molecule-ai/molecule-core")
|
||||
PR_NUMBER = os.environ.get("PR_NUMBER", "")
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[2]
|
||||
|
||||
|
||||
def _api(method: str, path: str) -> tuple[int, dict]:
    """Call the Gitea REST API and return ``(http_status, decoded_body)``.

    Args:
        method: HTTP verb, e.g. ``"GET"``.
        path: API path appended to ``https://{GITEA_HOST}/api/v1``.

    Returns:
        The HTTP status code and the JSON-decoded response body. For an HTTP
        error response the status is still returned (not raised) so callers
        can report it; an empty error body becomes ``{}`` and a non-JSON error
        body is wrapped as ``{"message": <text>}``.
    """
    url = f"https://{GITEA_HOST}/api/v1{path}"
    headers = {"Authorization": f"token {GITEA_TOKEN}"}
    req = urllib.request.Request(url, headers=headers, method=method)
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.status, json.loads(resp.read())
    except urllib.error.HTTPError as exc:
        body = exc.read()
        if not body:
            return exc.code, {}
        try:
            return exc.code, json.loads(body)
        except ValueError:
            # Error pages are not guaranteed to be JSON (e.g. an HTML page from
            # a proxy). Keep the status code visible to the caller instead of
            # masking it with a decode error.
            return exc.code, {"message": body.decode("utf-8", errors="replace")}
|
||||
|
||||
|
||||
def _fetch_workflow_from_ref(workflow_name: str, ref: str) -> dict:
    """Fetch ``.gitea/workflows/<workflow_name>`` at ``ref`` and parse it as YAML.

    The file is read through the repository contents API, whose ``content``
    field is base64-decoded here. Any non-200 response fails the test.
    """
    path = f"/repos/{REPO}/contents/.gitea/workflows/{workflow_name}?ref={ref}"
    status, response = _api("GET", path)
    if status != 200:
        pytest.fail(
            f"GET {path} returned HTTP {status}: {response}. "
            f"Cannot determine whether PR head contains the trigger."
        )
    encoded = response.get("content", "")
    workflow_text = base64.b64decode(encoded).decode("utf-8")
    return yaml.safe_load(workflow_text)
|
||||
|
||||
|
||||
def _fetch_workflow_local(workflow_name: str) -> dict:
    """Parse the workflow file ``ROOT / "workflows" / workflow_name`` from disk.

    Fails the test when the file does not exist. The file is read as UTF-8
    explicitly so the result does not depend on the runner's locale and
    matches how the API-fetched copy is decoded.
    """
    p = ROOT / "workflows" / workflow_name
    if not p.exists():
        pytest.fail(f"Local workflow file not found: {p}")
    return yaml.safe_load(p.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def _has_pull_request_review_trigger(wf: dict) -> bool:
|
||||
on = wf.get(True) or wf.get("on") or {}
|
||||
if isinstance(on, list):
|
||||
return "pull_request_review" in on
|
||||
if isinstance(on, dict):
|
||||
return "pull_request_review" in on
|
||||
if isinstance(on, str):
|
||||
return on == "pull_request_review"
|
||||
return False
|
||||
|
||||
|
||||
def _diagnose_pr(pr_number: int) -> dict:
    """Report whether the head commit of a PR carries the review trigger.

    The PR is looked up through the API and both review workflows are fetched
    at the PR's head SHA, so the result reflects that exact commit.

    Args:
        pr_number: Number of the pull request to inspect.

    Returns:
        A report dict with keys ``pr_number``, ``head_ref``, ``head_sha``,
        ``triggers`` (workflow file name -> bool) and ``auto_fire_possible``
        (True only when every workflow has the trigger).
    """
    code, pr = _api("GET", f"/repos/{REPO}/pulls/{pr_number}")
    if code != 200:
        pytest.fail(f"GET /pulls/{pr_number} returned HTTP {code}: {pr}")

    head_ref = pr["head"]["ref"]
    head_sha = pr["head"]["sha"]

    results: dict[str, bool] = {}
    for wf_name in ("qa-review.yml", "security-review.yml"):
        wf = _fetch_workflow_from_ref(wf_name, head_sha)
        results[wf_name] = _has_pull_request_review_trigger(wf)

    return {
        "pr_number": pr_number,
        "head_ref": head_ref,
        "head_sha": head_sha,
        "triggers": results,
        "auto_fire_possible": all(results.values()),
    }
|
||||
|
||||
|
||||
def _diagnose_local() -> dict:
    """Report whether the workflow files in this checkout carry the review trigger.

    Returns:
        A report dict with the same keys as the PR diagnostic. ``pr_number``
        and ``head_sha`` are None and ``head_ref`` is the literal
        ``"local-checkout"``; ``triggers`` maps workflow file name -> bool and
        ``auto_fire_possible`` is True only when every workflow has the trigger.
    """
    results: dict[str, bool] = {}
    for wf_name in ("qa-review.yml", "security-review.yml"):
        wf = _fetch_workflow_local(wf_name)
        results[wf_name] = _has_pull_request_review_trigger(wf)
    return {
        "pr_number": None,
        "head_ref": "local-checkout",
        "head_sha": None,
        "triggers": results,
        "auto_fire_possible": all(results.values()),
    }
|
||||
|
||||
|
||||
class TestStaleHeadDiagnostic:
    """Fail with an explicit 'auto-fire impossible' report when the
    pull_request_review trigger is absent from the inspected workflows.
    """

    def test_local_checkout_has_pull_request_review_trigger(self):
        """The workflow files in this checkout must declare the trigger.

        This is the baseline check: a stale checkout produces stale PRs.
        """
        report = _diagnose_local()
        stale = [name for name, present in report["triggers"].items() if not present]
        if not stale:
            return
        pytest.fail(
            f"Local checkout is missing pull_request_review trigger in: {stale}. "
            f"This branch cannot produce PRs that auto-fire."
        )

    @pytest.mark.skipif(not GITEA_TOKEN, reason="GITEA_TOKEN not set")
    @pytest.mark.skipif(not PR_NUMBER, reason="PR_NUMBER not set")
    def test_pr_head_has_pull_request_review_trigger(self):
        """With PR_NUMBER set, the PR's head commit must declare the trigger."""
        report = _diagnose_pr(int(PR_NUMBER))
        if report["auto_fire_possible"]:
            return
        stale = [name for name, present in report["triggers"].items() if not present]
        pytest.fail(
            f"Auto-fire impossible for PR #{report['pr_number']}. "
            f"Head ref={report['head_ref']} sha={report['head_sha']}. "
            f"Missing trigger in: {stale}. "
            f"This PR needs /qa-recheck + /security-recheck fallback, or a rebase onto current main."
        )
|
||||
@@ -486,129 +486,3 @@ def test_scoped_rollout_dry_run_does_not_assert_coverage():
|
||||
sleep=lambda _s: None,
|
||||
)
|
||||
assert aggregate["ok"] is True
|
||||
|
||||
|
||||
# --- Superseded-deploy guard (false-stale fix) -----------------------------
|
||||
#
|
||||
# Scenario this fixes: no `concurrency:` on the prod-deploy workflow means two
|
||||
# close main pushes run BOTH deploy-production jobs. eb31bcf (Fix A) and 286338
|
||||
# (Fix C) merge back-to-back; the 286338 job rolls the fleet to staging-2863380
|
||||
# first; the OLDER eb31bcf job's strict verify then sees tenants on 2863380 and
|
||||
# false-reds "stale" though the fleet is AHEAD. superseded_by detects that main's
|
||||
# head is no longer eb31bcf and lets the older job succeed without weakening the
|
||||
# behind-tenant signal for whichever job IS the latest.
|
||||
|
||||
|
||||
def test_superseded_by_returns_newer_head_when_main_moved_ahead(monkeypatch):
    """A job whose SHA is no longer the branch head gets the newer head back."""
    newer_head = "2863380fullhash"
    monkeypatch.setattr(prod, "current_branch_head", lambda _env: newer_head)
    # The eb31bcf job runs while main already points at 2863380 -> superseded.
    assert prod.superseded_by({"GITHUB_SHA": "eb31bcffullhash"}) == newer_head
|
||||
|
||||
|
||||
def test_superseded_by_none_when_this_job_is_still_head(monkeypatch):
    """The job that owns the branch head is not superseded.

    Strict verify therefore still runs for the latest job, so a tenant that is
    genuinely behind keeps failing loudly.
    """
    head = "2863380fullhash"
    monkeypatch.setattr(prod, "current_branch_head", lambda _env: head)
    verdict = prod.superseded_by({"GITHUB_SHA": "2863380fullhash"})
    assert verdict is None
|
||||
|
||||
|
||||
def test_superseded_by_matches_on_short_vs_full_sha_prefix(monkeypatch):
    """A head id that only differs in length or letter case is the same commit.

    GITHUB_SHA is the full SHA while the branch API may report an id of another
    length; such a match must not be reported as superseded, otherwise the real
    latest job would be skipped.
    """
    for reported_head in ("2863380", "2863380FULLHASH"):
        monkeypatch.setattr(prod, "current_branch_head", lambda _env: reported_head)
        verdict = prod.superseded_by({"GITHUB_SHA": "2863380fullhash"})
        assert verdict is None
|
||||
|
||||
|
||||
def test_superseded_by_fail_safe_returns_none_when_head_unreadable(monkeypatch):
    """An unreadable branch head is never treated as 'superseded'.

    Fail-safe: when the head cannot be read (no token / API error) the strict
    verify must still run rather than silently going green.
    """

    def unreadable_head(_env):
        return None

    monkeypatch.setattr(prod, "current_branch_head", unreadable_head)
    verdict = prod.superseded_by({"GITHUB_SHA": "eb31bcffullhash"})
    assert verdict is None
|
||||
|
||||
|
||||
def test_superseded_by_none_without_github_sha(monkeypatch):
    """Without GITHUB_SHA in the environment the check reports not superseded."""
    head = "2863380fullhash"
    monkeypatch.setattr(prod, "current_branch_head", lambda _env: head)
    empty_env = {}
    assert prod.superseded_by(empty_env) is None
|
||||
|
||||
|
||||
def test_current_branch_head_parses_gitea_branch_commit_id(monkeypatch):
    """The head is read from ``commit.id`` of the branch API response for main."""
    seen = {}

    def record_and_reply(url, _token):
        # Remember the requested URL so the endpoint can be asserted below.
        seen["url"] = url
        return 200, {"name": "main", "commit": {"id": "2863380fullhash"}}

    monkeypatch.setattr(prod, "_api_json_optional", record_and_reply)
    env = {"GITEA_TOKEN": "secret", "GITHUB_REPOSITORY": "molecule-ai/molecule-core"}
    head = prod.current_branch_head(env)
    assert head == "2863380fullhash"
    assert seen["url"].endswith("/repos/molecule-ai/molecule-core/branches/main")
|
||||
|
||||
|
||||
def test_current_branch_head_uses_ref_name_branch(monkeypatch):
    """GITHUB_REF_NAME selects the branch, and ``commit.sha`` is accepted as the head."""
    seen = {}

    def record_and_reply(url, _token):
        # Remember the requested URL so the branch segment can be asserted below.
        seen["url"] = url
        return 200, {"commit": {"sha": "deadbeef"}}

    monkeypatch.setattr(prod, "_api_json_optional", record_and_reply)
    env = {"GITEA_TOKEN": "secret", "GITHUB_REF_NAME": "release"}
    head = prod.current_branch_head(env)
    assert head == "deadbeef"
    assert seen["url"].endswith("/branches/release")
|
||||
|
||||
|
||||
def test_current_branch_head_none_without_token():
    """With an empty environment (no token) no head is returned."""
    empty_env = {}
    head = prod.current_branch_head(empty_env)
    assert head is None
|
||||
|
||||
|
||||
def test_current_branch_head_none_on_non_200(monkeypatch):
    """A non-200 reply from the branch API yields no head."""

    def server_error(_u, _t):
        return 500, None

    monkeypatch.setattr(prod, "_api_json_optional", server_error)
    head = prod.current_branch_head({"GITEA_TOKEN": "secret"})
    assert head is None
|
||||
|
||||
|
||||
# --- #2213: superseded check must fire BEFORE production side effects ----------
|
||||
#
|
||||
# Real incident shape: two main pushes land ~2 min apart. The OLDER deploy job
|
||||
# (GITHUB_SHA=7a72516, target staging-7a72516) started LATE — main head was
|
||||
# already 7f25373. The #2194 guard only protected the *verify* step, so the
|
||||
# older job still:
|
||||
# 1. rolled the canary (hongming) BACKWARD to staging-7a72516 (the #2213 red,
|
||||
# seen as the newer job's verify reading hongming on the old SHA), then
|
||||
# 2. promoted :latest backward to the older image,
|
||||
# before finally skipping verify. The workflow now calls this same superseded
|
||||
# check BEFORE the redeploy + promote steps and gates both off when it fires.
|
||||
# These tests pin the contract that check-superseded relies on for the exact
|
||||
# incident shape.
|
||||
|
||||
|
||||
def test_superseded_by_fires_for_older_job_when_newer_already_head(monkeypatch):
    """Incident shape: the older job starts after a newer merge owns main.

    The older job (7a72516) re-checks the head right before rollout, finds the
    newer merge (7f25373) and must be told it is superseded so that the side
    effects are skipped.
    """
    newer_head = "7f25373309eca54a36f08c371ff783c3a47c3f8d"
    older_job_env = {"GITHUB_SHA": "7a72516f7e7ba1a710c4f393fef08be8d22e1866"}
    monkeypatch.setattr(prod, "current_branch_head", lambda _env: newer_head)
    assert prod.superseded_by(older_job_env) == "7f25373309eca54a36f08c371ff783c3a47c3f8d"
|
||||
|
||||
|
||||
def test_superseded_by_none_for_latest_job_so_it_still_rolls(monkeypatch):
    """Incident shape: the newest job is the head and must not be skipped.

    The newer job (7f25373) owns main, so it is not superseded and goes on to
    roll the fleet and verify; a genuinely-behind tenant still fails loudly.
    """
    head = "7f25373309eca54a36f08c371ff783c3a47c3f8d"
    latest_job_env = {"GITHUB_SHA": "7f25373309eca54a36f08c371ff783c3a47c3f8d"}
    monkeypatch.setattr(prod, "current_branch_head", lambda _env: head)
    verdict = prod.superseded_by(latest_job_env)
    assert verdict is None
|
||||
|
||||
@@ -1299,108 +1299,3 @@ class TestGetCIStatus(unittest.TestCase):
|
||||
self.assertEqual(
|
||||
sop.get_ci_status(client, "o", "r", "sha1"), "unknown"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# internal#818 — na-declarations status must be terminal success
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestNaDeclarationsStatusTerminal(unittest.TestCase):
    """Regression for internal#818.

    The na-declarations context is informational, not a merge gate, so it must
    be posted as `success` (never `pending`) even when no N/A declaration
    exists; otherwise it poisons the PR's combined status.
    """

    def _run_with_fake_client(self, fake_client_class):
        """Run main() with GiteaClient replaced and a fake GITEA_TOKEN set.

        Both the client class and the environment variable are restored
        afterwards, whether or not main() raises.
        """
        saved_client = sop.GiteaClient
        saved_token = os.environ.get("GITEA_TOKEN")
        try:
            sop.GiteaClient = fake_client_class
            os.environ["GITEA_TOKEN"] = "fake-token"
            argv = [
                "--owner", "o", "--repo", "r", "--pr", "1",
                "--config", CONFIG_PATH,
                "--gitea-host", "git.example.com",
            ]
            return sop.main(argv)
        finally:
            sop.GiteaClient = saved_client
            if saved_token is not None:
                os.environ["GITEA_TOKEN"] = saved_token
            else:
                os.environ.pop("GITEA_TOKEN", None)

    def _recording_client(self, sink, list_comments, team_id, is_member):
        """Build a GiteaClient subclass with canned replies.

        Every posted status is appended to ``sink``. ``list_comments`` is a
        zero-argument callable so each call hands out fresh comment objects.
        """

        class FakeClient(sop.GiteaClient):
            def get_pr(self, owner, repo, pr):
                return {
                    "state": "open",
                    "user": {"login": "alice"},
                    "head": {"sha": "abc123"},
                    "labels": [],
                }

            def get_issue_comments(self, owner, repo, issue, max_comments=None):
                return list_comments()

            def resolve_team_id(self, org, team_name):
                return team_id

            def is_team_member(self, team_id, login):
                return is_member

            def post_status(self, owner, repo, sha, state, context,
                            description, target_url=""):
                sink.append({
                    "state": state,
                    "context": context,
                    "description": description,
                })

        return FakeClient

    def test_empty_na_descriptions_posts_success(self):
        """No N/A comments at all still yields one `success` na-declarations post."""
        posted = []
        client = self._recording_client(
            posted,
            list_comments=lambda: [],
            team_id=None,
            is_member=False,
        )

        rc = self._run_with_fake_client(client)
        self.assertEqual(rc, 0)
        na_posts = [entry for entry in posted if "na-declarations" in entry["context"]]
        self.assertEqual(len(na_posts), 1, f"expected one na-declarations post, got {posted}")
        self.assertEqual(na_posts[0]["state"], "success")
        self.assertEqual(na_posts[0]["description"], "N/A: (none)")

    def test_populated_na_descriptions_posts_success(self):
        """An N/A declaration by a team member is posted as `success` and names the gate."""
        posted = []
        client = self._recording_client(
            posted,
            list_comments=lambda: [
                {"user": {"login": "bob"}, "body": "/sop-n/a qa-review N/A: docs-only"},
            ],
            team_id=1,
            is_member=True,
        )

        rc = self._run_with_fake_client(client)
        self.assertEqual(rc, 0)
        na_posts = [entry for entry in posted if "na-declarations" in entry["context"]]
        self.assertEqual(len(na_posts), 1)
        self.assertEqual(na_posts[0]["state"], "success")
        self.assertIn("qa-review", na_posts[0]["description"])
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# Regression test for internal#816 — sop-tier-check must ignore APPROVED
|
||||
# reviews that were submitted against an old PR head SHA.
|
||||
#
|
||||
# Bug: the script collected approvers with
|
||||
# jq '[.[] | select(.state=="APPROVED") | .user.login]'
|
||||
# without filtering on .commit_id == HEAD_SHA. After a PR head moved,
|
||||
# stale approvals looked valid to the tier gate.
|
||||
#
|
||||
# Fix: the jq filter now includes
|
||||
# select(.state=="APPROVED" and .commit_id == $head_sha)
|
||||
# where $head_sha is the current PR head fetched from the API.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# jq may not be on PATH in all environments (e.g. dev containers).
|
||||
PATH="/tmp/bin:$PATH"
|
||||
command -v jq >/dev/null 2>&1 || { echo "::error::jq required but not found"; exit 1; }
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
# assert_eq LABEL EXPECTED GOT
# Compare two strings, print a PASS/FAIL line for LABEL and bump the matching
# global counter (PASS or FAIL). On mismatch both values are printed.
assert_eq() {
  local label="$1" expected="$2" got="$3"
  if [ "$expected" != "$got" ]; then
    echo " FAIL $label"
    echo " expected: <$expected>"
    echo " got: <$got>"
    FAIL=$((FAIL + 1))
  else
    echo " PASS $label"
    PASS=$((PASS + 1))
  fi
}
|
||||
|
||||
# Sample reviews matching the shape from Gitea API
|
||||
REVIEWS_JSON='[
|
||||
{"state":"APPROVED","commit_id":"abc123","user":{"login":"bob"}},
|
||||
{"state":"APPROVED","commit_id":"old456","user":{"login":"alice"}},
|
||||
{"state":"COMMENT","commit_id":"abc123","user":{"login":"carol"}},
|
||||
{"state":"APPROVED","commit_id":"abc123","user":{"login":"dave"}},
|
||||
{"state":"REQUEST_CHANGES","commit_id":"abc123","user":{"login":"eve"}}
|
||||
]'
|
||||
|
||||
echo "test: jq filter keeps only APPROVED on current head"
|
||||
GOT=$(echo "$REVIEWS_JSON" | jq -r --arg head_sha "abc123" \
|
||||
'[.[] | select(.state=="APPROVED" and .commit_id == $head_sha) | .user.login] | unique | .[]')
|
||||
assert_eq "current-head approvers" "bob dave" "$(echo "$GOT" | tr '\n' ' ' | sed 's/ $//')"
|
||||
|
||||
echo "test: jq filter with all-stale reviews yields empty"
|
||||
GOT=$(echo "$REVIEWS_JSON" | jq -r --arg head_sha "new789" \
|
||||
'[.[] | select(.state=="APPROVED" and .commit_id == $head_sha) | .user.login] | unique | .[]')
|
||||
assert_eq "all-stale yields empty" "" "$GOT"
|
||||
|
||||
echo "test: jq filter handles null commit_id gracefully"
|
||||
NULL_JSON='[{"state":"APPROVED","commit_id":null,"user":{"login":"mallory"}}]'
|
||||
GOT=$(echo "$NULL_JSON" | jq -r --arg head_sha "abc123" \
|
||||
'[.[] | select(.state=="APPROVED" and .commit_id == $head_sha) | .user.login] | unique | .[]')
|
||||
assert_eq "null commit_id excluded" "" "$GOT"
|
||||
|
||||
echo
|
||||
echo "------"
|
||||
echo "PASS=$PASS FAIL=$FAIL"
|
||||
[ "$FAIL" -eq 0 ]
|
||||
@@ -96,7 +96,6 @@ env:
|
||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
||||
|
||||
jobs:
|
||||
# bp-exempt: advisory arm64 pilot, non-gating by design (internal#418).
|
||||
fast-checks:
|
||||
name: fast-checks
|
||||
# AND-set: only the Mac arm64 runner advertises macos-self-hosted.
|
||||
|
||||
+38
-56
@@ -25,9 +25,10 @@
|
||||
# sufficient for `actions/checkout` against this same repo.
|
||||
#
|
||||
# 4. Docs — no docs/scripts reference github.com URLs that need swapping.
|
||||
# The canvas-deploy-status step (core#2226, formerly canvas-deploy-reminder)
|
||||
# writes the canvas ordered-deploy status into the step summary; it points
|
||||
# at the ECR canvas image and the publish workflow, no ghcr.io prose.
|
||||
# The canvas-deploy-reminder step writes a `ghcr.io/...` image
|
||||
# reference into the step summary text — that's documentation prose
|
||||
# pointing at the ECR-mirrored canvas image and stays unchanged for
|
||||
# this port (a separate cleanup if ghcr→ECR sweep is in scope).
|
||||
#
|
||||
# Cross-links:
|
||||
# - RFC: internal#219 (CI/CD hard-gate hardening)
|
||||
@@ -364,25 +365,6 @@ jobs:
|
||||
# check missed. If a refactor weakens the gate to a shape check,
|
||||
# this step goes red on every PR.
|
||||
bash tests/e2e/test_completion_assert_unit.sh
|
||||
# harden/e2e-staging-saas-failclosed: fail-direction proof for the
|
||||
# E2E_REQUIRE_LIVE fail-closed-on-skip guard in
|
||||
# test_staging_full_saas.sh. Offline (no LLM/network/provisioning):
|
||||
# asserts the guard exits 5 when a live lifecycle did NOT run and
|
||||
# passes when all milestones fired — so a refactor that lets the
|
||||
# staging gate report green without a real provision→online→A2A
|
||||
# cycle goes red on every PR.
|
||||
bash tests/e2e/test_require_live_guard_unit.sh
|
||||
# harden/enforce-ci-gates-core-v2 (PR #2286): fail-direction proof
|
||||
# for the E2E_REQUIRE_LIVE zero-validated gate in
|
||||
# test_priority_runtimes_e2e.sh (the REQUIRED `E2E API Smoke Test`).
|
||||
# Offline (no LLM/network/provisioning): sources that script under
|
||||
# its unit source-guard and drives the REAL evaluate_require_live_gate
|
||||
# — asserts REQUIRE_LIVE=1 + zero validated → RED (the false-green
|
||||
# trap), REQUIRE_LIVE=1 + >=1 validated → GREEN, and REQUIRE_LIVE
|
||||
# unset + zero validated → GREEN (loud skip). CI can't provision a
|
||||
# live arm to prove this, so this unit test IS the regression gate:
|
||||
# a revert of the zero-validated→RED logic goes red on every PR.
|
||||
bash tests/e2e/test_require_live_priority_gate_unit.sh
|
||||
|
||||
- if: ${{ needs.changes.outputs.scripts == 'true' }}
|
||||
name: Test ECR promote-tenant-image script (mock-driven, no live infra)
|
||||
@@ -407,61 +389,61 @@ jobs:
|
||||
|
||||
# mc#959 root-fix (sre)
|
||||
|
||||
canvas-deploy-status:
|
||||
# core#2226: replaces the old advisory "Canvas Deploy Reminder". The canvas
|
||||
# image now has a real ORDERED auto-deploy (publish-canvas-image.yml:
|
||||
# build → push :staging-<sha> → wait green main CI → promote :latest by
|
||||
# digest), and docker-compose pins via CANVAS_IMAGE_TAG. There is no longer
|
||||
# a manual "go run docker compose pull by hand" step to remind operators
|
||||
# about — so this job just records, on a canvas-touching main push, that the
|
||||
# ordered deploy is handling it (and where to watch), instead of prescribing
|
||||
# a manual action that determinism made obsolete.
|
||||
name: Canvas Deploy Status
|
||||
canvas-deploy-reminder:
|
||||
name: Canvas Deploy Reminder
|
||||
runs-on: docker-host
|
||||
# Job-level `if:` so ci-required-drift.py's ci_job_names() detects this as
|
||||
# github.ref-gated and skips it from the required-context F1 set (mc#1982).
|
||||
# Step-level exit 0 handles the "not a canvas main push" case.
|
||||
# mc#1982 root-fix: added job-level `if:` so ci-required-drift.py's
|
||||
# ci_job_names() detects this as github.ref-gated and skips it from F1.
|
||||
# The step-level exit 0 handles the "not main push" case; the job-level
|
||||
# `if:` makes the gating explicit so the drift script sees it.
|
||||
# Runs on both main and staging pushes; step exits 0 when not applicable.
|
||||
if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/staging' }}
|
||||
needs: [changes, canvas-build]
|
||||
steps:
|
||||
- name: Record canvas ordered-deploy status
|
||||
- name: Write deploy reminder to step summary
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
CANVAS_CHANGED: ${{ needs.changes.outputs.canvas }}
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
REF_NAME: ${{ github.ref }}
|
||||
# github.server_url resolves via the workflow-level env override to the
|
||||
# Gitea instance, so RUN_URL points at the Gitea run page (not github.com).
|
||||
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions
|
||||
# github.server_url resolves via the workflow-level env override
|
||||
# to the Gitea instance, so the RUN_URL points at the Gitea run
|
||||
# page (not github.com). See feedback_act_runner_github_server_url.
|
||||
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [ "$CANVAS_CHANGED" != "true" ] || [ "$EVENT_NAME" != "push" ] || [ "$REF_NAME" != "refs/heads/main" ]; then
|
||||
echo "Canvas deploy status not applicable for event=$EVENT_NAME ref=$REF_NAME canvas_changed=$CANVAS_CHANGED."
|
||||
echo "Canvas deploy reminder not applicable for event=$EVENT_NAME ref=$REF_NAME canvas_changed=$CANVAS_CHANGED."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Write body to a temp file — avoids backtick escaping in shell.
|
||||
cat > /tmp/deploy-status.md << 'BODY'
|
||||
## Canvas ordered deploy in progress — no manual action required
|
||||
cat > /tmp/deploy-reminder.md << 'BODY'
|
||||
## Canvas build passed — deploy required
|
||||
|
||||
This canvas-touching main push triggers `publish-canvas-image`, which now
|
||||
runs an ORDERED, CI-gated deploy (core#2226) — the same shape as the
|
||||
platform's deploy-production:
|
||||
The `publish-canvas-image` workflow is now building a fresh Docker image
|
||||
(`ghcr.io/molecule-ai/canvas:latest`) in the background.
|
||||
|
||||
1. Build → push `molecule-ai/canvas:staging-<sha>` + `:staging-latest`.
|
||||
2. Wait for green main CI on this SHA.
|
||||
3. Promote `:latest` to the verified `:staging-<sha>` by digest.
|
||||
Once it completes (~3–5 min), apply on the host machine with:
|
||||
```bash
|
||||
cd <runner-workspace>
|
||||
git pull origin main
|
||||
docker compose pull canvas && docker compose up -d canvas
|
||||
```
|
||||
|
||||
Tenants/hosts pin via `CANVAS_IMAGE_TAG` (default `latest` = the last
|
||||
CI-green build), so a deploy is reproducible — no hand-run
|
||||
`docker compose pull` needed. Watch the run in the canvas publish workflow.
|
||||
If you need to rebuild from local source instead (e.g. testing unreleased
|
||||
changes or a new `NEXT_PUBLIC_*` URL), use:
|
||||
```bash
|
||||
docker compose build canvas && docker compose up -d canvas
|
||||
```
|
||||
BODY
|
||||
printf '\n> Posted automatically by CI · commit `%s` · [publish workflow](%s)\n' \
|
||||
"$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-status.md
|
||||
printf '\n> Posted automatically by CI · commit `%s` · [build log](%s)\n' \
|
||||
"$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-reminder.md
|
||||
|
||||
# Gitea has no commit-comments API; write to GITHUB_STEP_SUMMARY, which
|
||||
# both GitHub and Gitea Actions render as the run's summary page.
|
||||
cat /tmp/deploy-status.md >> "$GITHUB_STEP_SUMMARY"
|
||||
# Gitea has no commit-comments API; write to GITHUB_STEP_SUMMARY,
|
||||
# which both GitHub Actions and Gitea Actions render as the
|
||||
# workflow run's summary page. (#75 / PR-D)
|
||||
cat /tmp/deploy-reminder.md >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
# Python Lint & Test — required check, always runs.
|
||||
# Runtime Python moved to molecule-ai-workspace-runtime. Keep this context as
|
||||
|
||||
+12
-110
@@ -123,9 +123,8 @@ jobs:
|
||||
# integration). See internal#512 for the class defect.
|
||||
runs-on: docker-host
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: mask removed. If regressions appear, root-fix the underlying
|
||||
# test — do NOT renew the mask silently.
|
||||
continue-on-error: false
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
outputs:
|
||||
api: ${{ steps.decide.outputs.api }}
|
||||
steps:
|
||||
@@ -161,9 +160,8 @@ jobs:
|
||||
# detect-changes for the full rationale.
|
||||
runs-on: docker-host
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: mask removed. If regressions appear, root-fix the underlying
|
||||
# test — do NOT renew the mask silently.
|
||||
continue-on-error: false
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
# Unique per-run container names so concurrent runs on the host-
|
||||
@@ -272,24 +270,6 @@ jobs:
|
||||
echo "::error::Redis did not become ready in 15s"
|
||||
docker logs "$REDIS_CONTAINER" || true
|
||||
exit 1
|
||||
- name: Set deterministic admin token for the e2e platform
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
# AdminAuth (workspace-server/internal/middleware/wsauth_middleware.go:164)
|
||||
# reads ADMIN_TOKEN. Setting it (a) closes isDevModeFailOpen (devmode.go:50
|
||||
# returns false when ADMIN_TOKEN is non-empty), so admin routes require a
|
||||
# bearer, and (b) makes Tier-2b accept a bearer that constant-time-equals
|
||||
# ADMIN_TOKEN. The platform process inherits ADMIN_TOKEN from $GITHUB_ENV.
|
||||
#
|
||||
# MOLECULE_ADMIN_TOKEN is the var the e2e scripts send as the bearer
|
||||
# (tests/e2e/_lib.sh:33 e2e_mint_workspace_token, and the run_mock
|
||||
# org-import curl). Set BOTH to the SAME value so the bearer the test
|
||||
# sends == the secret the platform checks. Deterministic test value;
|
||||
# this platform is ephemeral, single-run, and never reachable off-host.
|
||||
E2E_ADMIN_TOKEN="e2e-api-admin-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
echo "ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
|
||||
echo "MOLECULE_ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
|
||||
echo "Admin token configured for the e2e platform (ADMIN_TOKEN + MOLECULE_ADMIN_TOKEN)."
|
||||
- name: Build platform
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
working-directory: workspace-server
|
||||
@@ -345,57 +325,19 @@ jobs:
|
||||
# start-redis steps point at this run's per-run host ports.
|
||||
./platform-server > platform.log 2>&1 &
|
||||
echo $! > platform.pid
|
||||
- name: Wait for /health (with migration completion gate)
|
||||
# Issue #2205: 30 one-second probes is insufficient when the migration
|
||||
# chain is still running; /health can flip true before migrations
|
||||
# finish, so subsequent steps that touch the DB fail. Hybrid fix:
|
||||
# bump timeout to 300s AND gate exit on the same workspaces-table
|
||||
# existence check the downstream "Assert migrations applied" uses.
|
||||
- name: Wait for /health
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
# Readiness signal: the platform binds /health only AFTER the full
|
||||
# migration chain has been applied on cold start (it prints
|
||||
# "Platform starting on :PORT" at that point). So a 200 from /health
|
||||
# is the real "migrations done + server listening" signal.
|
||||
#
|
||||
# The migration chain grows every release, so a fixed ~30s budget is
|
||||
# brittle by construction (it WILL be exceeded as migrations accrue).
|
||||
# Use a generous wall-clock budget that comfortably exceeds
|
||||
# cold-start + full-migration time, polling fast. This is robust to a
|
||||
# growing chain WITHOUT masking a genuinely dead platform: if the
|
||||
# background platform-server process has exited (e.g. a broken
|
||||
# migration crashed it), we stop and fail loudly at once instead of
|
||||
# waiting out the whole budget.
|
||||
#
|
||||
# Issue #2205: /health can flip true before migrations finish on a
|
||||
# growing chain, so we gate exit on the workspaces-table existence
|
||||
# check the downstream "Assert migrations applied" uses.
|
||||
DEADLINE_SECS=300 # cold-start + full migration chain headroom
|
||||
PLATFORM_PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"
|
||||
start=$(date +%s)
|
||||
while :; do
|
||||
for i in $(seq 1 30); do
|
||||
if curl -sf "$BASE/health" > /dev/null; then
|
||||
tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
|
||||
"SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'" 2>/dev/null || echo "0")
|
||||
if [ "$tables" = "1" ]; then
|
||||
echo "Platform healthy + migrations applied after $(( $(date +%s) - start ))s"
|
||||
exit 0
|
||||
fi
|
||||
fi
|
||||
# Fast-fail: if the platform process died, /health will never come.
|
||||
if [ -n "$PLATFORM_PID" ] && ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
|
||||
echo "::error::platform-server (pid ${PLATFORM_PID}) exited before /health became reachable — see log below"
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
fi
|
||||
if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
|
||||
echo "::error::Platform did not become healthy with migrations applied within ${DEADLINE_SECS}s — see log below"
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
echo "Platform up after ${i}s"
|
||||
exit 0
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
|
||||
echo "::error::Platform did not become healthy in 30s"
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
- name: Assert migrations applied
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
@@ -412,51 +354,11 @@ jobs:
|
||||
- name: Run E2E API tests
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_api.sh
|
||||
- name: Run keyless feature-contract E2E (terminal-diagnose / webhooks / budget / checkpoints / audit / traces / session-search / rescue / llm-billing-mode / resume / hibernate)
|
||||
# Keyless required-lane coverage for feature endpoints that ship without
|
||||
# an LLM key (runtime=external fixture). Each asserts the real HTTP
|
||||
# contract + a meaningful failure mode (401/400/fail-closed) so a
|
||||
# regression goes RED, not silently green. The mock-runtime A2A canned
|
||||
# round-trip is covered by the priority-runtimes `mock` arm, not here.
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_keyless_feature_contracts_e2e.sh
|
||||
- name: Run secrets-dispatch contract test (keyless SECRETS_JSON branch order)
|
||||
# Previously orphaned (no workflow referenced it). Hermetic unit-style
|
||||
# contract over test_staging_full_saas.sh's LLM-key branch precedence —
|
||||
# needs no platform, no bearer, no network. Guards the 2026-05-03
|
||||
# "wrong key shape wins" incident class.
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_secrets_dispatch.sh
|
||||
- name: Run notify-with-attachments E2E
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_notify_attachments_e2e.sh
|
||||
- name: "Run priority-runtimes E2E (REQUIRE-LIVE: mock validates the runtime plumbing end-to-end)"
|
||||
# E2E_REQUIRE_LIVE=1 is ON: the run MUST validate >=1 runtime end-to-end
|
||||
# or it exits NON-zero (RED). This is now SAFE because the `mock` arm can
|
||||
# actually provision in CI: the only blocker was that POST /org/import and
|
||||
# POST /admin/workspaces/:id/tokens are AdminAuth-gated
|
||||
# (router.go:778 + :427) and this job previously configured NO admin token,
|
||||
# so every admin call 401'd ("admin auth required"). The "Set deterministic
|
||||
# admin token" step above now sets ADMIN_TOKEN on the platform AND exports
|
||||
# the matching MOLECULE_ADMIN_TOKEN the e2e scripts send as the bearer, so
|
||||
# the mock arm can org-import → online → mint token → canned A2A reply →
|
||||
# validated(). That guarantees VALIDATED>=1 on a healthy platform, so the
|
||||
# REQUIRED `E2E API Smoke Test` gate now HONESTLY validates a runtime
|
||||
# end-to-end; if the mock plumbing (DB insert, status flip, A2A proxy,
|
||||
# activity logging, or the admin-auth wiring) genuinely breaks, the gate
|
||||
# goes RED instead of false-green. The zero-validated→RED decision is also
|
||||
# regression-gated WITHOUT provisioning by the bash unit test
|
||||
# tests/e2e/test_require_live_priority_gate_unit.sh (wired into ci.yml's
|
||||
# "Run E2E bash unit tests" job), so a revert of that logic still fails CI.
|
||||
#
|
||||
# MiniMax stays an OPPORTUNISTIC best-effort arm: create is registry-fragile
|
||||
# in CI (422 UNREGISTERED_MODEL_FOR_RUNTIME), so a miss is reported via
|
||||
# bestfail() and never reds the gate — mock carries the required validation,
|
||||
# MiniMax is a bonus real-LLM check when it comes up. ZERO new credentials.
|
||||
- name: Run priority-runtimes E2E (claude-code + hermes — skips when keys absent)
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
env:
|
||||
E2E_REQUIRE_LIVE: '1'
|
||||
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
|
||||
run: bash tests/e2e/test_priority_runtimes_e2e.sh
|
||||
- name: Install standalone runtime parser from Gitea registry
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
|
||||
+14
-113
@@ -113,28 +113,6 @@ jobs:
|
||||
runs-on: docker-host
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
#
|
||||
# PROMOTION-READINESS (toward required gate — do NOT flip continue-on-error
|
||||
# without CTO sign-off, that's the irreversible call):
|
||||
# NOW FAIL-CLOSED:
|
||||
# - Postgres/Redis/platform/canvas readiness are already bounded
|
||||
# readiness-polls that hard-fail (and dump logs) at their deadline,
|
||||
# not fixed sleeps — preserved.
|
||||
# - passWithNoTests:false + forbidOnly (playwright.config.ts) → a
|
||||
# renamed/moved spec or stray test.only can no longer green the lane.
|
||||
# - REQUIRE-LIVE guard in "Run Playwright E2E tests" → chat==true must
|
||||
# actually execute >=1 test, else exit 1.
|
||||
# - chat-desktop "activity log" test no longer swallows its assertion.
|
||||
# STILL BLOCKS PROMOTION:
|
||||
# - The echo round-trip asserts on rendered "Echo: ..." text but never
|
||||
# asserts the echo runtime actually RECEIVED the A2A request
|
||||
# (fixtures/echo-runtime.ts exposes lastRequest, unused) — an
|
||||
# optimistic client-side render could pass without a real round-trip.
|
||||
# Add a server-received assertion before required.
|
||||
# - The "No-op pass" path (detect-changes chat!=true) is a legitimate
|
||||
# paths-filter skip, but a required gate needs it to be a neutral
|
||||
# check, not a green "success", so a skipped heavy lane can't be
|
||||
# mistaken for a passed one.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
@@ -249,20 +227,6 @@ jobs:
|
||||
echo "CANVAS_PORT=${CANVAS_PORT}" >> "$GITHUB_ENV"
|
||||
echo "Canvas host port: ${CANVAS_PORT}"
|
||||
|
||||
- name: Set deterministic admin token
|
||||
if: needs.detect-changes.outputs.chat == 'true'
|
||||
run: |
|
||||
# PR #2291 made auth fail-closed everywhere (no dev-mode escape).
|
||||
# The platform server requires ADMIN_TOKEN; the canvas requires the
|
||||
# matching NEXT_PUBLIC_ADMIN_TOKEN or every API call 401s.
|
||||
# We set a deterministic per-run value so the ephemeral platform
|
||||
# and canvas are paired correctly.
|
||||
E2E_ADMIN_TOKEN="e2e-chat-admin-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
echo "ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
|
||||
echo "MOLECULE_ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
|
||||
echo "NEXT_PUBLIC_ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
|
||||
echo "Admin token configured for e2e-chat platform + canvas."
|
||||
|
||||
- name: Start platform (background)
|
||||
if: needs.detect-changes.outputs.chat == 'true'
|
||||
working-directory: workspace-server
|
||||
@@ -278,36 +242,16 @@ jobs:
|
||||
- name: Wait for /health
|
||||
if: needs.detect-changes.outputs.chat == 'true'
|
||||
run: |
|
||||
# Readiness signal: the platform binds /health only AFTER the full
|
||||
# migration chain has been applied on cold start (it prints
|
||||
# "Platform starting on :PORT" at that point). So a 200 from /health
|
||||
# is the real "migrations done + server listening" signal.
|
||||
#
|
||||
# The migration chain grows every release, so a fixed ~30s budget is
|
||||
# brittle by construction. Use a generous wall-clock budget that
|
||||
# comfortably exceeds cold-start + full-migration time, polling fast.
|
||||
# Robust to a growing chain WITHOUT masking a dead platform: if the
|
||||
# background platform-server process has exited, fail loudly at once.
|
||||
DEADLINE_SECS=180 # cold-start + full migration chain headroom
|
||||
PLATFORM_PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"
|
||||
start=$(date +%s)
|
||||
while :; do
|
||||
for i in $(seq 1 30); do
|
||||
if curl -sf "http://127.0.0.1:${PLATFORM_PORT}/health" > /dev/null; then
|
||||
echo "Platform healthy after $(( $(date +%s) - start ))s"
|
||||
echo "Platform up after ${i}s"
|
||||
exit 0
|
||||
fi
|
||||
if [ -n "$PLATFORM_PID" ] && ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
|
||||
echo "::error::platform-server (pid ${PLATFORM_PID}) exited before /health became reachable — see log below"
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
fi
|
||||
if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
|
||||
echo "::error::Platform did not become healthy within ${DEADLINE_SECS}s — see log below"
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "::error::Platform did not become healthy in 30s"
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
|
||||
- name: Install canvas dependencies
|
||||
if: needs.detect-changes.outputs.chat == 'true'
|
||||
@@ -334,68 +278,25 @@ jobs:
|
||||
export NEXT_PUBLIC_WS_URL="ws://127.0.0.1:${PLATFORM_PORT}/ws"
|
||||
npx next dev --turbopack -p "${CANVAS_PORT}" > canvas.log 2>&1 &
|
||||
echo $! > canvas.pid
|
||||
# Readiness must wait for the actual chat route to *compile*, not
|
||||
# just for the dev server to bind the port. `next dev --turbopack`
|
||||
# accepts the TCP connection well before it has compiled a route
|
||||
# on first request, so a bare `curl /` can 200 (or hang) while the
|
||||
# page the tests load is still building. We therefore probe the
|
||||
# real route the specs navigate to (`/?m=chat`) and require a 2xx,
|
||||
# which only happens once Turbopack has finished the first
|
||||
# compile. The previous 30s budget was also too tight for a cold
|
||||
# Turbopack first-compile on a loaded operator-host runner — the
|
||||
# `Canvas did not start in 30s` flake. Raise to 120s (job
|
||||
# timeout-minutes is 15, so this is comfortably bounded) and probe
|
||||
# every 2s.
|
||||
READY=""
|
||||
for i in $(seq 1 60); do
|
||||
# Tempfile-routed -w + set +e/-e prevents curl-exit-code
|
||||
# pollution of the captured status (lint-curl-status-capture.yml).
|
||||
set +e
|
||||
curl -s -o /dev/null -w '%{http_code}' "http://localhost:${CANVAS_PORT}/?m=chat" > /tmp/canvas-ready.code
|
||||
set -e
|
||||
CODE=$(cat /tmp/canvas-ready.code 2>/dev/null || echo "000")
|
||||
if [ "$CODE" -ge 200 ] && [ "$CODE" -lt 400 ]; then
|
||||
echo "Canvas (chat route compiled) up after ~$((i*2))s (HTTP ${CODE})"
|
||||
READY=1
|
||||
break
|
||||
for i in $(seq 1 30); do
|
||||
if curl -sf "http://localhost:${CANVAS_PORT}" > /dev/null 2>&1; then
|
||||
echo "Canvas up after ${i}s"
|
||||
exit 0
|
||||
fi
|
||||
sleep 2
|
||||
sleep 1
|
||||
done
|
||||
if [ -z "$READY" ]; then
|
||||
echo "::error::Canvas chat route did not compile in 120s (last HTTP ${CODE})"
|
||||
cat canvas.log || true
|
||||
exit 1
|
||||
fi
|
||||
echo "::error::Canvas did not start in 30s"
|
||||
cat canvas.log || true
|
||||
exit 1
|
||||
|
||||
- name: Run Playwright E2E tests
|
||||
if: needs.detect-changes.outputs.chat == 'true'
|
||||
working-directory: canvas
|
||||
env:
|
||||
# CI=1 activates forbidOnly in playwright.config.ts (a stray
|
||||
# `test.only` would otherwise green the suite while skipping the
|
||||
# rest). passWithNoTests:false (also in the config) already makes
|
||||
# a zero-match selection exit non-zero.
|
||||
CI: "1"
|
||||
run: |
|
||||
set -euo pipefail
|
||||
export E2E_PLATFORM_URL="http://127.0.0.1:${PLATFORM_PORT}"
|
||||
export E2E_DATABASE_URL="${DATABASE_URL}"
|
||||
export PLAYWRIGHT_BASE_URL="http://localhost:${CANVAS_PORT}"
|
||||
|
||||
# REQUIRE-LIVE guard (mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE):
|
||||
# this lane reached here only because detect-changes said chat==true,
|
||||
# so it MUST actually execute the round-trip specs. `pipefail` makes
|
||||
# a real test failure (playwright non-zero) abort here under `set -e`;
|
||||
# passWithNoTests:false makes a zero-match selection non-zero too. The
|
||||
# explicit grep below is belt-and-braces: assert the list reporter
|
||||
# printed an executed-count summary, so a silent all-skip / no-op can
|
||||
# never report green.
|
||||
npx playwright test e2e/chat-desktop.spec.ts e2e/chat-mobile.spec.ts \
|
||||
--reporter=list 2>&1 | tee /tmp/pw-chat.out
|
||||
if ! grep -qE '[0-9]+ (passed|failed|skipped)' /tmp/pw-chat.out; then
|
||||
echo "::error::E2E Chat REQUIRE-LIVE: chat==true but Playwright reported no executed tests — specs missing or all-skipped, refusing to report green."
|
||||
exit 1
|
||||
fi
|
||||
npx playwright test e2e/chat-desktop.spec.ts e2e/chat-mobile.spec.ts
|
||||
|
||||
- name: Dump platform log on failure
|
||||
if: failure() && needs.detect-changes.outputs.chat == 'true'
|
||||
|
||||
@@ -130,37 +130,13 @@ jobs:
|
||||
run: |
|
||||
set -euo pipefail
|
||||
./workspace-server/platform-server > workspace-server/platform.log 2>&1 &
|
||||
PLATFORM_PID=$!
|
||||
echo "$PLATFORM_PID" > workspace-server/platform.pid
|
||||
# Readiness signal: the platform binds /health only AFTER the full
|
||||
# migration chain has been applied on cold start (it prints
|
||||
# "Platform starting on :PORT" at that point). So a 200 from /health
|
||||
# is the real "migrations done + server listening" signal.
|
||||
#
|
||||
# The migration chain grows every release, so a fixed ~30s budget is
|
||||
# brittle by construction. Use a generous wall-clock budget that
|
||||
# comfortably exceeds cold-start + full-migration time, polling fast.
|
||||
# Robust to a growing chain WITHOUT masking a dead platform: if the
|
||||
# background platform-server process has exited, fail loudly at once.
|
||||
DEADLINE_SECS=180 # cold-start + full migration chain headroom
|
||||
start=$(date +%s)
|
||||
while :; do
|
||||
if curl -sf "$BASE/health" >/dev/null; then
|
||||
echo "Platform healthy after $(( $(date +%s) - start ))s"
|
||||
exit 0
|
||||
fi
|
||||
if ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
|
||||
echo "::error::platform-server (pid ${PLATFORM_PID}) exited before /health became reachable — see log below"
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
fi
|
||||
if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
|
||||
echo "::error::Platform did not become healthy within ${DEADLINE_SECS}s — see log below"
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
fi
|
||||
echo $! > workspace-server/platform.pid
|
||||
for i in $(seq 1 30); do
|
||||
curl -sf "$BASE/health" >/dev/null && exit 0
|
||||
sleep 1
|
||||
done
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
|
||||
- name: Run comprehensive E2E
|
||||
run: bash tests/e2e/test_comprehensive_e2e.sh
|
||||
|
||||
@@ -126,7 +126,6 @@ jobs:
|
||||
# push/dispatch/cron only (30+ min). This is NOT a fake-green mask of
|
||||
# the real assertion — it validates the driving script's bash syntax
|
||||
# and inline-python so a broken test script fails at PR time.
|
||||
# bp-required: pending #1296 — PR emitter, not yet required (tracked in #1296).
|
||||
pr-validate:
|
||||
name: E2E Peer Visibility
|
||||
runs-on: ubuntu-latest
|
||||
@@ -268,36 +267,12 @@ jobs:
|
||||
echo $! > platform.pid
|
||||
- name: Wait for /health
|
||||
run: |
|
||||
# Readiness signal: the platform binds /health only AFTER the full
|
||||
# migration chain has been applied on cold start (it prints
|
||||
# "Platform starting on :PORT" at that point). So a 200 from /health
|
||||
# is the real "migrations done + server listening" signal.
|
||||
#
|
||||
# The migration chain grows every release, so a fixed ~30s budget is
|
||||
# brittle by construction. Use a generous wall-clock budget that
|
||||
# comfortably exceeds cold-start + full-migration time, polling fast.
|
||||
# Robust to a growing chain WITHOUT masking a dead platform: if the
|
||||
# background platform-server process has exited, fail loudly at once.
|
||||
DEADLINE_SECS=180 # cold-start + full migration chain headroom
|
||||
PLATFORM_PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"
|
||||
start=$(date +%s)
|
||||
while :; do
|
||||
if curl -sf "$BASE/health" > /dev/null; then
|
||||
echo "Platform healthy after $(( $(date +%s) - start ))s"
|
||||
exit 0
|
||||
fi
|
||||
if [ -n "$PLATFORM_PID" ] && ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
|
||||
echo "::error::platform-server (pid ${PLATFORM_PID}) exited before /health became reachable — see log below"
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
fi
|
||||
if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
|
||||
echo "::error::Platform did not become healthy within ${DEADLINE_SECS}s — see log below"
|
||||
cat workspace-server/platform.log || true
|
||||
exit 1
|
||||
fi
|
||||
for i in $(seq 1 30); do
|
||||
curl -sf "$BASE/health" > /dev/null && { echo "Platform up after ${i}s"; exit 0; }
|
||||
sleep 1
|
||||
done
|
||||
echo "::error::Platform did not become healthy in 30s"
|
||||
cat workspace-server/platform.log || true; exit 1
|
||||
- name: Run LOCAL fresh-provision peer-visibility E2E (literal MCP list_peers)
|
||||
# HONEST gate — NO continue-on-error. The local backend uses
|
||||
# external-mode workspaces so this context tests the literal MCP
|
||||
|
||||
@@ -12,30 +12,9 @@ name: E2E Staging Canvas (Playwright)
|
||||
#
|
||||
|
||||
# Playwright test suite that provisions a fresh staging org per run and
|
||||
# verifies every workspace-panel tab renders REAL content (not just an
|
||||
# empty/errored container). Complements e2e-staging-saas.yml (which tests
|
||||
# the API shape) by exercising the actual browser + canvas bundle against
|
||||
# live staging.
|
||||
#
|
||||
# PROMOTION-READINESS (toward making this a HARD merge-gate):
|
||||
# NOW RELIABLE (spec hardened — staging-tabs.spec.ts):
|
||||
# - All waits condition-based (toBeVisible/toHaveAttribute/expect.poll);
|
||||
# no fixed waitForTimeout in the spec.
|
||||
# - Tabs asserted on settled REAL content, not "container visible".
|
||||
# - ErrorBoundary + visible error alerts fail non-degraded tabs.
|
||||
# - Tab-list parity-checked vs live DOM; fail-closed on missing tenant.
|
||||
# STILL BLOCKS PROMOTION-TO-REQUIRED (do NOT remove continue-on-error —
|
||||
# CTO-owned, RFC internal#219 §1):
|
||||
# - Infra dependency: real staging EC2 per run (12-20 min cold boot);
|
||||
# AWS/Cloudflare/CP availability would become merge-blockers.
|
||||
# - Shared-zone TLS/DNS/ACME propagation flake surface is upstream of
|
||||
# this repo and outside its control.
|
||||
# - Required-gate correctness needs CP_STAGING_ADMIN_API_TOKEN GUARANTEED
|
||||
# present; today's skip-if-absent (core#2225) is right for non-gating
|
||||
# but would skip-green a required check.
|
||||
# - Single hermes/platform_managed workspace; agent-dependent content
|
||||
# (live chat/traces round-trip) not exercised on staging (#2162).
|
||||
# The full checklist lives at the foot of canvas/e2e/staging-tabs.spec.ts.
|
||||
# verifies every workspace-panel tab renders without crashing. Complements
|
||||
# e2e-staging-saas.yml (which tests the API shape) by exercising the
|
||||
# actual browser + canvas bundle against live staging.
|
||||
#
|
||||
# Triggers: push to main, PR touching canvas sources + this workflow only
|
||||
# after the PR enters `merge-queue`, manual dispatch, and scheduled cron to
|
||||
@@ -188,30 +167,16 @@ jobs:
|
||||
- if: needs.detect-changes.outputs.canvas == 'true'
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
# Skip-if-absent (core#2225), mirroring the serving-e2e gate's
|
||||
# skip-if-secret-unset contract: a MISSING CI secret is an operator
|
||||
# CONFIG gap, not a code regression, so it must not paint this E2E
|
||||
# red. When CP_STAGING_ADMIN_API_TOKEN is unset we emit a LOUD
|
||||
# ::warning:: + ::notice:: and skip the real provision/test steps (the
|
||||
# job still completes green). When the secret IS present we run the
|
||||
# full suite exactly as before. Operators: set
|
||||
# CP_STAGING_ADMIN_API_TOKEN as a repo/org Actions secret on
|
||||
# molecule-core to actually exercise this E2E.
|
||||
- name: Check admin token (skip-if-absent)
|
||||
id: token_check
|
||||
- name: Verify admin token present
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
run: |
|
||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
||||
echo "::warning::CP_STAGING_ADMIN_API_TOKEN is not set on this runner — SKIPPING the staging canvas E2E (cannot auth to staging CP). This is an operator config gap, not a code failure; set the secret on molecule-core (repo or org Actions secrets) to run it. See core#2225."
|
||||
echo "::notice::E2E Staging Canvas skipped: CP_STAGING_ADMIN_API_TOKEN absent."
|
||||
echo "present=false" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "CP_STAGING_ADMIN_API_TOKEN present ✓ — running staging canvas E2E."
|
||||
echo "present=true" >> "$GITHUB_OUTPUT"
|
||||
echo "::error::Missing CP_STAGING_ADMIN_API_TOKEN"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
- name: Set up Node
|
||||
if: needs.detect-changes.outputs.canvas == 'true' && steps.token_check.outputs.present == 'true'
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
|
||||
with:
|
||||
node-version: '20'
|
||||
@@ -219,11 +184,11 @@ jobs:
|
||||
cache-dependency-path: canvas/package-lock.json
|
||||
|
||||
- name: Install canvas deps
|
||||
if: needs.detect-changes.outputs.canvas == 'true' && steps.token_check.outputs.present == 'true'
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
run: npm ci
|
||||
|
||||
- name: Install Playwright browsers
|
||||
if: needs.detect-changes.outputs.canvas == 'true' && steps.token_check.outputs.present == 'true'
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
timeout-minutes: 10
|
||||
run: |
|
||||
PREBAKED_PLAYWRIGHT=/ms-playwright
|
||||
@@ -235,7 +200,7 @@ jobs:
|
||||
npx playwright install --with-deps chromium
|
||||
|
||||
- name: Run staging canvas E2E
|
||||
if: needs.detect-changes.outputs.canvas == 'true' && steps.token_check.outputs.present == 'true'
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
run: npx playwright test --config=playwright.staging.config.ts
|
||||
|
||||
- name: Upload Playwright report on failure
|
||||
|
||||
@@ -85,25 +85,6 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
#
|
||||
# PROMOTION-READINESS (toward required gate — do NOT flip continue-on-error
|
||||
# without CTO sign-off, that's the irreversible call):
|
||||
# NOW FAIL-CLOSED:
|
||||
# - Missing CP_STAGING_ADMIN_API_TOKEN → hard exit 2 (preflight).
|
||||
# - Staging CP unhealthy → hard exit 1 (preflight, not a workspace bug).
|
||||
# - Harness E2E_REQUIRE_LIVE=1 → exit 5 if a clean exit didn't prove
|
||||
# all four awaiting_agent transitions (no silent skip).
|
||||
# - Sweep transition (step 6) is now a bounded readiness-poll, not a
|
||||
# fixed sleep + one-shot assert → no more sweep-cadence flake.
|
||||
# - register / re-register retry ONLY transient edge 5xx (bounded),
|
||||
# fail closed on 4xx → no more cold-boot-502 flake.
|
||||
# STILL BLOCKS PROMOTION:
|
||||
# - Single shared staging tenant + EC2 quota window: an infra-side
|
||||
# provisioning outage (not a code bug) would turn the gate red.
|
||||
# Needs an infra-class vs code-class signal split before required.
|
||||
# - "CP unhealthy → exit 1" currently looks identical to a real
|
||||
# failure on the run page; required-gate would need it demoted to
|
||||
# a neutral/skip so staging flakiness can't block merges.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 25
|
||||
|
||||
@@ -143,15 +124,6 @@ jobs:
|
||||
|
||||
- name: Run external-runtime E2E
|
||||
id: e2e
|
||||
# E2E_REQUIRE_LIVE=1: the harness fails CLOSED (exit 5) if it ever
|
||||
# reaches a clean exit without proving all four awaiting_agent
|
||||
# transitions. Mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE — a
|
||||
# silent skip / early-return / dropped assertion can no longer
|
||||
# masquerade as green. Token-missing and CP-unhealthy already
|
||||
# hard-fail in the two preflight steps above, so reaching this step
|
||||
# means a real cycle is expected.
|
||||
env:
|
||||
E2E_REQUIRE_LIVE: "1"
|
||||
run: bash tests/e2e/test_staging_external_runtime.sh
|
||||
|
||||
# Mirror the e2e-staging-saas.yml safety net: if the runner is
|
||||
|
||||
@@ -1,199 +0,0 @@
|
||||
name: E2E Staging Reconciler (heals terminated EC2)
|
||||
|
||||
# Live staging proof for the core#2261 instance-state reconciler
|
||||
# (workspace-server/internal/registry/cp_instance_reconciler.go). The
|
||||
# real-infra complement to the deterministic unit tests: provisions a real
|
||||
# staging workspace, TERMINATES its EC2, and asserts the reconciler flips it
|
||||
# off 'online' (PRIMARY gate) and auto-reprovisions on a new instance_id
|
||||
# (SECONDARY, best-effort). See
|
||||
# tests/e2e/test_reconciler_heals_terminated_instance.sh for the assertion
|
||||
# contract + timeouts.
|
||||
#
|
||||
# Modeled on e2e-staging-saas.yml. Same secrets + same Gitea-port caveats:
|
||||
# - Dropped workflow_dispatch.inputs (Gitea 1.22.6 parser rejects them).
|
||||
# - Dropped merge_group / environment (no Gitea equivalent).
|
||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
||||
# feedback_act_runner_github_server_url.
|
||||
#
|
||||
# NOT a required check (yet). This is a brand-new live E2E that provisions +
|
||||
# terminates real EC2 (costs money, shares the cp#245 cold-boot flake
|
||||
# surface). A new live e2e must NOT hard-gate every merge until it has a
|
||||
# green track record. continue-on-error: true surfaces failures without
|
||||
# blocking. PROMOTE to branch-required (flip continue-on-error → false AND
|
||||
# add "E2E Staging Reconciler" to branch protection) once it has run green on
|
||||
# main for several consecutive days — same de-flake discipline the
|
||||
# platform-boot job in e2e-staging-saas.yml documents.
|
||||
|
||||
on:
|
||||
# Run when the reconciler itself, the script, or the libs it depends on
|
||||
# change — so a reconciler regression is caught on the PR that introduces
|
||||
# it (paths filter), plus a daily schedule to catch infra/AMI drift.
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'workspace-server/internal/registry/cp_instance_reconciler.go'
|
||||
- 'tests/e2e/test_reconciler_heals_terminated_instance.sh'
|
||||
- 'tests/e2e/lib/aws_leak_check.sh'
|
||||
- 'tests/e2e/lib/model_slug.sh'
|
||||
- '.gitea/workflows/e2e-staging-reconciler.yml'
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'workspace-server/internal/registry/cp_instance_reconciler.go'
|
||||
- 'tests/e2e/test_reconciler_heals_terminated_instance.sh'
|
||||
- 'tests/e2e/lib/aws_leak_check.sh'
|
||||
- 'tests/e2e/lib/model_slug.sh'
|
||||
- '.gitea/workflows/e2e-staging-reconciler.yml'
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# 08:00 UTC daily — offset from e2e-staging-saas (07:00) so the two live
|
||||
# harnesses don't fight over staging's per-hour org-creation quota.
|
||||
- cron: '0 8 * * *'
|
||||
|
||||
# Serialize against itself: staging has a finite per-hour org-creation quota,
|
||||
# and a cancelled run mid-teardown leaks EC2. cancel-in-progress: false
|
||||
# mirrors e2e-staging-saas.yml.
|
||||
concurrency:
|
||||
group: e2e-staging-reconciler
|
||||
cancel-in-progress: false
|
||||
|
||||
env:
|
||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
||||
|
||||
jobs:
|
||||
# PR-validation path: always posts success so a workflow-only / script-only
|
||||
# PR has a status check (this workflow's real job only fires on the paths
|
||||
# filter). Mirrors the pr-validate job in e2e-staging-saas.yml.
|
||||
pr-validate:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 1
|
||||
continue-on-error: true
|
||||
- name: YAML validation (best-effort)
|
||||
run: |
|
||||
echo "e2e-staging-reconciler.yml — PR validation: workflow YAML is valid."
|
||||
echo "Live E2E step runs only when the reconciler / script / libs change."
|
||||
continue-on-error: true
|
||||
|
||||
e2e-staging-reconciler:
|
||||
name: E2E Staging Reconciler
|
||||
runs-on: ubuntu-latest
|
||||
# NOT required yet — surface failures without blocking merges. Flip to
|
||||
# false + add to branch protection once green on main for a de-flake
|
||||
# window (see the header note). mc#1982: do not renew this mask silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 60
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
env:
|
||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
||||
# Single admin-bearer secret drives provision + tenant-token retrieval +
|
||||
# teardown (= Railway staging CP_ADMIN_API_TOKEN). Same secret name the
|
||||
# saas workflow canonicalised to under internal#322.
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
# Leak-check is REQUIRED here: this test deliberately terminates an EC2,
|
||||
# so teardown MUST positively confirm no slug-tagged box survives.
|
||||
E2E_AWS_LEAK_CHECK: required
|
||||
E2E_AWS_TERMINATE_LEAKS: '1'
|
||||
# claude-code + MiniMax is the cheapest boot-to-online path (same as the
|
||||
# saas job). The reconciler test never makes a completion, but the key is
|
||||
# wired so the first boot reaches online on the same path the saas
|
||||
# harness uses. First non-empty wins in the script's priority chain.
|
||||
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
|
||||
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
|
||||
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
|
||||
E2E_RUNTIME: claude-code
|
||||
# Platform-managed create path (moonshot/kimi-k2.6, no tenant key) — the
|
||||
# combo proven to create cleanly; this test only needs the ws online.
|
||||
E2E_LLM_PATH: platform
|
||||
E2E_MODEL_SLUG: MiniMax-M2
|
||||
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify required secrets present
|
||||
run: |
|
||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
|
||||
exit 2
|
||||
fi
|
||||
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
|
||||
if [ -z "${!var:-}" ]; then
|
||||
echo "::error::$var not set — this test terminates an EC2 and verifies no leak; AWS creds are mandatory"
|
||||
exit 2
|
||||
fi
|
||||
done
|
||||
echo "Required secrets present ✓"
|
||||
|
||||
- name: CP staging health preflight
|
||||
run: |
|
||||
code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
|
||||
if [ "$code" != "200" ]; then
|
||||
echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a reconciler bug."
|
||||
exit 1
|
||||
fi
|
||||
echo "Staging CP healthy ✓"
|
||||
|
||||
- name: Run reconciler heal E2E
|
||||
id: e2e
|
||||
run: bash tests/e2e/test_reconciler_heals_terminated_instance.sh
|
||||
|
||||
# Belt-and-braces teardown: the script installs its own EXIT trap, but if
|
||||
# the runner is cancelled the trap may not fire. This always() step
|
||||
# double-deletes any e2e-rec-* org from THIS run. The admin DELETE is
|
||||
# idempotent so double-invoking is safe.
|
||||
- name: Teardown safety net (runs on cancel/failure)
|
||||
if: always()
|
||||
env:
|
||||
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
||||
run: |
|
||||
set +e
|
||||
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
||||
| python3 -c "
|
||||
import json, sys, os, datetime
|
||||
run_id = os.environ.get('GITHUB_RUN_ID', '')
|
||||
d = json.load(sys.stdin)
|
||||
today = datetime.date.today()
|
||||
yesterday = today - datetime.timedelta(days=1)
|
||||
dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
|
||||
# Slug shape: e2e-rec-YYYYMMDD-<run_id>-<attempt>-...
|
||||
if run_id:
|
||||
prefixes = tuple(f'e2e-rec-{d}-{run_id}-' for d in dates)
|
||||
else:
|
||||
prefixes = tuple(f'e2e-rec-{d}-' for d in dates)
|
||||
candidates = [o['slug'] for o in d.get('orgs', [])
|
||||
if any(o.get('slug','').startswith(p) for p in prefixes)
|
||||
and o.get('instance_status') not in ('purged',)]
|
||||
print('\n'.join(candidates))
|
||||
" 2>/dev/null)
|
||||
leaks=()
|
||||
for slug in $orgs; do
|
||||
echo "Safety-net teardown: $slug"
|
||||
set +e
|
||||
curl -sS -o /tmp/rec-cleanup.out -w "%{http_code}" \
|
||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$slug\"}" >/tmp/rec-cleanup.code
|
||||
set -e
|
||||
code=$(cat /tmp/rec-cleanup.code 2>/dev/null || echo "000")
|
||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
||||
echo "[teardown] deleted $slug (HTTP $code)"
|
||||
else
|
||||
echo "::warning::reconciler teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/rec-cleanup.out 2>/dev/null)"
|
||||
leaks+=("$slug")
|
||||
fi
|
||||
done
|
||||
if [ ${#leaks[@]} -gt 0 ]; then
|
||||
echo "::warning::reconciler teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
||||
fi
|
||||
exit 0
|
||||
@@ -48,10 +48,8 @@ on:
|
||||
- 'workspace-server/internal/handlers/a2a_proxy.go'
|
||||
- 'workspace-server/internal/middleware/**'
|
||||
- 'workspace-server/internal/provisioner/**'
|
||||
- 'workspace-server/internal/providers/providers.yaml'
|
||||
- 'tests/e2e/test_staging_full_saas.sh'
|
||||
- 'tests/e2e/lib/completion_assert.sh'
|
||||
- 'tests/e2e/lib/model_slug.sh'
|
||||
- 'tests/e2e/lib/aws_leak_check.sh'
|
||||
- 'tests/e2e/test_aws_leak_check.sh'
|
||||
- '.gitea/workflows/e2e-staging-saas.yml'
|
||||
@@ -63,10 +61,8 @@ on:
|
||||
- 'workspace-server/internal/handlers/a2a_proxy.go'
|
||||
- 'workspace-server/internal/middleware/**'
|
||||
- 'workspace-server/internal/provisioner/**'
|
||||
- 'workspace-server/internal/providers/providers.yaml'
|
||||
- 'tests/e2e/test_staging_full_saas.sh'
|
||||
- 'tests/e2e/lib/completion_assert.sh'
|
||||
- 'tests/e2e/lib/model_slug.sh'
|
||||
- 'tests/e2e/lib/aws_leak_check.sh'
|
||||
- 'tests/e2e/test_aws_leak_check.sh'
|
||||
- '.gitea/workflows/e2e-staging-saas.yml'
|
||||
@@ -172,23 +168,9 @@ jobs:
|
||||
# and defeats the cost saving. Operators can override via the
|
||||
# workflow_dispatch flow (no input wired here yet — runtime
|
||||
# override is enough for ad-hoc).
|
||||
#
|
||||
# #2263 deploy-skew: the claude-code default is the COLON-namespaced BYOK
|
||||
# id `minimax:MiniMax-M2.7`, NOT bare `MiniMax-M2`. The deployed staging
|
||||
# ws-server's compiled registry can lag source; validateRegisteredModelForRuntime
|
||||
# 400s the bare form on an older image (the sibling Platform Boot job, on
|
||||
# the SAME image, succeeds with namespaced `moonshot/kimi-k2.6`). The colon
|
||||
# form stays in the BYOK `minimax` arm (providers.yaml:851) so it resolves
|
||||
# provider=minimax (BYOK) and the #1994 byok-not-platform guard still
|
||||
# passes — the slash/platform form `minimax/MiniMax-M2.7` would not.
|
||||
E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'minimax:MiniMax-M2.7' }}
|
||||
E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'MiniMax-M2' }}
|
||||
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
|
||||
# Fail-closed-on-skip: in CI the harness MUST prove ≥1 full
|
||||
# provision→online→A2A cycle. If it reaches the end having validated
|
||||
# nothing (a future short-circuit / skip path), it exits 5 rather than
|
||||
# reporting a false green. Mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE.
|
||||
E2E_REQUIRE_LIVE: '1'
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
@@ -333,152 +315,3 @@ jobs:
|
||||
echo "::warning::saas teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
||||
fi
|
||||
exit 0
|
||||
|
||||
# ── PLATFORM-MANAGED BOOT REGRESSION (moonshot/kimi NOT_CONFIGURED) ──────────
|
||||
#
|
||||
# The REAL-boot complement to the deterministic unit suite
|
||||
# (workspace_provision_platform_boot_test.go). Provisions a REAL staging
|
||||
# claude-code workspace on the PLATFORM-managed path — provider=platform,
|
||||
# model=moonshot/kimi-k2.6, NO tenant LLM key — and asserts it reaches
|
||||
# status=online (NOT not_configured) and a completion returns 200, via the same
|
||||
# online-wait + completion-assert the BYOK job uses.
|
||||
#
|
||||
# Why a SEPARATE job (not a matrix leg of e2e-staging-saas): the platform path
|
||||
# injects NO secret and pins a different model, so its env block diverges from
|
||||
# the BYOK job's. A dedicated job keeps each path's "verify key present" preflight
|
||||
# honest (BYOK requires a key; platform requires its ABSENCE not to matter) and
|
||||
# gives the regression its own named commit-status for branch protection.
|
||||
#
|
||||
# Add `E2E Staging Platform Boot` to branch protection after 3 consecutive
|
||||
# green runs on main (de-flake window; this path shares the cp#245
|
||||
# boot-timeout flake surface the BYOK job has, so it must prove stable before
|
||||
# it can BLOCK — see the gate-making plan in the PR body).
|
||||
# bp-required: pending #2187
|
||||
e2e-staging-platform-boot:
|
||||
name: E2E Staging Platform Boot
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface without blocking until the de-flake window
|
||||
# closes. mc#1982: do NOT renew this mask silently — the gate-making plan
|
||||
# tracks the flip to false under #2187.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 45
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
env:
|
||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
E2E_AWS_LEAK_CHECK: required
|
||||
E2E_AWS_TERMINATE_LEAKS: '1'
|
||||
# The regression combo: claude-code + platform-managed + moonshot/kimi-k2.6.
|
||||
# NO E2E_*_API_KEY is set — platform-managed billing is owned by Molecule via
|
||||
# the CP LLM proxy. The harness's E2E_LLM_PATH=platform branch sends empty
|
||||
# secrets and pin-selects the platform model.
|
||||
E2E_RUNTIME: claude-code
|
||||
E2E_LLM_PATH: platform
|
||||
# Smoke mode: a single parent workspace is enough to prove online +
|
||||
# completion for the platform path (the A2A/delegation matrix is the BYOK
|
||||
# job's job). Override E2E_DEFAULT_PLATFORM_MODEL via workflow_dispatch to
|
||||
# exercise another platform model id.
|
||||
E2E_MODE: smoke
|
||||
E2E_RUN_ID: "platform-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
|
||||
# Fail-closed-on-skip (see BYOK job). smoke mode still runs steps 2/4/7/8b,
|
||||
# so all four required milestones (provisioned/tenant_online/
|
||||
# workspace_online/a2a_roundtrip) fire — the guard is valid for this lane too.
|
||||
E2E_REQUIRE_LIVE: '1'
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify admin token present
|
||||
run: |
|
||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
|
||||
exit 2
|
||||
fi
|
||||
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
|
||||
if [ -z "${!var:-}" ]; then
|
||||
echo "::error::$var not set — EC2 leak verification cannot run"
|
||||
exit 2
|
||||
fi
|
||||
done
|
||||
echo "Admin token present ✓"
|
||||
|
||||
- name: Assert NO BYOK key leaks into the platform run
|
||||
run: |
|
||||
# The whole point of this job is the platform-managed path. A stray
|
||||
# E2E_*_API_KEY in the runner env would (via the harness) still be
|
||||
# skipped by the E2E_LLM_PATH=platform branch — but assert their
|
||||
# absence loudly here so a future env edit can't silently convert this
|
||||
# into a masked BYOK run that no longer exercises the regression.
|
||||
for var in E2E_MINIMAX_API_KEY E2E_ANTHROPIC_API_KEY E2E_OPENAI_API_KEY; do
|
||||
if [ -n "${!var:-}" ]; then
|
||||
echo "::warning::$var is set in this platform-boot job's env — the harness ignores it on E2E_LLM_PATH=platform, but it should not be wired here."
|
||||
fi
|
||||
done
|
||||
echo "Platform-managed path: no tenant LLM key required ✓"
|
||||
|
||||
- name: CP staging health preflight
|
||||
run: |
|
||||
code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
|
||||
if [ "$code" != "200" ]; then
|
||||
echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
|
||||
exit 1
|
||||
fi
|
||||
echo "Staging CP healthy ✓"
|
||||
|
||||
- name: Run platform-managed boot E2E (online + completion)
|
||||
id: e2e
|
||||
run: bash tests/e2e/test_staging_full_saas.sh
|
||||
|
||||
- name: Teardown safety net (runs on cancel/failure)
|
||||
if: always()
|
||||
env:
|
||||
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
||||
run: |
|
||||
set +e
|
||||
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
||||
| python3 -c "
|
||||
import json, sys, os, datetime
|
||||
run_id = os.environ.get('GITHUB_RUN_ID', '')
|
||||
d = json.load(sys.stdin)
|
||||
today = datetime.date.today()
|
||||
yesterday = today - datetime.timedelta(days=1)
|
||||
dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
|
||||
# smoke mode slugs are e2e-smoke-YYYYMMDD-platform-<run_id>-...
|
||||
if run_id:
|
||||
prefixes = tuple(f'e2e-smoke-{d}-platform-{run_id}-' for d in dates)
|
||||
else:
|
||||
prefixes = tuple(f'e2e-smoke-{d}-platform-' for d in dates)
|
||||
candidates = [o['slug'] for o in d.get('orgs', [])
|
||||
if any(o.get('slug','').startswith(p) for p in prefixes)
|
||||
and o.get('instance_status') not in ('purged',)]
|
||||
print('\n'.join(candidates))
|
||||
" 2>/dev/null)
|
||||
leaks=()
|
||||
for slug in $orgs; do
|
||||
echo "Safety-net teardown: $slug"
|
||||
set +e
|
||||
curl -sS -o /tmp/plat-cleanup.out -w "%{http_code}" \
|
||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$slug\"}" >/tmp/plat-cleanup.code
|
||||
set -e
|
||||
code=$(cat /tmp/plat-cleanup.code 2>/dev/null || echo "000")
|
||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
||||
echo "[teardown] deleted $slug (HTTP $code)"
|
||||
else
|
||||
echo "::warning::platform-boot teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/plat-cleanup.out 2>/dev/null)"
|
||||
leaks+=("$slug")
|
||||
fi
|
||||
done
|
||||
if [ ${#leaks[@]} -gt 0 ]; then
|
||||
echo "::warning::platform-boot teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
||||
fi
|
||||
exit 0
|
||||
|
||||
@@ -88,9 +88,8 @@ jobs:
|
||||
# surprises and keeps the routing rule discoverable in one place.
|
||||
runs-on: docker-host
|
||||
# mc#1982 Phase 3 (RFC §1): surface broken workflows without blocking.
|
||||
# mc#1982: mask removed. If regressions appear, root-fix the underlying
|
||||
# test — do NOT renew the mask silently.
|
||||
continue-on-error: false
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
outputs:
|
||||
handlers: ${{ steps.filter.outputs.handlers }}
|
||||
steps:
|
||||
@@ -120,9 +119,8 @@ jobs:
|
||||
# exists). See detect-changes for the full routing rationale.
|
||||
runs-on: docker-host
|
||||
# mc#1982 Phase 3 (RFC §1): surface broken workflows without blocking.
|
||||
# mc#1982: mask removed. If regressions appear, root-fix the underlying
|
||||
# test — do NOT renew the mask silently.
|
||||
continue-on-error: false
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
env:
|
||||
# Unique name per run so concurrent jobs don't collide on the
|
||||
# bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
|
||||
@@ -243,8 +241,7 @@ jobs:
|
||||
# MUST exist for the integration tests to be meaningful. Hard-
|
||||
# fail if any didn't land — that would be a real regression we
|
||||
# want loud.
|
||||
# workspace_schedules added for the #2149 scheduler integration tests.
|
||||
for tbl in delegations workspaces activity_logs pending_uploads workspace_schedules; do
|
||||
for tbl in delegations workspaces activity_logs pending_uploads; do
|
||||
if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
|
||||
-c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
|
||||
| grep -q 1; then
|
||||
@@ -254,19 +251,6 @@ jobs:
|
||||
echo "✓ $tbl table present"
|
||||
done
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Preflight — INTEGRATION_DB_URL must be present
|
||||
run: |
|
||||
# Belt-and-suspenders: if the postgres-start step failed to
|
||||
# export INTEGRATION_DB_URL, fail loud BEFORE go test can
|
||||
# t.Skip its way to a green build. Closes the workflow-level
|
||||
# fail-open gap identified in PR #2166 blocker #2.
|
||||
if [ -z "${INTEGRATION_DB_URL:-}" ]; then
|
||||
echo "::error::INTEGRATION_DB_URL is empty — postgres-start step did not export the connection string"
|
||||
exit 1
|
||||
fi
|
||||
echo "INTEGRATION_DB_URL is set"
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Run integration tests
|
||||
run: |
|
||||
@@ -275,16 +259,6 @@ jobs:
|
||||
# workflow runs don't fight over a host-net 5432 port.
|
||||
go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_"
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Run scheduler integration tests (#2149)
|
||||
run: |
|
||||
# #2149: real-PG regression coverage for the scheduler firing loop
|
||||
# (tick → A2A fire → write-back of last_run_at/next_run_at/run_count/
|
||||
# activity_logs jsonb incl. invalid-UTF-8 sanitization + sweepPhantomBusy).
|
||||
# Reuses the same migrated Postgres (workspace_schedules / activity_logs
|
||||
# / workspaces all landed by the migration replay step above).
|
||||
go test -tags=integration -timeout 5m -v ./internal/scheduler/ -run "^TestIntegration_"
|
||||
|
||||
- if: failure() && needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Diagnostic dump on failure
|
||||
env:
|
||||
|
||||
@@ -49,56 +49,37 @@ jobs:
|
||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
||||
steps:
|
||||
- name: Identify runner
|
||||
id: identify
|
||||
continue-on-error: true
|
||||
run: |
|
||||
set -eu
|
||||
echo "arch=$(uname -m)"
|
||||
echo "kernel=$(uname -sr)"
|
||||
echo "shell=$BASH_VERSION"
|
||||
# Sanity: must actually be arm64. If amd64 sneaks in here,
|
||||
# the job skips gracefully rather than hard-failing, because
|
||||
# a mislabelled runner is an ops concern, not a code defect.
|
||||
# Pilot lane must not make main red (#2146).
|
||||
# fail fast — that means the label routing is wrong.
|
||||
case "$(uname -m)" in
|
||||
aarch64|arm64)
|
||||
echo "arm64 confirmed"
|
||||
echo "arm64=true" >> "$GITHUB_OUTPUT"
|
||||
;;
|
||||
*)
|
||||
echo "ERROR: expected arm64, got $(uname -m) — label routing may be wrong"
|
||||
echo "arm64=false" >> "$GITHUB_OUTPUT"
|
||||
exit 1
|
||||
;;
|
||||
aarch64|arm64) echo "arm64 confirmed" ;;
|
||||
*) echo "ERROR: expected arm64, got $(uname -m)"; exit 1 ;;
|
||||
esac
|
||||
|
||||
- name: Checkout
|
||||
if: steps.identify.outputs.arm64 == 'true'
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Install shellcheck (arm64)
|
||||
if: steps.identify.outputs.arm64 == 'true'
|
||||
continue-on-error: true
|
||||
run: |
|
||||
set -eu
|
||||
if command -v shellcheck >/dev/null 2>&1; then
|
||||
echo "shellcheck already present: $(shellcheck --version | head -1)"
|
||||
else
|
||||
# Prefer apt if the runner base ships it; else download the
|
||||
# correct platform binary (darwin vs linux).
|
||||
# Prefer apt if the runner base ships it; else download arm64 binary.
|
||||
if command -v apt-get >/dev/null 2>&1; then
|
||||
sudo apt-get update -qq
|
||||
sudo apt-get install -y --no-install-recommends shellcheck
|
||||
else
|
||||
SC_VER=v0.10.0
|
||||
if [ "$(uname -s)" = "Darwin" ]; then
|
||||
SC_PKG="shellcheck-${SC_VER}.darwin.aarch64.tar.xz"
|
||||
else
|
||||
SC_PKG="shellcheck-${SC_VER}.linux.aarch64.tar.xz"
|
||||
fi
|
||||
curl -fsSL "https://github.com/koalaman/shellcheck/releases/download/${SC_VER}/${SC_PKG}" \
|
||||
curl -fsSL "https://github.com/koalaman/shellcheck/releases/download/${SC_VER}/shellcheck-${SC_VER}.linux.aarch64.tar.xz" \
|
||||
| tar -xJf - --strip-components=1
|
||||
sudo mv shellcheck /usr/local/bin/
|
||||
fi
|
||||
@@ -106,15 +87,14 @@ jobs:
|
||||
shellcheck --version | head -2
|
||||
|
||||
- name: Run shellcheck on .gitea/scripts/*.sh
|
||||
if: steps.identify.outputs.arm64 == 'true'
|
||||
continue-on-error: true
|
||||
run: |
|
||||
set -eu
|
||||
# Only the scripts we control under .gitea/scripts. Pilot
|
||||
# scope is intentionally narrow — broaden in a follow-up
|
||||
# once the lane is proven.
|
||||
if ! command -v shellcheck >/dev/null 2>&1 || ! shellcheck --version >/dev/null 2>&1; then
|
||||
echo "WARN: shellcheck not functional — skipping (pilot mode)"
|
||||
if ! command -v shellcheck >/dev/null 2>&1; then
|
||||
echo "WARN: shellcheck binary not found — skipping (pilot mode)"
|
||||
exit 0
|
||||
fi
|
||||
# NOTE: macOS ships Bash 3.2 (Apple license), no `mapfile`
|
||||
|
||||
@@ -14,37 +14,10 @@ name: publish-canvas-image
|
||||
# authenticate to ghcr.io.
|
||||
#
|
||||
|
||||
# Builds, pushes, and (ordered) deploys the standalone canvas Docker image to
|
||||
# ECR whenever a commit lands on main that touches canvas code.
|
||||
#
|
||||
# Ordered deploy (core#2226) — mirrors publish-workspace-server-image.yml so the
|
||||
# standalone `molecule-ai/canvas` image is deterministic + verifiable, not a
|
||||
# side effect of the platform fleet pulling a mutable `:latest`:
|
||||
#
|
||||
# build-and-push: build → push :staging-<sha> + :staging-latest + :sha-<sha>
|
||||
# (does NOT move :latest — an unpromoted build must never
|
||||
# become the prod-blessed tag).
|
||||
# promote-canvas: waits for green main CI on this SHA, then re-points
|
||||
# :latest to the verified :staging-<sha> by digest
|
||||
# (imagetools create — no rebuild). So `:latest` == the
|
||||
# current prod-blessed canvas, byte-identical to staging-<sha>.
|
||||
#
|
||||
# Tag scheme produced (parallels platform-tenant):
|
||||
# :staging-<sha> — per-commit immutable digest, what docker-compose pins to.
|
||||
# :staging-latest — most recent BUILD on main (last-writer-wins, NOT gated).
|
||||
# :sha-<sha> — kept for back-compat with any consumer pinning the old tag.
|
||||
# :latest — most recent CI-GREEN build. Only moved by promote-canvas.
|
||||
#
|
||||
# WHY this is the canvas analogue of the platform's deploy-production, not a
|
||||
# literal copy: the standalone canvas co-deploys with the platform on the same
|
||||
# host via the root docker-compose.yml (`docker compose pull && up -d`). Gating
|
||||
# the canvas `:latest` promotion on the SAME green-main-CI signal the platform
|
||||
# deploy waits on makes platform + canvas roll together by the same SHA. The
|
||||
# canvas has no per-tenant fleet of its own and no /buildinfo endpoint, so there
|
||||
# is no fleet-rollout / per-tenant verify step to mirror here — CI-green +
|
||||
# digest-pin + immutable :staging-<sha> is the determinism contract. (A future
|
||||
# canvas /buildinfo would let this assert the served SHA like the platform does;
|
||||
# tracked in core#2226.)
|
||||
# Builds and pushes the canvas Docker image to ECR whenever a commit lands
|
||||
# on main that touches canvas code. Previously canvas changes were visible in
|
||||
# CI (npm run build passed) but the live container was never updated —
|
||||
# operators had to manually run `docker compose build canvas` each time.
|
||||
#
|
||||
# Mirror of publish-platform-image.yml, adapted for the Next.js canvas layer.
|
||||
# See that workflow for inline notes on macOS Keychain isolation and QEMU.
|
||||
@@ -57,7 +30,6 @@ on:
|
||||
# platform-only / docs-only / MCP-only merges.
|
||||
- 'canvas/**'
|
||||
- '.gitea/workflows/publish-canvas-image.yml'
|
||||
workflow_dispatch:
|
||||
# NOTE (Gitea port): the original GitHub workflow had a
|
||||
# `workflow_dispatch:` manual trigger for the
|
||||
# non-canvas-merge-but-need-fresh-image scenario. Dropped in the
|
||||
@@ -97,10 +69,6 @@ jobs:
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
outputs:
|
||||
# Exposed so promote-canvas re-points :latest to the EXACT per-commit tag
|
||||
# this build produced (digest-level), never a re-resolved mutable tag.
|
||||
staging_sha: ${{ steps.tags.outputs.staging_sha }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
@@ -172,7 +140,6 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
|
||||
echo "staging_sha=staging-${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Resolve build args
|
||||
id: build_args
|
||||
@@ -208,19 +175,8 @@ jobs:
|
||||
build-args: |
|
||||
NEXT_PUBLIC_PLATFORM_URL=${{ steps.build_args.outputs.platform_url }}
|
||||
NEXT_PUBLIC_WS_URL=${{ steps.build_args.outputs.ws_url }}
|
||||
# Bake the merge SHA into the image so /api/buildinfo reports the
|
||||
# served canvas SHA (core#2235). Mirrors how the platform image
|
||||
# surfaces GIT_SHA at /buildinfo. Full 40-char SHA (not the
|
||||
# 7-char tag) so the fleet redeploy verification can match exactly.
|
||||
BUILD_SHA=${{ github.sha }}
|
||||
# Ordered deploy (core#2226): the build job pushes the immutable
|
||||
# per-commit tag + the build-tracking staging-latest + the legacy
|
||||
# back-compat :sha-<sha> tag. It does NOT push :latest — :latest is
|
||||
# the prod-blessed tag and is only re-pointed by promote-canvas after
|
||||
# green main CI, so an unpromoted/red build can never become :latest.
|
||||
tags: |
|
||||
${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.staging_sha }}
|
||||
${{ env.IMAGE_NAME }}:staging-latest
|
||||
${{ env.IMAGE_NAME }}:latest
|
||||
${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }}
|
||||
# Gitea artifact-cache reachability is best-effort on the operator
|
||||
# runner network. Do not let cache export fail an image that already
|
||||
@@ -229,107 +185,3 @@ jobs:
|
||||
org.opencontainers.image.source=https://git.moleculesai.app/${{ github.repository }}
|
||||
org.opencontainers.image.revision=${{ github.sha }}
|
||||
org.opencontainers.image.description=Molecule AI canvas (Next.js 15 + React Flow)
|
||||
|
||||
# bp-exempt: post-merge canvas promote side-effect; merge is gated by CI /
|
||||
# all-required and this job waits for green push CI on the SHA before acting.
|
||||
promote-canvas:
|
||||
name: Promote canvas :latest to CI-green build
|
||||
needs: build-and-push
|
||||
# Only on a real main push — workflow_dispatch / non-main never promotes.
|
||||
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
|
||||
# Side-effect deploy only; the image publish above is the durable artifact.
|
||||
# mc#1982: do NOT renew this mask silently — it mirrors deploy-production's
|
||||
# contract (a flaky promote must not red the ship lane), tracked in core#2226.
|
||||
continue-on-error: true
|
||||
runs-on: publish
|
||||
timeout-minutes: 60
|
||||
env:
|
||||
# Same green-main-CI gate the platform deploy-production waits on, so
|
||||
# platform + canvas advance :latest off the identical signal/SHA.
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
GITEA_TOKEN: ${{ secrets.PROD_AUTO_DEPLOY_CONTROL_TOKEN || secrets.AUTO_SYNC_TOKEN }}
|
||||
CI_STATUS_TIMEOUT_SECONDS: "3600"
|
||||
# Re-uses the platform's disable kill-switch: when prod auto-deploy is
|
||||
# paused, the canvas :latest promote pauses too (correct — an unpromoted
|
||||
# build must not become :latest while the fleet is frozen).
|
||||
PROD_AUTO_DEPLOY_DISABLED: ${{ vars.PROD_AUTO_DEPLOY_DISABLED || secrets.PROD_AUTO_DEPLOY_DISABLED || '' }}
|
||||
steps:
|
||||
# The publish runner's default HOME (/home/hongming) is not writable, so
|
||||
# docker credential saves fail and halt the promote (#2193 on the platform
|
||||
# side). Point HOME + DOCKER_CONFIG at the writable job temp dir.
|
||||
- name: Prepare writable HOME + Docker config
|
||||
run: |
|
||||
set -euo pipefail
|
||||
H="$RUNNER_TEMP/canvas-promote-home"
|
||||
mkdir -p "$H/.docker"
|
||||
echo "HOME=$H" >> "$GITHUB_ENV"
|
||||
echo "DOCKER_CONFIG=$H/.docker" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Resolve promote gate
|
||||
id: gate
|
||||
env:
|
||||
PROD_AUTO_DEPLOY_DISABLED: ${{ env.PROD_AUTO_DEPLOY_DISABLED }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [ -n "${PROD_AUTO_DEPLOY_DISABLED:-}" ]; then
|
||||
case "$(printf '%s' "$PROD_AUTO_DEPLOY_DISABLED" | tr '[:upper:]' '[:lower:]')" in
|
||||
1|true|yes|on|disabled|disable)
|
||||
echo "enabled=false" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::Canvas :latest promote skipped: PROD_AUTO_DEPLOY_DISABLED=$PROD_AUTO_DEPLOY_DISABLED"
|
||||
{
|
||||
echo "## Canvas :latest promote skipped"
|
||||
echo ""
|
||||
echo "Reason: \`PROD_AUTO_DEPLOY_DISABLED=$PROD_AUTO_DEPLOY_DISABLED\`. The CI-green build is published as \`:staging-${GITHUB_SHA::7}\`; \`:latest\` was left unchanged."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
exit 0 ;;
|
||||
esac
|
||||
fi
|
||||
if [ -z "${GITEA_TOKEN:-}" ]; then
|
||||
echo "::error::AUTO_SYNC_TOKEN/PROD_AUTO_DEPLOY_CONTROL_TOKEN is required so the canvas promote can wait for green CI."
|
||||
exit 1
|
||||
fi
|
||||
echo "enabled=true" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Wait for green main CI on this SHA
|
||||
if: ${{ steps.gate.outputs.enabled == 'true' }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Same SSOT wait the platform deploy uses: blocks until the required
|
||||
# push contexts (CI / all-required (push) + Secret scan) go green on
|
||||
# THIS sha, and fails closed if any required context terminally fails.
|
||||
python3 .gitea/scripts/prod-auto-deploy.py wait-ci
|
||||
|
||||
- name: Promote canvas :latest to the CI-green image
|
||||
if: ${{ steps.gate.outputs.enabled == 'true' }}
|
||||
env:
|
||||
IMAGE_NAME: ${{ env.IMAGE_NAME }}
|
||||
STAGING_SHA_TAG: ${{ needs.build-and-push.outputs.staging_sha }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Fail-safe: if the build job's output didn't propagate, recompute the
|
||||
# immutable per-commit tag from the SHA so we never promote a guess.
|
||||
SHA_TAG="${STAGING_SHA_TAG:-staging-${GITHUB_SHA::7}}"
|
||||
ECR_REGISTRY="${IMAGE_NAME%%/*}"
|
||||
aws ecr get-login-password --region us-east-2 | \
|
||||
docker login --username AWS --password-stdin "${ECR_REGISTRY}"
|
||||
|
||||
# Digest-level re-tag (no pull/rebuild): :latest becomes byte-identical
|
||||
# to the verified :staging-<sha> for this commit.
|
||||
docker buildx imagetools create \
|
||||
--tag "${IMAGE_NAME}:latest" \
|
||||
"${IMAGE_NAME}:${SHA_TAG}"
|
||||
|
||||
{
|
||||
echo "## Canvas :latest promoted"
|
||||
echo ""
|
||||
echo "Re-pointed \`molecule-ai/canvas:latest\` → \`${SHA_TAG}\` (by digest)."
|
||||
echo ":latest now tracks the CI-green canvas build for commit \`${GITHUB_SHA::7}\`."
|
||||
echo ""
|
||||
echo "Tenants/hosts that \`docker compose pull canvas\` now get the same build the platform deploy rolled for this SHA."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
@@ -16,24 +16,14 @@ name: publish-workspace-server-image
|
||||
#
|
||||
# Image tags produced:
|
||||
# :staging-<sha> — per-commit digest, stable for canary verify
|
||||
# :staging-latest — tracks most recent BUILD on this branch (set by the
|
||||
# build job, last-writer-wins, NOT prod-gated)
|
||||
# :latest — tracks the most recent PROD-PROMOTED build. Re-pointed by the
|
||||
# deploy-production job ONLY after green main CI + canary +
|
||||
# fleet rollout + /buildinfo verification pass. So :latest ==
|
||||
# "current prod image", never the raw build. (Added 2026-06-03
|
||||
# after a stale :latest — last moved 2026-05-10 — reverted a
|
||||
# production tenant on a no-arg redeploy.)
|
||||
# :staging-latest — tracks most recent build on this branch
|
||||
#
|
||||
# Production auto-deploy:
|
||||
# After both platform and tenant images are pushed, deploy-production waits
|
||||
# for strict required push contexts on the same SHA to go green, then
|
||||
# calls the production CP redeploy-fleet endpoint with target_tag=
|
||||
# staging-<sha>. On success (rollout + buildinfo verified) it re-points
|
||||
# :latest to the same SHA. Set repo variable or secret
|
||||
# PROD_AUTO_DEPLOY_DISABLED=true to stop production rollout while keeping
|
||||
# image publishing enabled — in which case :latest is NOT advanced either
|
||||
# (correct: an unpromoted build must not become :latest).
|
||||
# staging-<sha>. Set repo variable or secret PROD_AUTO_DEPLOY_DISABLED=true
|
||||
# to stop production rollout while keeping image publishing enabled.
|
||||
#
|
||||
# Primary ECR target: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/*
|
||||
# Optional staging tenant mirror target:
|
||||
@@ -115,19 +105,6 @@ jobs:
|
||||
echo "Docker daemon OK"
|
||||
echo "::endgroup::"
|
||||
|
||||
# Pre-flight: verify every repo in manifest.json actually exists.
|
||||
#
|
||||
# Why: deleting a template repo without updating manifest.json breaks
|
||||
# clone-manifest.sh with a generic git 404, which looks like a
|
||||
# transient network error and wastes debug time. We catch it here
|
||||
# with a per-entry ::error:: annotation naming the missing repo
|
||||
# (issue #2192). This is the push-time complement to PR #2186's
|
||||
# PR-time manifest-entry-existence gate.
|
||||
- name: Validate manifest entries exist
|
||||
run: |
|
||||
set -euo pipefail
|
||||
bash scripts/check-manifest-repos-exist.sh manifest.json
|
||||
|
||||
# Pre-clone manifest deps before docker build.
|
||||
#
|
||||
# Why: workspace-template-* repos on Gitea are private. The pre-fix
|
||||
@@ -275,25 +252,7 @@ jobs:
|
||||
PROD_AUTO_DEPLOY_BATCH_SIZE: ${{ vars.PROD_AUTO_DEPLOY_BATCH_SIZE || '3' }}
|
||||
PROD_AUTO_DEPLOY_DRY_RUN: ${{ vars.PROD_AUTO_DEPLOY_DRY_RUN || '' }}
|
||||
PROD_ALLOW_NON_PROD_CP_URL: ${{ vars.PROD_ALLOW_NON_PROD_CP_URL || '' }}
|
||||
# #2213: per-tenant /buildinfo settle budget. A freshly-swapped tenant can
|
||||
# keep serving the old image at the edge for a short drain window; the
|
||||
# verify step polls each tenant up to this budget before declaring it stale.
|
||||
PROD_AUTO_DEPLOY_VERIFY_BUDGET_SECONDS: ${{ vars.PROD_AUTO_DEPLOY_VERIFY_BUDGET_SECONDS || '240' }}
|
||||
PROD_AUTO_DEPLOY_VERIFY_INTERVAL_SECONDS: ${{ vars.PROD_AUTO_DEPLOY_VERIFY_INTERVAL_SECONDS || '20' }}
|
||||
steps:
|
||||
# The publish runner's default HOME (/home/hongming) is not writable, so
|
||||
# git/docker credential saves fail (`Error saving credentials: mkdir
|
||||
# /home/hongming: permission denied`) and halt the production rollout
|
||||
# (#2193). Point HOME + DOCKER_CONFIG at the writable job temp dir —
|
||||
# mirrors build-and-push's "Prepare writable Docker config" fix above.
|
||||
- name: Prepare writable HOME + Docker config
|
||||
run: |
|
||||
set -euo pipefail
|
||||
H="$RUNNER_TEMP/auto-deploy-home"
|
||||
mkdir -p "$H/.docker"
|
||||
echo "HOME=$H" >> "$GITHUB_ENV"
|
||||
echo "DOCKER_CONFIG=$H/.docker" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
@@ -338,50 +297,8 @@ jobs:
|
||||
set -euo pipefail
|
||||
python3 .gitea/scripts/prod-auto-deploy.py wait-ci
|
||||
|
||||
# Superseded-job guard — BEFORE any production side effect (#2213).
|
||||
#
|
||||
# This workflow has no `concurrency:` (see header: Gitea 1.22.6 cancels
|
||||
# queued prod deploys). So two close main pushes run BOTH deploy-production
|
||||
# jobs. The verify step already skips its strict /buildinfo check when this
|
||||
# job is superseded (#2194) — but that guard was AFTER the redeploy and the
|
||||
# :latest promote, so an OLDER job that started late still:
|
||||
# 1. rolled the whole fleet BACKWARD to its older tag (canary hongming
|
||||
# was reverted from the newer SHA — the #2213 red), then
|
||||
# 2. promoted :latest backward to the older image,
|
||||
# and only THEN skipped verify and exited green. A superseded job must do
|
||||
# NEITHER. We re-check the branch head here, immediately before the rollout,
|
||||
# and skip every side effect when a newer commit already owns main.
|
||||
#
|
||||
# exit 0 + non-empty stdout => superseded (newer head printed); the redeploy
|
||||
# and promote steps are gated off via this output. exit 10 => this job is
|
||||
# still the latest, proceed to roll the fleet. Fail-safe: a head that can't
|
||||
# be read returns NOT-superseded (exit 10), so a genuine deploy is never
|
||||
# silently skipped. (Re-checked again at verify time to catch a newer job
|
||||
# that lands DURING this rollout.)
|
||||
- name: Check superseded before production side effects
|
||||
id: supersede
|
||||
if: ${{ steps.plan.outputs.enabled == 'true' }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
set +e
|
||||
NEWER_HEAD="$(python3 .gitea/scripts/prod-auto-deploy.py check-superseded)"
|
||||
SUPERSEDED_EXIT=$?
|
||||
set -e
|
||||
if [ "$SUPERSEDED_EXIT" -eq 0 ] && [ -n "$NEWER_HEAD" ]; then
|
||||
echo "superseded=true" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::Superseded before rollout: main head is now ${NEWER_HEAD:0:7} (this job deploys ${GITHUB_SHA:0:7}). Skipping redeploy + :latest promote so an older job never rolls the fleet backward."
|
||||
{
|
||||
echo "## Production auto-deploy skipped — superseded before rollout"
|
||||
echo ""
|
||||
echo "This deploy job's SHA \`${GITHUB_SHA:0:7}\` is no longer the head of \`main\` (now \`${NEWER_HEAD:0:7}\`)."
|
||||
echo "A newer deploy job owns the fleet; rolling it backward to this older build would revert tenants and \`:latest\`. No side effects performed."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
else
|
||||
echo "superseded=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- name: Call production CP redeploy-fleet
|
||||
if: ${{ steps.plan.outputs.enabled == 'true' && steps.supersede.outputs.superseded != 'true' }}
|
||||
if: ${{ steps.plan.outputs.enabled == 'true' }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
python3 .gitea/scripts/prod-auto-deploy.py assert-enabled
|
||||
@@ -440,66 +357,18 @@ jobs:
|
||||
fi
|
||||
|
||||
- name: Verify reachable tenants report this SHA
|
||||
# Skip when superseded BEFORE rollout: the redeploy step did not run, so
|
||||
# there is no redeploy-fleet response to verify against and the newer job
|
||||
# owns verification (#2213). The in-step guard below still catches the
|
||||
# case where a newer job lands DURING this job's rollout.
|
||||
if: ${{ steps.plan.outputs.enabled == 'true' && steps.supersede.outputs.superseded != 'true' }}
|
||||
if: ${{ steps.plan.outputs.enabled == 'true' }}
|
||||
env:
|
||||
TENANT_DOMAIN: moleculesai.app
|
||||
run: |
|
||||
set -euo pipefail
|
||||
RESP="$RUNNER_TEMP/prod-redeploy-response.json"
|
||||
|
||||
# Superseded-job guard. This workflow has no `concurrency:` (header
|
||||
# explains why: Gitea 1.22.6 cancels queued prod deploys). So two
|
||||
# close main pushes run BOTH deploy-production jobs. The newer one
|
||||
# rolls the fleet to its (newer) build first; this older job's strict
|
||||
# equality check below would then see tenants on the NEWER SHA and
|
||||
# false-red "$slug is stale" even though the fleet is AHEAD, not
|
||||
# behind (git SHAs aren't ordered; /buildinfo exposes only git_sha).
|
||||
#
|
||||
# If main's current head is no longer THIS job's SHA, a newer commit
|
||||
# has landed and this deploy is superseded — the newest job's verify
|
||||
# is authoritative. Skip strict verify and succeed. exit 0 => newer
|
||||
# head printed (superseded); exit 10 => still the latest, proceed to
|
||||
# the strict verify so a genuinely-behind tenant still fails loudly.
|
||||
set +e
|
||||
NEWER_HEAD="$(python3 .gitea/scripts/prod-auto-deploy.py check-superseded)"
|
||||
SUPERSEDED_EXIT=$?
|
||||
set -e
|
||||
if [ "$SUPERSEDED_EXIT" -eq 0 ] && [ -n "$NEWER_HEAD" ]; then
|
||||
echo "::notice::Superseded deploy: main head is now ${NEWER_HEAD:0:7} (this job deployed ${GITHUB_SHA:0:7}). The fleet is at or ahead of this build; the newer deploy job's verify is authoritative. Skipping strict SHA verify."
|
||||
{
|
||||
echo ""
|
||||
echo "### Buildinfo verification skipped — superseded deploy"
|
||||
echo ""
|
||||
echo "This deploy job's SHA \`${GITHUB_SHA:0:7}\` is no longer the head of \`main\` (now \`${NEWER_HEAD:0:7}\`)."
|
||||
echo "A newer deploy job is rolling the fleet forward; its verify is authoritative."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mapfile -t SLUGS < <(jq -r '.results[]? | .slug' "$RESP")
|
||||
if [ ${#SLUGS[@]} -eq 0 ]; then
|
||||
echo "::error::No tenants returned from redeploy-fleet; refusing to mark production deploy verified."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Per-tenant settle/retry budget (#2213). A tenant whose container the
|
||||
# CP just swapped can keep serving the OLD image at the edge for a short
|
||||
# window while the old container drains — /buildinfo returns HTTP 200
|
||||
# with the previous SHA, which `curl --retry` does NOT retry (it only
|
||||
# retries connection/5xx failures, not a stale-but-200 body). Without a
|
||||
# settle window a still-rolling tenant false-reds "stale" on the very
|
||||
# first poll. So poll each tenant's /buildinfo until it reports the
|
||||
# target SHA or the budget is exhausted; only THEN declare it stale or
|
||||
# unreachable. This never masks a genuinely stuck tenant — a tenant that
|
||||
# never reaches the target within the budget still fails loudly (and the
|
||||
# superseded-job revert class is already blocked before rollout above).
|
||||
SETTLE_BUDGET_SECONDS="${PROD_AUTO_DEPLOY_VERIFY_BUDGET_SECONDS:-240}"
|
||||
SETTLE_INTERVAL_SECONDS="${PROD_AUTO_DEPLOY_VERIFY_INTERVAL_SECONDS:-20}"
|
||||
|
||||
STALE_COUNT=0
|
||||
UNREACHABLE_COUNT=0
|
||||
UNHEALTHY_COUNT=0
|
||||
@@ -511,36 +380,18 @@ jobs:
|
||||
continue
|
||||
fi
|
||||
url="https://${slug}.${TENANT_DOMAIN}/buildinfo"
|
||||
deadline=$(( $(date +%s) + SETTLE_BUDGET_SECONDS ))
|
||||
actual=""
|
||||
last_actual=""
|
||||
on_target=false
|
||||
while :; do
|
||||
body="$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$url" || true)"
|
||||
actual="$(echo "$body" | jq -r '.git_sha // ""' 2>/dev/null || echo "")"
|
||||
[ -n "$actual" ] && last_actual="$actual"
|
||||
if [ "$actual" = "$GITHUB_SHA" ]; then
|
||||
on_target=true
|
||||
break
|
||||
fi
|
||||
now=$(date +%s)
|
||||
if [ "$now" -ge "$deadline" ]; then
|
||||
break
|
||||
fi
|
||||
# Still rolling (stale 200) or transiently unreachable — wait and
|
||||
# re-poll within the settle budget rather than failing on first read.
|
||||
remaining=$(( deadline - now ))
|
||||
echo "$slug: waiting for target SHA (have '${actual:0:7}', want ${GITHUB_SHA:0:7}; ${remaining}s left)"
|
||||
sleep "$SETTLE_INTERVAL_SECONDS"
|
||||
done
|
||||
if [ "$on_target" = true ]; then
|
||||
echo "$slug: ${actual:0:7}"
|
||||
elif [ -z "$last_actual" ]; then
|
||||
echo "::error::$slug did not return /buildinfo after deploy (waited ${SETTLE_BUDGET_SECONDS}s)."
|
||||
body="$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$url" || true)"
|
||||
actual="$(echo "$body" | jq -r '.git_sha // ""' 2>/dev/null || echo "")"
|
||||
if [ -z "$actual" ]; then
|
||||
echo "::error::$slug did not return /buildinfo after deploy."
|
||||
UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
|
||||
else
|
||||
echo "::error::$slug is stale: actual=${last_actual:0:7}, expected=${GITHUB_SHA:0:7} (waited ${SETTLE_BUDGET_SECONDS}s)"
|
||||
continue
|
||||
fi
|
||||
if [ "$actual" != "$GITHUB_SHA" ]; then
|
||||
echo "::error::$slug is stale: actual=${actual:0:7}, expected=${GITHUB_SHA:0:7}"
|
||||
STALE_COUNT=$((STALE_COUNT + 1))
|
||||
else
|
||||
echo "$slug: ${actual:0:7}"
|
||||
fi
|
||||
done
|
||||
|
||||
@@ -558,69 +409,3 @@ jobs:
|
||||
if [ "$STALE_COUNT" -gt 0 ] || [ "$UNHEALTHY_COUNT" -gt 0 ] || [ "$UNREACHABLE_COUNT" -gt 0 ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Re-point :latest to the just-promoted image — ONLY after the
|
||||
# production rollout + buildinfo verification above have passed.
|
||||
#
|
||||
# WHY HERE (promote point), not at build time:
|
||||
# The platform-tenant ECR `:latest` tag was last moved 2026-05-10
|
||||
# and went 3.5 weeks stale because the build step only pushes
|
||||
# :staging-<sha> + :staging-latest and never re-points :latest. A
|
||||
# no-arg POST /cp/admin/tenants/:slug/redeploy (whose default tag
|
||||
# fell through to "latest") then pulled the 3.5-week-old image and
|
||||
# REVERTED the tenant (incident: molecule-adk-demo, 2026-06-03).
|
||||
#
|
||||
# The defense-in-depth half of this fix changes that redeploy
|
||||
# default to :staging-latest, but :latest itself must also be
|
||||
# kept meaningful. We make :latest track the PROD-BLESSED build,
|
||||
# not the raw build: by living at the end of deploy-production —
|
||||
# after `wait-ci` (green main CI), the canary-first batched fleet
|
||||
# rollout, AND the /buildinfo SHA verification — :latest only ever
|
||||
# advances to a SHA that is actually green and confirmed running
|
||||
# across the live fleet. So `:latest` == "current prod image",
|
||||
# and any consumer that pulls :latest (legacy callers, manual
|
||||
# `docker pull`, a redeploy that somehow still resolves "latest")
|
||||
# gets the blessed image instead of whatever happened to build.
|
||||
#
|
||||
# Re-tag is digest-level (imagetools create), so no rebuild and
|
||||
# :latest is byte-identical to :staging-<sha> for this commit.
|
||||
# Gate on supersede: a superseded older job must NOT move :latest backward
|
||||
# to its older image (#2213 — 275383 promoted :latest → the older
|
||||
# staging-7a72516 after a newer job had already shipped). :latest must only
|
||||
# ever advance under the job that owns main's head.
|
||||
- name: Promote :latest to the verified prod image
|
||||
if: ${{ steps.plan.outputs.enabled == 'true' && steps.supersede.outputs.superseded != 'true' }}
|
||||
env:
|
||||
TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
|
||||
STAGING_TENANT_IMAGE_NAME: ${{ env.STAGING_TENANT_IMAGE_NAME }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
run: |
|
||||
set -euo pipefail
|
||||
SHA_TAG="staging-${GITHUB_SHA::7}"
|
||||
PROD_ECR_REGISTRY="${TENANT_IMAGE_NAME%%/*}"
|
||||
STAGING_ECR_REGISTRY="${STAGING_TENANT_IMAGE_NAME%%/*}"
|
||||
|
||||
aws ecr get-login-password --region us-east-2 | \
|
||||
docker login --username AWS --password-stdin "${PROD_ECR_REGISTRY}"
|
||||
aws ecr get-login-password --region us-east-2 | \
|
||||
docker login --username AWS --password-stdin "${STAGING_ECR_REGISTRY}"
|
||||
|
||||
# imagetools create copies the source manifest to the new tag by
|
||||
# digest (no pull/rebuild). :latest now points at the exact image
|
||||
# that just passed the prod gate.
|
||||
docker buildx imagetools create \
|
||||
--tag "${TENANT_IMAGE_NAME}:latest" \
|
||||
"${TENANT_IMAGE_NAME}:${SHA_TAG}"
|
||||
docker buildx imagetools create \
|
||||
--tag "${STAGING_TENANT_IMAGE_NAME}:latest" \
|
||||
"${STAGING_TENANT_IMAGE_NAME}:${SHA_TAG}"
|
||||
|
||||
{
|
||||
echo ""
|
||||
echo "### :latest promoted"
|
||||
echo ""
|
||||
echo "Re-pointed \`platform-tenant:latest\` → \`${SHA_TAG}\` (prod + staging ECR)."
|
||||
echo ":latest now tracks the prod-blessed, fleet-verified image."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
@@ -33,20 +33,11 @@
|
||||
# 2026-05-17 (internal#189 Phase 1).
|
||||
#
|
||||
# BURN-IN CLOSED 2026-05-17 (internal#189 Phase 1): The 7-day burn-in
|
||||
# window closed. As of 2026-06-04 the residual masks left behind by the
|
||||
# burn-in are removed for real (the comment previously claimed this while
|
||||
# the masks still persisted — that was stale):
|
||||
# - continue-on-error: true on the jq-install step (redundant; the step
|
||||
# already exits 0) and on the tier-check step (the burn-in mask).
|
||||
# - the `|| true` after the sop-tier-check.sh invocation, which masked
|
||||
# real tier-gate verdicts.
|
||||
# AND-composition is now fully enforced and the tier-check step can
|
||||
# honestly red CI on a real SOP-6 violation. SOP_FAIL_OPEN=1 is RETAINED
|
||||
# as sanctioned infra-resilience: it fails open only on token/network/jq
|
||||
# faults, never on a real gate verdict. If you need to temporarily
|
||||
# re-introduce a mask, file a tracker and follow the mc#1982 protocol
|
||||
# (Tier 2e lint requires a current tracker within 2 lines of any
|
||||
# continue-on-error: true).
|
||||
# window closed. continue-on-error: true has been removed from the
|
||||
# tier-check job; AND-composition is now fully enforced. If you need
|
||||
# to temporarily re-introduce a mask, file a tracker and follow the
|
||||
# mc#1982 protocol (Tier 2e lint requires a current tracker within
|
||||
# 2 lines of any continue-on-error: true).
|
||||
|
||||
name: sop-tier-check
|
||||
|
||||
@@ -99,11 +90,10 @@ jobs:
|
||||
# GitHub releases may be unreachable from some runner networks
|
||||
# (infra#241 follow-up: GitHub timeout after 3s on 5.78.80.188
|
||||
# runners). The sop-tier-check script has its own fallback as a
|
||||
# third line of defense, and this step's final command
|
||||
# (`jq --version ... || echo`) already exits 0 unconditionally — so
|
||||
# the step cannot fail the job on its own.
|
||||
# continue-on-error REMOVED 2026-06-04 (mc#1982 directive: root-fix
|
||||
# and remove, do not renew). It was redundant masking, not a gate.
|
||||
# third line of defense. continue-on-error: true ensures this step
|
||||
# failing does not block the job.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
run: |
|
||||
# apt-get is the primary method — Ubuntu package mirrors are reliably
|
||||
# reachable from runner containers. GitHub releases may be blocked
|
||||
@@ -120,11 +110,11 @@ jobs:
|
||||
jq --version 2>/dev/null || echo "::notice::jq not yet available — script fallback will retry"
|
||||
|
||||
- name: Verify tier label + reviewer team membership
|
||||
# continue-on-error REMOVED 2026-06-04 (expired internal#189 Phase 1
|
||||
# burn-in, window closed 2026-05-17; mc#1982 directive: root-fix and
|
||||
# remove, do not renew). SOP_FAIL_OPEN=1 below still fails open on
|
||||
# token/network/infra errors only (never on a real tier-gate verdict),
|
||||
# so this step can now honestly fail CI on a genuine SOP-6 violation.
|
||||
# continue-on-error: true at step level — job-level is ignored by Gitea
|
||||
# Actions (quirk #10, internal runbooks). Belt-and-suspenders with
|
||||
# SOP_FAIL_OPEN=1 + || true below.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
@@ -133,13 +123,9 @@ jobs:
|
||||
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
|
||||
SOP_DEBUG: '0'
|
||||
SOP_LEGACY_CHECK: '0'
|
||||
# SOP_FAIL_OPEN=1 fails open ONLY on infra faults (empty/invalid
|
||||
# token, unreachable Gitea API, missing jq) — see the guarded
|
||||
# `exit 0` branches in sop-tier-check.sh. It does NOT mask a real
|
||||
# tier-gate verdict: a missing tier label, no approving review, or
|
||||
# an unsatisfied AND-clause still `exit 1`. Kept as sanctioned
|
||||
# infra-resilience; the `|| true` mask was REMOVED with the burn-in
|
||||
# COE (2026-06-04) so a genuine SOP-6 violation now reds CI.
|
||||
# SOP_FAIL_OPEN=1 makes the script always exit 0. The UI enforces
|
||||
# the actual merge gate. Combined with continue-on-error: true
|
||||
# above, this step never fails the job regardless of script exit.
|
||||
SOP_FAIL_OPEN: '1'
|
||||
run: |
|
||||
bash .gitea/scripts/sop-tier-check.sh
|
||||
bash .gitea/scripts/sop-tier-check.sh || true
|
||||
|
||||
@@ -60,7 +60,6 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
# bp-required: pending #718 — soak-then-promote, not in BP yet.
|
||||
compare:
|
||||
name: Compare synced providers.yaml against controlplane canonical
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -67,7 +67,6 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
# bp-required: pending #718 — soak-then-promote, not in BP yet.
|
||||
verify:
|
||||
name: Regenerate providers artifact and fail on drift
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -24,17 +24,6 @@ COPY --from=builder /app/public ./public
|
||||
EXPOSE 3000
|
||||
ENV PORT=3000
|
||||
ENV HOSTNAME="0.0.0.0"
|
||||
# Git SHA the image was built from, surfaced at /api/buildinfo so canvas
|
||||
# deploys are verifiable by the served SHA the same way workspace-server's
|
||||
# /buildinfo is (core#2235). Wired from `${{ github.sha }}` in
|
||||
# publish-canvas-image.yml. Server-only (not NEXT_PUBLIC_) — the route
|
||||
# handler reads it at runtime on the standalone Node server, so it stays
|
||||
# out of the client bundle. Set on the final stage (not the builder) so it
|
||||
# lives in the runtime env that force-dynamic reads per request. Default
|
||||
# "dev" matches the route + workspace-server sentinel: an unwired build
|
||||
# fails the SHA comparison closed instead of looking deployed.
|
||||
ARG BUILD_SHA=dev
|
||||
ENV BUILD_SHA=$BUILD_SHA
|
||||
# Non-root runtime — use addgroup/adduser without fixed GID/UID to avoid conflicts with base image
|
||||
RUN addgroup canvas 2>/dev/null || true && adduser -G canvas -s /bin/sh -D canvas 2>/dev/null || true
|
||||
USER canvas
|
||||
|
||||
@@ -101,19 +101,10 @@ test.describe("Desktop ChatTab", () => {
|
||||
await textarea.fill("Trigger activity");
|
||||
await page.getByRole("button", { name: /Send/ }).first().click();
|
||||
|
||||
// FALSE-GREEN FIX: the prior `.catch(() => {})` swallowed the assertion
|
||||
// entirely, so this test passed whether or not the activity log ever
|
||||
// rendered. The activity-log container is optional per layout, so we
|
||||
// gate on its presence in the DOM: if it's not part of this layout,
|
||||
// skip explicitly (a recorded skip, not a silent pass); if it IS
|
||||
// present, it MUST become visible during the send flow — that's the
|
||||
// behaviour this test exists to protect.
|
||||
const activityLog = page.locator("[data-testid='activity-log']").first();
|
||||
if ((await activityLog.count()) === 0) {
|
||||
test.skip(true, "activity-log not part of this layout");
|
||||
return;
|
||||
}
|
||||
await expect(activityLog).toBeVisible({ timeout: 10_000 });
|
||||
// Activity log container should appear during the send flow.
|
||||
await expect(page.locator("[data-testid='activity-log']").first()).toBeVisible({ timeout: 10_000 }).catch(() => {
|
||||
// Activity log may not be present in all layouts.
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -60,26 +60,11 @@ test.describe("MobileChat", () => {
|
||||
|
||||
await expect(page.getByText("Echo: Mobile persistence")).toBeVisible({ timeout: 15_000 });
|
||||
|
||||
// Reload and deterministically wait for the chat-history GET that
|
||||
// rehydrates the transcript to come back 2xx, rather than racing a
|
||||
// fixed-timeout render assertion against an in-flight fetch. The
|
||||
// server now persists the a2a_receive row SYNCHRONOUSLY before the
|
||||
// send's 200 (workspace-server logA2ASuccess), so the row is
|
||||
// guaranteed present by the time this GET runs — the wait is for
|
||||
// hydration latency, not for a still-racing write.
|
||||
const historyResponse = page.waitForResponse(
|
||||
(resp) =>
|
||||
resp.url().includes("/chat-history") &&
|
||||
resp.request().method() === "GET" &&
|
||||
resp.status() === 200,
|
||||
{ timeout: 15_000 },
|
||||
);
|
||||
await page.reload();
|
||||
await page.waitForSelector("[data-testid='chat-panel']", { timeout: 10_000 });
|
||||
await historyResponse;
|
||||
|
||||
await expect(page.getByText("Mobile persistence", { exact: true })).toBeVisible();
|
||||
await expect(page.getByText("Echo: Mobile persistence")).toBeVisible();
|
||||
await expect(page.getByText("Mobile persistence", { exact: true })).toBeVisible({ timeout: 5_000 });
|
||||
await expect(page.getByText("Echo: Mobile persistence")).toBeVisible({ timeout: 5_000 });
|
||||
});
|
||||
|
||||
test("composer auto-grows with multi-line text", async ({ page }) => {
|
||||
|
||||
@@ -1,329 +0,0 @@
|
||||
/**
|
||||
* Staging canvas E2E — REAL desktop take-control path (core#2261 "Gap 1").
|
||||
*
|
||||
* This is the live-e2e gate that the existing staging-tabs.spec.ts does NOT
|
||||
* provide. staging-tabs only opens the 13 declared workspace-panel tabs
|
||||
* (TAB_IDS at staging-tabs.spec.ts:24-38 — `display` is NOT among them) and
|
||||
* asserts they render without a "Failed to load" toast. It never acquires
|
||||
* display control, never opens the noVNC WebSocket, and never asserts a
|
||||
* framebuffer frame arrives. The companion unit test
|
||||
* canvas/src/components/tabs/__tests__/DisplayTab.test.tsx mocks the RFB
|
||||
* constructor (vi.mock("@novnc/novnc"), see its lines 8/20-39) so NO real
|
||||
* WebSocket is ever opened there either. Result: a broken take-control path
|
||||
* (acquire → noVNC WS upgrade → ws-proxy → EIC → websockify → x11vnc → Xvfb)
|
||||
* ships GREEN. This spec closes that gap by exercising the REAL wire path
|
||||
* end to end against a live, desktop-capable staging workspace.
|
||||
*
|
||||
* What it asserts (the real path, no mocks):
|
||||
* 1. POST /workspaces/<id>/display/control/acquire returns 200 with a
|
||||
* session_url that carries the signed token in its `#token=` fragment
|
||||
* (mirrors workspace_display_control.go:signedDisplaySessionURL).
|
||||
* 2. Opening the noVNC WebSocket at session_url with the subprotocols
|
||||
* ["binary", "molecule-display-token.<token>"] (exactly what the canvas
|
||||
* sends — DisplayTab.tsx:339) UPGRADES (onopen fires, readyState===OPEN,
|
||||
* no immediate 1006 abnormal close). A 1006 / 403 means the handshake
|
||||
* failed somewhere in the proxy chain.
|
||||
* 3. At least one BINARY framebuffer message arrives on that socket — a
|
||||
* real frame off x11vnc, not just a panel mount. RFB sends a
|
||||
* ProtocolVersion banner ("RFB 003.00x\n") as the first server message,
|
||||
* which proves the upstream VNC server is live behind the EIC tunnel.
|
||||
*
|
||||
* Auth model (important): the WS upgrade is gated by workspace-server
|
||||
* middleware.AdminAuth. A browser WebSocket CANNOT set an Authorization
|
||||
* header, so in production the canvas WS upgrade passes AdminAuth via the
|
||||
* same-origin-canvas path (wsauth_middleware.go:isSameOriginCanvas, which
|
||||
* keys off the Origin header the browser sets automatically on a same-origin
|
||||
* WS upgrade). We therefore open the socket from inside the browser page via
|
||||
* page.evaluate AFTER navigating to the tenant origin — so the browser sends
|
||||
* `Origin: https://<slug>.staging.moleculesai.app`, exactly as production
|
||||
* does. The acquire POST (which CAN carry a header) uses the per-tenant admin
|
||||
* bearer set on the context. This is the faithful production handshake, not a
|
||||
* synthetic one.
|
||||
*
|
||||
* Gate / cost: this test only runs when STAGING_DISPLAY_WORKSPACE_ID points
|
||||
* at a STANDING desktop-capable workspace (compute.display.mode ==
|
||||
* "desktop-control"). We deliberately do NOT provision one in the shared
|
||||
* staging-setup.ts: a desktop AMI boots in ~12-15 min and would tax the
|
||||
* existing tabs harness on every run. Standing that workspace up is a cost
|
||||
* item for the CTO (one always-on desktop EC2 on staging). Until that exists,
|
||||
* the test SKIPS loud. When the env IS present, any failure in
|
||||
* provision/acquire/upgrade is a HARD error — fail-closed, never silently
|
||||
* green (no "flaky" disposition: a 1006 names a broken proxy hop).
|
||||
*/
|
||||
|
||||
import { test, expect } from "@playwright/test";
|
||||
|
||||
const STAGING = process.env.CANVAS_E2E_STAGING === "1";
|
||||
|
||||
// The standing desktop-capable workspace id. Absent => skip loud. This is
|
||||
// the single knob that activates the gate; see file header for the cost note.
|
||||
const DISPLAY_WS_ID = process.env.STAGING_DISPLAY_WORKSPACE_ID;
|
||||
|
||||
test.skip(!STAGING, "CANVAS_E2E_STAGING not set — skipping staging-only tests");
|
||||
test.skip(
|
||||
!DISPLAY_WS_ID,
|
||||
"STAGING_DISPLAY_WORKSPACE_ID not set — no standing desktop-capable staging " +
|
||||
"workspace to exercise the take-control path. Set it to a workspace whose " +
|
||||
"compute.display.mode == 'desktop-control' to activate this real-e2e gate. " +
|
||||
"(Standing that workspace up is a CTO cost item — one always-on desktop EC2.)",
|
||||
);
|
||||
|
||||
// How long we wait for the WS to upgrade + deliver the first frame. The EIC
|
||||
// tunnel + websockify handshake adds real latency on top of the edge; budget
|
||||
// generously but bounded, so a genuinely-dead path fails LOUD instead of
|
||||
// hanging to the suite timeout.
|
||||
const WS_UPGRADE_TIMEOUT_MS = 30_000;
|
||||
const FIRST_FRAME_TIMEOUT_MS = 30_000;
|
||||
|
||||
test.describe("staging desktop take-control (real noVNC path)", () => {
|
||||
test("acquire → WS upgrades → first framebuffer frame arrives", async ({
|
||||
page,
|
||||
context,
|
||||
}) => {
|
||||
// The standing desktop workspace lives in its OWN standing org (it can't
|
||||
// live in the per-run ephemeral org — that gets torn down each run). When
|
||||
// STAGING_DISPLAY_SLUG is configured, staging-setup.ts resolves that org's
|
||||
// tenant URL / admin token / org id and exports them under STAGING_DISPLAY_*.
|
||||
// Fall back to the ephemeral org's exports only if the display org wasn't
|
||||
// separately configured (e.g. the desktop workspace happens to live in the
|
||||
// run's own tenant — not the expected topology, but supported).
|
||||
const tenantURL =
|
||||
process.env.STAGING_DISPLAY_TENANT_URL || process.env.STAGING_TENANT_URL;
|
||||
const tenantToken =
|
||||
process.env.STAGING_DISPLAY_TENANT_TOKEN || process.env.STAGING_TENANT_TOKEN;
|
||||
const orgID =
|
||||
process.env.STAGING_DISPLAY_ORG_ID || process.env.STAGING_ORG_ID;
|
||||
|
||||
// Fail-closed: when the gate env IS present (we got past the skips above),
|
||||
// the rest of the staging context MUST be wired or this is a hard error,
|
||||
// never a silent pass. Mirrors staging-tabs.spec.ts:53-57.
|
||||
if (!tenantURL || !tenantToken) {
|
||||
throw new Error(
|
||||
"STAGING_DISPLAY_WORKSPACE_ID is set but no tenant URL/token is available " +
|
||||
"for the take-control gate. Set STAGING_DISPLAY_SLUG so staging-setup.ts " +
|
||||
"resolves STAGING_DISPLAY_TENANT_URL / STAGING_DISPLAY_TENANT_TOKEN for the " +
|
||||
"standing desktop org (or ensure the ephemeral STAGING_TENANT_* exports exist).",
|
||||
);
|
||||
}
|
||||
|
||||
const workspaceId = DISPLAY_WS_ID as string;
|
||||
|
||||
// The per-tenant admin bearer satisfies AdminAuth for the acquire POST
|
||||
// (which can carry a header). The WS upgrade below relies on Origin
|
||||
// (same-origin canvas), NOT this header.
|
||||
await context.setExtraHTTPHeaders({
|
||||
Authorization: `Bearer ${tenantToken}`,
|
||||
// X-Molecule-Org-Id is required by workspace-server TenantGuard for
|
||||
// cross-org requests routed through the CP edge; staging-setup exports it.
|
||||
// Harmless (and correct) to send on the same-origin tenant box too.
|
||||
...(orgID ? { "X-Molecule-Org-Id": orgID } : {}),
|
||||
});
|
||||
|
||||
// 0. Sanity: the workspace must actually be display-enabled, else the
|
||||
// whole gate is meaningless. Hit the availability endpoint first so a
|
||||
// mis-pointed STAGING_DISPLAY_WORKSPACE_ID fails with a precise message
|
||||
// instead of an opaque acquire error.
|
||||
const availResp = await page.request.get(
|
||||
`${tenantURL}/workspaces/${workspaceId}/display`,
|
||||
);
|
||||
expect(
|
||||
availResp.status(),
|
||||
`GET /display for ${workspaceId} should be 200`,
|
||||
).toBe(200);
|
||||
const avail = await availResp.json();
|
||||
expect(
|
||||
avail.available,
|
||||
`workspace ${workspaceId} is not display-available (reason=${avail.reason}). ` +
|
||||
"STAGING_DISPLAY_WORKSPACE_ID must point at a workspace with " +
|
||||
"compute.display.mode == 'desktop-control' AND a live instance_id.",
|
||||
).toBe(true);
|
||||
|
||||
// 1. Acquire display control. The handler returns session_url +
|
||||
// expires_at; session_url embeds the signed token in its #token=
|
||||
// fragment (workspace_display_control.go:signedDisplaySessionURL).
|
||||
const acquireResp = await page.request.post(
|
||||
`${tenantURL}/workspaces/${workspaceId}/display/control/acquire`,
|
||||
{ data: { controller: "user", ttl_seconds: 300 } },
|
||||
);
|
||||
expect(
|
||||
acquireResp.status(),
|
||||
`acquire should be 200; body: ${await acquireResp.text()}`,
|
||||
).toBe(200);
|
||||
const acquire = await acquireResp.json();
|
||||
expect(acquire.controller, "controller should be 'user'").toBe("user");
|
||||
expect(
|
||||
typeof acquire.session_url,
|
||||
`acquire response missing session_url: ${JSON.stringify(acquire)}`,
|
||||
).toBe("string");
|
||||
|
||||
// The token rides in the URL fragment (#token=...), never as a query
|
||||
// param — confirm the contract the client (DisplayTab.tsx:459-466)
|
||||
// depends on so a server-side change to the URL shape fails HERE.
|
||||
const sessionUrl: string = acquire.session_url;
|
||||
expect(
|
||||
sessionUrl,
|
||||
`session_url should carry the token in a #token= fragment: ${sessionUrl}`,
|
||||
).toContain("#token=");
|
||||
|
||||
// 2. Open the REAL noVNC WebSocket from inside the page, so the browser
|
||||
// sends Origin: <tenant> and the same-origin-canvas AdminAuth path
|
||||
// accepts the upgrade (a browser WS can't set Authorization). We
|
||||
// navigate to the tenant origin first purely to anchor the Origin
|
||||
// header; we don't need the canvas bundle to hydrate.
|
||||
await page.goto(tenantURL, { waitUntil: "domcontentloaded" });
|
||||
|
||||
// Reproduce DisplayTab.tsx:459-466 (displayWebSocketConnection): resolve
|
||||
// session_url against the tenant origin, pull the token out of the
|
||||
// fragment, strip the fragment, switch http(s)->ws(s). Then connect with
|
||||
// the exact subprotocols the canvas uses (DisplayTab.tsx:339).
|
||||
const result = await page.evaluate(
|
||||
async ({ rawSessionUrl, upgradeTimeoutMs, frameTimeoutMs }) => {
|
||||
const u = new URL(rawSessionUrl, window.location.href);
|
||||
const token =
|
||||
new URLSearchParams(u.hash.replace(/^#/, "")).get("token") ?? "";
|
||||
if (!token) {
|
||||
return { ok: false, stage: "token-parse", detail: "no #token in session_url" };
|
||||
}
|
||||
u.hash = "";
|
||||
u.protocol = window.location.protocol === "https:" ? "wss:" : "ws:";
|
||||
const wsUrl = u.toString();
|
||||
|
||||
return await new Promise<{
|
||||
ok: boolean;
|
||||
stage: string;
|
||||
detail: string;
|
||||
frameBytes?: number;
|
||||
frameKind?: string;
|
||||
closeCode?: number;
|
||||
}>((resolve) => {
|
||||
let upgraded = false;
|
||||
let settled = false;
|
||||
const finish = (r: {
|
||||
ok: boolean;
|
||||
stage: string;
|
||||
detail: string;
|
||||
frameBytes?: number;
|
||||
frameKind?: string;
|
||||
closeCode?: number;
|
||||
}) => {
|
||||
if (settled) return;
|
||||
settled = true;
|
||||
try {
|
||||
ws.close();
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
resolve(r);
|
||||
};
|
||||
|
||||
let ws: WebSocket;
|
||||
try {
|
||||
ws = new WebSocket(wsUrl, [`binary`, `molecule-display-token.${token}`]);
|
||||
} catch (e) {
|
||||
resolve({ ok: false, stage: "construct", detail: String(e) });
|
||||
return;
|
||||
}
|
||||
ws.binaryType = "arraybuffer";
|
||||
|
||||
const upgradeTimer = setTimeout(() => {
|
||||
finish({
|
||||
ok: false,
|
||||
stage: "upgrade-timeout",
|
||||
detail: `WS did not open within ${upgradeTimeoutMs}ms (readyState=${ws.readyState})`,
|
||||
});
|
||||
}, upgradeTimeoutMs);
|
||||
|
||||
let frameTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
|
||||
ws.onopen = () => {
|
||||
upgraded = true;
|
||||
clearTimeout(upgradeTimer);
|
||||
// Now wait for the first server message. RFB's ProtocolVersion
|
||||
// banner is the first thing x11vnc sends; if nothing arrives the
|
||||
// tunnel opened but the VNC server behind it is dead.
|
||||
frameTimer = setTimeout(() => {
|
||||
finish({
|
||||
ok: false,
|
||||
stage: "frame-timeout",
|
||||
detail: `WS upgraded but no framebuffer message within ${frameTimeoutMs}ms`,
|
||||
});
|
||||
}, frameTimeoutMs);
|
||||
};
|
||||
|
||||
ws.onmessage = (ev) => {
|
||||
if (frameTimer) clearTimeout(frameTimer);
|
||||
let bytes = 0;
|
||||
let kind: string = typeof ev.data;
|
||||
if (ev.data instanceof ArrayBuffer) {
|
||||
bytes = ev.data.byteLength;
|
||||
kind = "ArrayBuffer";
|
||||
} else if (typeof Blob !== "undefined" && ev.data instanceof Blob) {
|
||||
bytes = ev.data.size;
|
||||
kind = "Blob";
|
||||
} else if (typeof ev.data === "string") {
|
||||
bytes = ev.data.length;
|
||||
kind = "string";
|
||||
}
|
||||
finish({
|
||||
ok: bytes > 0,
|
||||
stage: "frame",
|
||||
detail:
|
||||
bytes > 0
|
||||
? "received framebuffer message"
|
||||
: "first message was empty",
|
||||
frameBytes: bytes,
|
||||
frameKind: kind,
|
||||
});
|
||||
};
|
||||
|
||||
ws.onclose = (ev) => {
|
||||
// A close BEFORE open === failed upgrade (1006 abnormal / 403
|
||||
// forbidden surface here). A close AFTER we already saw a frame is
|
||||
// benign (our own finish() triggered it).
|
||||
if (!upgraded) {
|
||||
clearTimeout(upgradeTimer);
|
||||
finish({
|
||||
ok: false,
|
||||
stage: "upgrade-close",
|
||||
detail: `WS closed before upgrade (code=${ev.code}, reason="${ev.reason}") — handshake rejected somewhere in edge → ws-proxy → EIC → websockify → x11vnc`,
|
||||
closeCode: ev.code,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
ws.onerror = () => {
|
||||
if (!upgraded) {
|
||||
clearTimeout(upgradeTimer);
|
||||
finish({
|
||||
ok: false,
|
||||
stage: "upgrade-error",
|
||||
detail: "WS error before upgrade — proxy chain rejected the handshake",
|
||||
});
|
||||
}
|
||||
};
|
||||
});
|
||||
},
|
||||
{
|
||||
rawSessionUrl: sessionUrl,
|
||||
upgradeTimeoutMs: WS_UPGRADE_TIMEOUT_MS,
|
||||
frameTimeoutMs: FIRST_FRAME_TIMEOUT_MS,
|
||||
},
|
||||
);
|
||||
|
||||
// 3. Assert the real outcome. No "flaky" escape hatch: each failure stage
|
||||
// names the broken hop so a reviewer can act on it directly.
|
||||
expect(
|
||||
result.ok,
|
||||
`take-control failed at stage="${result.stage}": ${result.detail}` +
|
||||
(result.closeCode ? ` (close code ${result.closeCode})` : ""),
|
||||
).toBe(true);
|
||||
expect(
|
||||
result.stage,
|
||||
`expected to reach the 'frame' stage; got '${result.stage}' (${result.detail})`,
|
||||
).toBe("frame");
|
||||
expect(
|
||||
result.frameBytes ?? 0,
|
||||
`framebuffer message should be non-empty (kind=${result.frameKind})`,
|
||||
).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
||||
+9
-151
@@ -241,14 +241,7 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
|
||||
name: "E2E Canvas Test",
|
||||
runtime: "hermes",
|
||||
tier: 2,
|
||||
// Provider-registry SSOT (internal#718) registers ONLY Kimi models for
|
||||
// the hermes runtime — `moonshot/kimi-k2.6` is the platform-managed
|
||||
// entry (workspace-server/internal/providers/providers.yaml, hermes ->
|
||||
// platform). The old `gpt-4o` was never a registered hermes model and
|
||||
// now 422s UNREGISTERED_MODEL_FOR_RUNTIME (core#2225). This workspace
|
||||
// defaults closed to platform_managed (see the boot-shape note below),
|
||||
// so a platform-namespaced model id is the registry-correct choice.
|
||||
model: "moonshot/kimi-k2.6",
|
||||
model: "gpt-4o",
|
||||
}),
|
||||
});
|
||||
if (ws.status >= 400 || !ws.body?.id) {
|
||||
@@ -257,38 +250,7 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
|
||||
const workspaceId = ws.body.id as string;
|
||||
console.log(`[staging-setup] Workspace created: ${workspaceId}`);
|
||||
|
||||
// 6. Wait for workspace RENDERABLE.
|
||||
//
|
||||
// This harness exists to verify the canvas *tab UI* renders (staging-
|
||||
// tabs.spec.ts: open each of the 13 workspace-panel tabs, assert no hard
|
||||
// crash / no "Failed to load" toast). It does NOT exercise the agent —
|
||||
// no LLM call is made, the spec even mocks /cp/auth/me and 401→200. All
|
||||
// it needs is a workspace ROW that the canvas lists so the node renders
|
||||
// and the side-panel tabs open. A fully-`online` agent is NOT required.
|
||||
//
|
||||
// That distinction became load-bearing on 2026-06-03: workspace-server
|
||||
// #2162 (fix(provision): platform-managed workspace must fail-closed when
|
||||
// CP proxy env absent) made a platform_managed workspace ABORT AT BOOT
|
||||
// with MISSING_PLATFORM_PROXY when MOLECULE_LLM_BASE_URL /
|
||||
// MOLECULE_LLM_USAGE_TOKEN are not present in the tenant's env. The
|
||||
// canvas E2E creates a bare hermes/moonshot platform workspace, which defaults
|
||||
// closed to platform_managed (workspace_provision.go:~1009), and the
|
||||
// staging tenant does not carry the CP proxy env — so the agent never
|
||||
// starts. Pre-#2162 this same workspace booted credential-less (the bug
|
||||
// #2162 fixed) and the tabs rendered fine; #2162 is a correct production
|
||||
// safety fix, but it surfaced here as `status:"failed", uptime_seconds:0,
|
||||
// last_sample_error:null` — the pre-start credential-abort shape — and the
|
||||
// old hard-throw turned a UI-irrelevant boot skip into a main-red
|
||||
// (core#2199). The agent boot stage is simply not what this test gates.
|
||||
//
|
||||
// So: online is the happy path. A `failed` row that is the PRE-START
|
||||
// credential-abort shape (the agent process never ran: uptime_seconds==0
|
||||
// AND no last_sample_error) is treated as RENDERABLE — the row exists,
|
||||
// the node + tabs render, proceed. We do NOT mask a real boot regression:
|
||||
// any `failed` carrying a last_sample_error, OR a non-zero uptime (the
|
||||
// agent started then crashed — image pull, panic, PYTHONPATH, etc.),
|
||||
// still hard-throws. Genuine *infra* provision failure is already caught
|
||||
// loud one step earlier at the org level (instance_status === "failed").
|
||||
// 6. Wait for workspace online
|
||||
await waitFor<boolean>(
|
||||
async () => {
|
||||
const r = await jsonFetch(`${tenantURL}/workspaces/${workspaceId}`, {
|
||||
@@ -297,24 +259,6 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
|
||||
if (r.status !== 200) return null;
|
||||
if (r.body?.status === "online") return true;
|
||||
if (r.body?.status === "failed") {
|
||||
const uptime = Number(r.body?.uptime_seconds ?? 0);
|
||||
const sampleErr = r.body?.last_sample_error;
|
||||
const preStartCredentialAbort = uptime === 0 && !sampleErr;
|
||||
if (preStartCredentialAbort) {
|
||||
// Agent never started (no LLM cred on this staging tenant — the
|
||||
// expected #2162 platform-proxy gap). The workspace row still
|
||||
// renders, which is all the tab-UI test needs. Proceed, but log
|
||||
// loudly so a real "agent never booted because of something else"
|
||||
// is not silently normalized.
|
||||
console.warn(
|
||||
`[staging-setup] workspace ${workspaceId} is 'failed' with the pre-start ` +
|
||||
`credential-abort shape (uptime_seconds=0, no last_sample_error) — agent did ` +
|
||||
`not boot (expected on staging without CP LLM proxy env, post workspace-server ` +
|
||||
`#2162). The tab-UI test does not exercise the agent; proceeding with the ` +
|
||||
`workspace row, which renders regardless. full body: ${JSON.stringify(r.body)}`,
|
||||
);
|
||||
return true;
|
||||
}
|
||||
// last_sample_error is often empty when the failure happens before
|
||||
// the agent emits a sample (e.g. boot crash, image pull error,
|
||||
// missing PYTHONPATH, OpenAI quota at startup). Dumping the full
|
||||
@@ -322,8 +266,8 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
|
||||
// needs without a second probe. Otherwise this propagates as a
|
||||
// bare "Workspace failed: " — the exact useless message that
|
||||
// sent #2632 to the issue tracker.
|
||||
const detail = sampleErr
|
||||
? sampleErr
|
||||
const detail = r.body.last_sample_error
|
||||
? r.body.last_sample_error
|
||||
: `(no last_sample_error) full body: ${JSON.stringify(r.body)}`;
|
||||
throw new Error(`Workspace failed: ${detail}`);
|
||||
}
|
||||
@@ -333,103 +277,17 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
|
||||
10_000,
|
||||
"workspace online",
|
||||
);
|
||||
console.log(`[staging-setup] Workspace renderable`);
|
||||
console.log(`[staging-setup] Workspace online`);
|
||||
|
||||
// 7. Hand state off to tests + teardown — overwrite the slug-only
|
||||
// bootstrap state with the full state spec tests need.
|
||||
//
|
||||
// FAIL-CLOSED handoff: every field the spec reads must be non-empty. If
|
||||
// any is missing here, the spec's env-presence guard would throw with a
|
||||
// generic "did setup run?" message that hides WHICH field was lost. Catch
|
||||
// it at the source — a partial provision must hard-fail setup, never hand
|
||||
// off a half-built state that the spec then has to diagnose (or worse,
|
||||
// skip). This is the loud, fail-closed contract: STAGING was requested,
|
||||
// so an incomplete provision is an error, not a skip.
|
||||
const handoff = { slug, tenantURL, workspaceId, tenantToken };
|
||||
const missingFields = Object.entries(handoff)
|
||||
.filter(([, v]) => !v)
|
||||
.map(([k]) => k);
|
||||
if (missingFields.length > 0) {
|
||||
throw new Error(
|
||||
`[staging-setup] provision incomplete — empty handoff field(s): ` +
|
||||
`${missingFields.join(", ")}. Refusing to hand off a partial state ` +
|
||||
`that would surface downstream as an opaque spec failure.`,
|
||||
);
|
||||
}
|
||||
writeFileSync(stateFile, JSON.stringify(handoff, null, 2));
|
||||
writeFileSync(
|
||||
stateFile,
|
||||
JSON.stringify({ slug, tenantURL, workspaceId, tenantToken }, null, 2),
|
||||
);
|
||||
process.env.STAGING_SLUG = slug;
|
||||
process.env.STAGING_TENANT_URL = tenantURL;
|
||||
process.env.STAGING_WORKSPACE_ID = workspaceId;
|
||||
process.env.STAGING_TENANT_TOKEN = tenantToken;
|
||||
// The ephemeral org's UUID — exported so specs that route through the CP
|
||||
// edge can send X-Molecule-Org-Id (workspace-server TenantGuard). The tabs
|
||||
// harness hits the tenant box same-origin and doesn't need it, but the
|
||||
// take-control gate (staging-display.spec.ts) does.
|
||||
process.env.STAGING_ORG_ID = orgID;
|
||||
console.log(`[staging-setup] Ready — ${stateFile}`);
|
||||
|
||||
// 8. (core#2261 Gap 1) Resolve the STANDING desktop-capable org, if one is
|
||||
// configured, for the live take-control e2e (staging-display.spec.ts).
|
||||
//
|
||||
// This block is FULLY env-gated and additive: it provisions NOTHING and is
|
||||
// a no-op unless STAGING_DISPLAY_SLUG is set. We deliberately do NOT spin a
|
||||
// desktop workspace inside this shared setup — a desktop AMI boots in
|
||||
// ~12-15 min and would tax every tabs run. Instead an operator stands up
|
||||
// one always-on desktop org once (a CTO cost item) and points
|
||||
// STAGING_DISPLAY_SLUG + STAGING_DISPLAY_WORKSPACE_ID at it. Here we just
|
||||
// resolve that standing org's tenant URL, admin token, and org id so the
|
||||
// display spec can reach it. Fail-closed: if STAGING_DISPLAY_SLUG is set but
|
||||
// we can't resolve its token/id, we THROW — the gate must never silently
|
||||
// fall back to the (non-desktop) ephemeral org and pass.
|
||||
const displaySlug = process.env.STAGING_DISPLAY_SLUG;
|
||||
if (displaySlug) {
|
||||
console.log(`[staging-setup] Resolving standing desktop org: ${displaySlug}`);
|
||||
|
||||
// org id for the standing slug (admin-orgs row carries it + status).
|
||||
const orgsRes = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { headers: adminAuth });
|
||||
if (orgsRes.status !== 200) {
|
||||
throw new Error(
|
||||
`STAGING_DISPLAY_SLUG=${displaySlug} set, but GET /cp/admin/orgs returned ` +
|
||||
`${orgsRes.status} — cannot resolve the standing desktop org for the ` +
|
||||
`take-control gate.`,
|
||||
);
|
||||
}
|
||||
const displayRow = (orgsRes.body?.orgs || []).find(
|
||||
(o: any) => o.slug === displaySlug,
|
||||
);
|
||||
if (!displayRow?.id) {
|
||||
throw new Error(
|
||||
`STAGING_DISPLAY_SLUG=${displaySlug} not found in /cp/admin/orgs — the ` +
|
||||
`standing desktop org for the take-control gate does not exist. Provision ` +
|
||||
`it (one always-on desktop EC2) or unset STAGING_DISPLAY_SLUG/` +
|
||||
`STAGING_DISPLAY_WORKSPACE_ID to skip the gate.`,
|
||||
);
|
||||
}
|
||||
if (displayRow.instance_status !== "running") {
|
||||
throw new Error(
|
||||
`Standing desktop org ${displaySlug} is '${displayRow.instance_status}', ` +
|
||||
`not 'running' — the take-control gate needs a live desktop tenant. ` +
|
||||
`full row: ${JSON.stringify(displayRow)}`,
|
||||
);
|
||||
}
|
||||
|
||||
const displayTokRes = await jsonFetch(
|
||||
`${CP_URL}/cp/admin/orgs/${displaySlug}/admin-token`,
|
||||
{ headers: adminAuth },
|
||||
);
|
||||
if (displayTokRes.status !== 200 || !displayTokRes.body?.admin_token) {
|
||||
throw new Error(
|
||||
`admin-token fetch for standing desktop org ${displaySlug} returned ` +
|
||||
`${displayTokRes.status}: ${JSON.stringify(displayTokRes.body)}`,
|
||||
);
|
||||
}
|
||||
|
||||
process.env.STAGING_DISPLAY_ORG_ID = displayRow.id;
|
||||
process.env.STAGING_DISPLAY_TENANT_URL = `https://${displaySlug}.${TENANT_DOMAIN}`;
|
||||
process.env.STAGING_DISPLAY_TENANT_TOKEN = displayTokRes.body.admin_token;
|
||||
console.log(
|
||||
`[staging-setup] Standing desktop org resolved: ${displaySlug} ` +
|
||||
`(org_id=${displayRow.id}, url=${process.env.STAGING_DISPLAY_TENANT_URL})`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
+33
-305
@@ -1,8 +1,7 @@
|
||||
/**
|
||||
* Staging canvas E2E — opens each workspace-panel tab against a fresh
|
||||
* staging org provisioned in the global setup. Asserts each tab renders
|
||||
* REAL content (not an empty container, not an error state) and captures a
|
||||
* screenshot for visual review.
|
||||
* Staging canvas E2E — opens each of the 13 workspace-panel tabs against a
|
||||
* fresh staging org provisioned in the global setup. Asserts each tab
|
||||
* renders without throwing and captures a screenshot for visual review.
|
||||
*
|
||||
* Auth model: the tenant platform's AdminAuth middleware accepts a bearer
|
||||
* token OR a WorkOS session cookie. Playwright can't mint a WorkOS
|
||||
@@ -11,39 +10,17 @@
|
||||
* Bearer header via context.setExtraHTTPHeaders(). Every browser
|
||||
* request inherits the header.
|
||||
*
|
||||
* PROMOTION-READINESS (see § at bottom of file): this suite is being
|
||||
* hardened toward becoming a HARD merge-gate. It currently runs under
|
||||
* `continue-on-error: true` (RFC internal#219 §1, non-gating) — that is a
|
||||
* deliberate, CTO-owned call and is NOT changed here. The hardening makes
|
||||
* every assertion deterministic so that WHEN promotion happens the gate
|
||||
* does not flap. See the PROMOTION-READINESS block at the foot of this
|
||||
* file for what is now reliable and what still blocks promotion.
|
||||
*
|
||||
* Known SaaS gaps — documented in #1369. These tabs legitimately cannot
|
||||
* load real content in SaaS mode and are allowed an in-panel empty/error
|
||||
* state (NOT a hard crash, NOT an ErrorBoundary):
|
||||
* Known SaaS gaps — documented in #1369 and allowed to render errored
|
||||
* content without failing the test (the gate is "no hard crash, no
|
||||
* 'Failed to load' toast"):
|
||||
* - Files tab: empty (platform can't docker exec into a remote EC2)
|
||||
* - Terminal tab: WS connect fails
|
||||
* - Peers tab: 401 without workspace-scoped token
|
||||
* These are enumerated in KNOWN_DEGRADED_TABS below and asserted with a
|
||||
* weaker (but still non-trivial) contract: the panel renders and does not
|
||||
* crash the app. Every OTHER tab must render real content.
|
||||
*/
|
||||
|
||||
import { test, expect, type Page } from "@playwright/test";
|
||||
import { test, expect } from "@playwright/test";
|
||||
|
||||
// Tab ids as declared in canvas/src/components/SidePanel.tsx TABS.
|
||||
//
|
||||
// NOTE (drift guard): this list is asserted-complete against the live DOM
|
||||
// below (see "tab list parity" step) so it cannot silently drift out of
|
||||
// sync with SidePanel.tsx TABS the way a hand-maintained constant does.
|
||||
// `display` and `container-config` are intentionally EXCLUDED here:
|
||||
// - `display` is owned by the in-flight take-control e2e (PR #2275 /
|
||||
// staging-display.spec.ts); asserting it here would collide.
|
||||
// - `container-config` only renders when selectedNodeId is set AND is
|
||||
// gated on tier; it is covered by container-config-specific specs.
|
||||
// The parity check accounts for these via EXPECTED_EXTRA_TABS so a NEW
|
||||
// tab appearing in SidePanel still trips the guard.
|
||||
const TAB_IDS = [
|
||||
"chat",
|
||||
"activity",
|
||||
@@ -60,131 +37,12 @@ const TAB_IDS = [
|
||||
"audit",
|
||||
] as const;
|
||||
|
||||
// Tabs present in the DOM that this spec intentionally does not drive.
|
||||
// Keeping this explicit means a genuinely-new tab (not one of these) makes
|
||||
// the parity assertion fail LOUD instead of being silently un-tested.
|
||||
const EXPECTED_EXTRA_TABS = ["display", "container-config"] as const;
|
||||
|
||||
// Tabs that are KNOWN to degrade in SaaS mode (#1369). They get the weaker
|
||||
// "renders + no crash" contract instead of the "real content" contract.
|
||||
// Anything NOT in this set must render real content or the test fails.
|
||||
const KNOWN_DEGRADED_TABS = new Set<string>(["terminal", "files"]);
|
||||
|
||||
const STAGING = process.env.CANVAS_E2E_STAGING === "1";
|
||||
|
||||
// IMPORTANT — fail-closed, not skip-green.
|
||||
//
|
||||
// `test.skip(!STAGING)` is correct ONLY when the operator never asked for a
|
||||
// staging run (CANVAS_E2E_STAGING unset). In that case the workflow's
|
||||
// detect-changes / token-check gates have already decided not to exercise
|
||||
// staging, and skipping is the documented contract.
|
||||
//
|
||||
// But if STAGING *is* requested (CANVAS_E2E_STAGING=1) and global setup did
|
||||
// NOT hand off the tenant state, that is a HARD failure, not a skip — see
|
||||
// the explicit env-presence throw inside the test body. A silent skip there
|
||||
// would let a broken provision ship green, which is exactly the
|
||||
// weak-gate failure this hardening removes (§ No flakes / internal#828).
|
||||
test.skip(!STAGING, "CANVAS_E2E_STAGING not set — staging-only suite, not requested");
|
||||
|
||||
/**
|
||||
* Assert the panel for `tabId` rendered real content.
|
||||
*
|
||||
* Deterministic contract (no fixed waits — every step is condition-based
|
||||
* with Playwright's built-in retry / expect.poll):
|
||||
* 1. The tabpanel container is visible.
|
||||
* 2. The global ErrorBoundary did NOT trip ("Something went wrong").
|
||||
* 3. No visible error alert is shown in the panel.
|
||||
* 4. For non-degraded tabs: the panel settles to non-empty,
|
||||
* non-spinner content (so an empty <div/> or a stuck "Loading…"
|
||||
* spinner FAILS instead of passing as it did before).
|
||||
*/
|
||||
async function assertPanelRendered(page: Page, tabId: string): Promise<void> {
|
||||
const panel = page.locator(`#panel-${tabId}`);
|
||||
|
||||
// (1) Container visible. Built-in retry up to the expect timeout — no
|
||||
// arbitrary waitForTimeout. Mechanism: replaces any reliance on a fixed
|
||||
// settle delay with a real visibility condition.
|
||||
await expect(panel, `panel for ${tabId} never became visible`).toBeVisible({
|
||||
timeout: 10_000,
|
||||
});
|
||||
|
||||
// (2) ErrorBoundary trip = hard crash anywhere in the React subtree.
|
||||
// canvas/src/components/ErrorBoundary.tsx renders "Something went wrong".
|
||||
// The OLD gate only looked for a "Failed to load" toast and would ship
|
||||
// an ErrorBoundary-crashed panel GREEN. Mechanism: assert the crash
|
||||
// surface is absent, retried via expect.poll so a late-mounting crash
|
||||
// banner is still caught.
|
||||
await expect
|
||||
.poll(
|
||||
async () =>
|
||||
page.getByText("Something went wrong", { exact: false }).count(),
|
||||
{
|
||||
message: `tab ${tabId}: ErrorBoundary tripped (Something went wrong)`,
|
||||
timeout: 5_000,
|
||||
},
|
||||
)
|
||||
.toBe(0);
|
||||
|
||||
// (3) No visible error alert inside the panel. Tabs surface load errors
|
||||
// as role="alert" with the real error text (EventsTab/ChannelsTab/
|
||||
// ConfigTab/...). The OLD gate matched ONLY [role=alert]:has-text("Failed
|
||||
// to load") — it missed (a) error messages that don't contain that exact
|
||||
// phrase and (b) error divs that omit role="alert" entirely (e.g.
|
||||
// ActivityTab). We replace it with a broader, but still SaaS-gap-aware,
|
||||
// check: any *visible* alert OR red error banner inside the panel.
|
||||
//
|
||||
// Degraded tabs (#1369) are allowed an error state — for those we only
|
||||
// require no app-level crash (covered by step 2). For every other tab a
|
||||
// visible error alert is a real regression.
|
||||
if (!KNOWN_DEGRADED_TABS.has(tabId)) {
|
||||
const visibleAlerts = panel.locator('[role="alert"]:visible');
|
||||
await expect
|
||||
.poll(async () => visibleAlerts.count(), {
|
||||
message:
|
||||
`tab ${tabId}: a visible error alert is shown in the panel ` +
|
||||
`(was a weak "Failed to load"-only check before)`,
|
||||
timeout: 5_000,
|
||||
})
|
||||
.toBe(0);
|
||||
}
|
||||
|
||||
// (4) Real content. The tabpanel CONTAINER always mounts, so the old
|
||||
// toBeVisible() on the container passed even when the child rendered
|
||||
// nothing. Assert the panel's trimmed innerText is non-empty AND not
|
||||
// stuck on a loading spinner. expect.poll retries until the async
|
||||
// fetch+render settles — replacing the implicit "the network finished
|
||||
// by now" timing assumption with an explicit polled condition.
|
||||
//
|
||||
// Degraded tabs may legitimately be empty (Files in SaaS mode), so they
|
||||
// are exempt from the non-empty requirement; step 2 still guards them
|
||||
// against a hard crash.
|
||||
if (!KNOWN_DEGRADED_TABS.has(tabId)) {
|
||||
await expect
|
||||
.poll(
|
||||
async () => {
|
||||
const text = ((await panel.innerText()) || "").trim();
|
||||
// A panel still showing only a loading spinner has not settled.
|
||||
const stillLoading = /^(loading\b|loading…|loading\.\.\.)/i.test(
|
||||
text,
|
||||
);
|
||||
return text.length > 0 && !stillLoading;
|
||||
},
|
||||
{
|
||||
message:
|
||||
`tab ${tabId}: panel rendered empty or stuck on a loading ` +
|
||||
`spinner — no real content settled (weak "container visible" ` +
|
||||
`gate would have passed this)`,
|
||||
// Generous: real tabs fetch from the tenant over the network.
|
||||
// Polled, so it returns as soon as content appears.
|
||||
timeout: 20_000,
|
||||
},
|
||||
)
|
||||
.toBe(true);
|
||||
}
|
||||
}
|
||||
test.skip(!STAGING, "CANVAS_E2E_STAGING not set — skipping staging-only tests");
|
||||
|
||||
test.describe("staging canvas tabs", () => {
|
||||
test("each workspace-panel tab renders real content", async ({
|
||||
test("each workspace-panel tab renders without error", async ({
|
||||
page,
|
||||
context,
|
||||
}) => {
|
||||
@@ -192,16 +50,9 @@ test.describe("staging canvas tabs", () => {
|
||||
const tenantToken = process.env.STAGING_TENANT_TOKEN;
|
||||
const workspaceId = process.env.STAGING_WORKSPACE_ID;
|
||||
|
||||
// FAIL-CLOSED (not skip): STAGING was requested but global setup did
|
||||
// not export tenant state. A silent skip here would paint a broken
|
||||
// provision GREEN. This is the loud-fail the hardening mandates.
|
||||
if (!tenantURL || !tenantToken || !workspaceId) {
|
||||
throw new Error(
|
||||
"staging-setup.ts did not export STAGING_TENANT_URL / " +
|
||||
"STAGING_TENANT_TOKEN / STAGING_WORKSPACE_ID. CANVAS_E2E_STAGING=1 " +
|
||||
"was set (staging WAS requested) but global setup produced no " +
|
||||
"tenant — this is a provisioning failure, NOT a reason to skip. " +
|
||||
"Check the [staging-setup] log above for the real error.",
|
||||
"staging-setup.ts did not export STAGING_TENANT_URL / STAGING_TENANT_TOKEN / STAGING_WORKSPACE_ID — did global setup run?",
|
||||
);
|
||||
}
|
||||
|
||||
@@ -301,19 +152,11 @@ test.describe("staging canvas tabs", () => {
|
||||
// omit the URL, so we'd otherwise be flying blind. Logged to the
|
||||
// test's stdout (visible in the workflow log under the failed step).
|
||||
page.on("requestfailed", (req) => {
|
||||
console.log(
|
||||
`[e2e/requestfailed] ${req.method()} ${req.url()}: ${
|
||||
req.failure()?.errorText ?? "?"
|
||||
}`,
|
||||
);
|
||||
console.log(`[e2e/requestfailed] ${req.method()} ${req.url()}: ${req.failure()?.errorText ?? "?"}`);
|
||||
});
|
||||
page.on("response", (res) => {
|
||||
if (res.status() >= 400) {
|
||||
console.log(
|
||||
`[e2e/response-${res.status()}] ${res
|
||||
.request()
|
||||
.method()} ${res.url()}`,
|
||||
);
|
||||
console.log(`[e2e/response-${res.status()}] ${res.request().method()} ${res.url()}`);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -330,8 +173,9 @@ test.describe("staging canvas tabs", () => {
|
||||
// hydrated, even with zero workspaces) or the hydration-error
|
||||
// banner — whichever wins first. Previous version of this wait
|
||||
// used `[role="tablist"]`, but that selector only appears AFTER
|
||||
// a workspace node is clicked, so the wait would always time out
|
||||
// at 45s before any meaningful failure surfaced.
|
||||
// a workspace node is clicked (which happens below at L100), so
|
||||
// the wait would always time out at 45s before any meaningful
|
||||
// failure surfaced.
|
||||
await page.waitForSelector(
|
||||
'[aria-label="Molecule AI workspace canvas"], [data-testid="hydration-error"]',
|
||||
{ timeout: 45_000 },
|
||||
@@ -345,20 +189,10 @@ test.describe("staging canvas tabs", () => {
|
||||
"canvas hydration failed — check staging CP + tenant reachability",
|
||||
).toBe(0);
|
||||
|
||||
// The global ErrorBoundary must not have tripped at the app root
|
||||
// either — a crash before the side panel even opens would otherwise
|
||||
// be invisible until a tab assertion happened to notice it.
|
||||
await expect(
|
||||
page.getByText("Something went wrong", { exact: false }),
|
||||
"app-level ErrorBoundary tripped during hydration",
|
||||
).toHaveCount(0);
|
||||
|
||||
// Click the workspace node to open the side panel. Try a data
|
||||
// attribute first, fall back to a generic role-based selector so
|
||||
// the test doesn't break when the node-card markup changes.
|
||||
const byDataAttr = page
|
||||
.locator(`[data-workspace-id="${workspaceId}"]`)
|
||||
.first();
|
||||
const byDataAttr = page.locator(`[data-workspace-id="${workspaceId}"]`).first();
|
||||
if ((await byDataAttr.count()) > 0) {
|
||||
await byDataAttr.click({ timeout: 10_000 });
|
||||
} else {
|
||||
@@ -368,56 +202,19 @@ test.describe("staging canvas tabs", () => {
|
||||
await firstNode.click({ timeout: 10_000 });
|
||||
}
|
||||
|
||||
// The tablist appears once the side panel mounts. Condition-based
|
||||
// wait — no fixed delay.
|
||||
const tablist = page.locator('[role="tablist"]');
|
||||
await expect(
|
||||
tablist,
|
||||
"side panel tablist never appeared after clicking the workspace node",
|
||||
).toBeVisible({ timeout: 15_000 });
|
||||
|
||||
// Tab-list parity guard. The hand-maintained TAB_IDS constant used to
|
||||
// be able to drift silently out of sync with SidePanel.tsx TABS — a
|
||||
// tab could be added to the UI and never get an assertion, shipping
|
||||
// broken-but-untested. Read the actual tab ids from the DOM and assert
|
||||
// every live tab is either driven by this spec (TAB_IDS) or explicitly
|
||||
// excluded (EXPECTED_EXTRA_TABS). A genuinely-new tab fails LOUD.
|
||||
const liveTabIds = (
|
||||
await tablist.locator('[role="tab"][id^="tab-"]').evaluateAll((els) =>
|
||||
els.map((el) => el.id.replace(/^tab-/, "")),
|
||||
)
|
||||
).sort();
|
||||
const accountedFor = new Set<string>([
|
||||
...TAB_IDS,
|
||||
...EXPECTED_EXTRA_TABS,
|
||||
]);
|
||||
const unaccounted = liveTabIds.filter((id) => !accountedFor.has(id));
|
||||
expect(
|
||||
unaccounted,
|
||||
`SidePanel exposes tab(s) this spec neither drives nor excludes: ` +
|
||||
`${unaccounted.join(", ")}. Add them to TAB_IDS (and assert their ` +
|
||||
`content) or to EXPECTED_EXTRA_TABS with a reason.`,
|
||||
).toHaveLength(0);
|
||||
// And the inverse: every TAB_ID we intend to drive must actually exist
|
||||
// in the DOM, so a renamed/removed tab fails here instead of timing out
|
||||
// on a missing #tab-<id> selector with an opaque message.
|
||||
const missing = TAB_IDS.filter((id) => !liveTabIds.includes(id));
|
||||
expect(
|
||||
missing,
|
||||
`TAB_IDS references tab(s) not present in SidePanel: ${missing.join(
|
||||
", ",
|
||||
)} — the spec's tab list has drifted from SidePanel.tsx TABS.`,
|
||||
).toHaveLength(0);
|
||||
await page.waitForSelector('[role="tablist"]', { timeout: 15_000 });
|
||||
|
||||
for (const tabId of TAB_IDS) {
|
||||
await test.step(`tab: ${tabId}`, async () => {
|
||||
const tabButton = page.locator(`#tab-${tabId}`);
|
||||
// The TABS bar is `overflow-x-auto` — tabs past position ~3 are
|
||||
// clipped behind the right-edge fade gradient on smaller
|
||||
// viewports. Playwright's toBeVisible() returns false for clipped
|
||||
// elements, so a bare visibility check fails on later tabs in CI.
|
||||
// scrollIntoViewIfNeeded brings the button into view before the
|
||||
// visibility check.
|
||||
// The TABS bar is `overflow-x-auto` (SidePanel.tsx:~tabs
|
||||
// wrapper) — tabs after position ~3 are clipped behind the
|
||||
// right-edge fade gradient on smaller viewports. Playwright's
|
||||
// `toBeVisible()` returns false for clipped elements, so a
|
||||
// bare visibility check fails on `skills` and later tabs in
|
||||
// CI. scrollIntoViewIfNeeded brings the button into view
|
||||
// before the visibility check, mirroring what SidePanel's own
|
||||
// keyboard handler does on arrow-key navigation.
|
||||
await tabButton.scrollIntoViewIfNeeded({ timeout: 5_000 });
|
||||
await expect(
|
||||
tabButton,
|
||||
@@ -425,34 +222,18 @@ test.describe("staging canvas tabs", () => {
|
||||
).toBeVisible({ timeout: 5_000 });
|
||||
await tabButton.click();
|
||||
|
||||
// Confirm the click actually activated this tab before asserting
|
||||
// its content — aria-selected flips on the active tab. This closes
|
||||
// a race where a slow click handler left the PREVIOUS tab's panel
|
||||
// mounted and we asserted the wrong panel's content. Built-in
|
||||
// retry, condition-based, no fixed wait.
|
||||
await expect(
|
||||
tabButton,
|
||||
`tab-${tabId} did not become the selected tab after click`,
|
||||
).toHaveAttribute("aria-selected", "true", { timeout: 5_000 });
|
||||
const panel = page.locator(`#panel-${tabId}`);
|
||||
await expect(panel, `panel for ${tabId} never rendered`).toBeVisible({
|
||||
timeout: 10_000,
|
||||
});
|
||||
|
||||
// Real-content assertion (the core hardening). See
|
||||
// assertPanelRendered: container visible + no ErrorBoundary + no
|
||||
// visible error alert + settled non-empty content for non-degraded
|
||||
// tabs. Replaces the old "panel visible + no Failed-to-load toast"
|
||||
// pair, which shipped empty/errored panels green.
|
||||
await assertPanelRendered(page, tabId);
|
||||
|
||||
// Belt to the braces: the original toast check stays. A global
|
||||
// "Failed to load" toast (role=alert outside the panel) is still a
|
||||
// crash signal worth catching even though the in-panel checks above
|
||||
// now do the heavy lifting.
|
||||
// "Failed to load" toast = hard crash. Known SaaS-mode gaps
|
||||
// (Files empty, Terminal disconnected, Peers 401) surface as
|
||||
// in-panel content, not toasts.
|
||||
const errorToasts = await page
|
||||
.locator('[role="alert"]:has-text("Failed to load")')
|
||||
.count();
|
||||
expect(
|
||||
errorToasts,
|
||||
`tab ${tabId}: a global "Failed to load" toast is showing`,
|
||||
).toBe(0);
|
||||
expect(errorToasts, `tab ${tabId}: "Failed to load" toast`).toBe(0);
|
||||
|
||||
await page.screenshot({
|
||||
path: `test-results/staging-tab-${tabId}.png`,
|
||||
@@ -486,56 +267,3 @@ test.describe("staging canvas tabs", () => {
|
||||
).toHaveLength(0);
|
||||
});
|
||||
});
|
||||
|
||||
/*
|
||||
* PROMOTION-READINESS — staging canvas E2E → HARD merge-gate
|
||||
* ----------------------------------------------------------
|
||||
* NOW RELIABLE (deterministic; these no longer flap on timing):
|
||||
* - Every wait is condition-based (toBeVisible / toHaveAttribute /
|
||||
* expect.poll). There is NO fixed waitForTimeout / sleep in the spec;
|
||||
* the only setTimeout is the bounded poll-interval inside
|
||||
* staging-setup.ts waitFor(), which has a hard deadline.
|
||||
* - Tabs are asserted on REAL settled content (non-empty, non-spinner),
|
||||
* not just "container is visible" — an empty or stuck-loading panel now
|
||||
* fails instead of shipping green.
|
||||
* - The ErrorBoundary ("Something went wrong") is asserted absent at app
|
||||
* hydration AND per tab — a React subtree crash can no longer pass.
|
||||
* - Visible error alerts inside a panel fail non-degraded tabs (was a
|
||||
* weak [role=alert]:has-text("Failed to load")-only check that missed
|
||||
* both other error phrasings and role-less error divs).
|
||||
* - The driven tab list is parity-checked against the live DOM, so a new
|
||||
* SidePanel tab can't ship un-tested and a removed one fails loud.
|
||||
* - Click→activation is confirmed (aria-selected) before asserting the
|
||||
* panel, removing a wrong-panel race.
|
||||
* - The suite is fail-closed: CANVAS_E2E_STAGING=1 with no tenant state
|
||||
* hard-errors (never skips→green); CANVAS_E2E_STAGING unset cleanly
|
||||
* skips (operator did not request staging).
|
||||
*
|
||||
* STILL BLOCKS PROMOTION-TO-REQUIRED (do NOT flip continue-on-error here —
|
||||
* CTO-owned, RFC internal#219 §1):
|
||||
* - INFRA DEPENDENCY: each run provisions a real staging EC2 tenant
|
||||
* (12-20 min cold boot). Required-gate latency + AWS/Cloudflare/CP
|
||||
* availability become merge-blockers. A staging outage would freeze
|
||||
* main even though the code is fine — unacceptable for a required check
|
||||
* until staging has an SLA or this runs against a warm pre-provisioned
|
||||
* pool.
|
||||
* - SHARED-RESOURCE FLAKE SURFACE: TLS/DNS/ACME propagation on a shared
|
||||
* staging zone (staging-setup TLS_TIMEOUT_MS) is outside this repo's
|
||||
* control. Deterministic here ≠ deterministic upstream.
|
||||
* - SECRET DEPENDENCY: CP_STAGING_ADMIN_API_TOKEN must be present on the
|
||||
* runner. The workflow's skip-if-absent (core#2225) keeps a missing
|
||||
* secret from painting red — correct for non-gating, but a REQUIRED
|
||||
* check must instead guarantee the secret is always present, else it
|
||||
* skip-greens the very thing it is supposed to enforce.
|
||||
* - SINGLE-WORKSPACE COVERAGE: one hermes/platform_managed workspace that
|
||||
* does NOT boot an agent on staging (no CP LLM proxy env, workspace-
|
||||
* server #2162). Tabs render, but agent-dependent content paths (live
|
||||
* chat round-trip, traces from a real run) are not exercised.
|
||||
*
|
||||
* PROMOTION CHECKLIST (when CTO signs off on making this required):
|
||||
* 1. Warm pre-provisioned tenant pool OR a staging SLA bounding boot time.
|
||||
* 2. Guarantee CP_STAGING_ADMIN_API_TOKEN on the gating runner; turn the
|
||||
* skip-if-absent into a hard error for the required path.
|
||||
* 3. Decide whether agent-dependent tabs need a wired LLM proxy on the
|
||||
* staging tenant (covers chat/traces real content) before gating them.
|
||||
*/
|
||||
|
||||
@@ -7,14 +7,6 @@ export default defineConfig({
|
||||
fullyParallel: false,
|
||||
workers: 1,
|
||||
retries: 0,
|
||||
// Fail CLOSED when an explicit spec selection matches zero tests.
|
||||
// Playwright defaults this to true, so `playwright test e2e/chat-*.spec.ts`
|
||||
// would exit 0 (green) if those files were renamed/moved/deleted — a
|
||||
// false-green that would silently gut the e2e-chat gate after a refactor.
|
||||
// forbidOnly likewise stops a stray `test.only` from green-ing the suite
|
||||
// while skipping every other case.
|
||||
passWithNoTests: false,
|
||||
forbidOnly: !!process.env.CI,
|
||||
use: {
|
||||
baseURL: process.env.PLAYWRIGHT_BASE_URL || "http://localhost:3000",
|
||||
headless: true,
|
||||
|
||||
@@ -1,17 +1,12 @@
|
||||
/**
|
||||
* Canvas /api/buildinfo — version-display endpoint mirroring
|
||||
* workspace-server's /buildinfo. Lets `curl <url>/api/buildinfo`
|
||||
* confirm which git SHA is live on a canvas deployment (core#2235).
|
||||
* confirm which git SHA is live on a canvas deployment.
|
||||
*/
|
||||
import { describe, it, expect, beforeEach, afterEach } from "vitest";
|
||||
import { GET } from "../route";
|
||||
|
||||
const ENV_KEYS = [
|
||||
"BUILD_SHA",
|
||||
"VERCEL_GIT_COMMIT_SHA",
|
||||
"VERCEL_GIT_COMMIT_REF",
|
||||
"VERCEL_ENV",
|
||||
];
|
||||
const ENV_KEYS = ["VERCEL_GIT_COMMIT_SHA", "VERCEL_GIT_COMMIT_REF", "VERCEL_ENV"];
|
||||
|
||||
describe("GET /api/buildinfo", () => {
|
||||
let saved: Record<string, string | undefined>;
|
||||
@@ -28,24 +23,13 @@ describe("GET /api/buildinfo", () => {
|
||||
}
|
||||
});
|
||||
|
||||
it("returns dev sentinel when no SHA source is set", async () => {
|
||||
it("returns dev sentinel when Vercel env vars are unset", async () => {
|
||||
const res = await GET();
|
||||
const body = await res.json();
|
||||
expect(body).toEqual({ git_sha: "dev", git_ref: "", vercel_env: "local" });
|
||||
});
|
||||
|
||||
it("reports BUILD_SHA baked into the Docker image (fleet deploy path)", async () => {
|
||||
// BUILD_SHA is the authoritative source for the ECR-image fleet deploy,
|
||||
// which never runs on Vercel. It must win even when a Vercel var is also
|
||||
// present in the environment.
|
||||
process.env.BUILD_SHA = "deadbeefcafe";
|
||||
process.env.VERCEL_GIT_COMMIT_SHA = "should-not-win";
|
||||
const res = await GET();
|
||||
const body = await res.json();
|
||||
expect(body.git_sha).toBe("deadbeefcafe");
|
||||
});
|
||||
|
||||
it("falls back to the SHA Vercel injected when BUILD_SHA is unset", async () => {
|
||||
it("reports the SHA Vercel injected at build time", async () => {
|
||||
process.env.VERCEL_GIT_COMMIT_SHA = "abc1234567890";
|
||||
process.env.VERCEL_GIT_COMMIT_REF = "main";
|
||||
process.env.VERCEL_ENV = "production";
|
||||
|
||||
@@ -1,36 +1,17 @@
|
||||
import { NextResponse } from "next/server";
|
||||
|
||||
// Mirror of workspace-server's GET /buildinfo (PR #2398). Lets a developer
|
||||
// or the fleet redeploy workflow confirm which git SHA is live on a canvas
|
||||
// deployment with the same `curl <url>/api/buildinfo` flow used against
|
||||
// tenant workspaces (core#2235; cross-ref core#2226).
|
||||
// confirm which git SHA is live on a canvas deployment with the same
|
||||
// `curl <url>/buildinfo` flow they use against tenant workspaces.
|
||||
//
|
||||
// SHA source, in priority order:
|
||||
// 1. BUILD_SHA — server-only env baked into the canvas Docker image at
|
||||
// build time (Dockerfile `ARG BUILD_SHA` → `ENV BUILD_SHA`, wired
|
||||
// from `${{ github.sha }}` in publish-canvas-image.yml). This is the
|
||||
// authoritative source for the fleet's ECR-image deploy path, which
|
||||
// does NOT run on Vercel. Read server-side here (App Router route
|
||||
// handler runs on the standalone Node server, `output: "standalone"`),
|
||||
// so it is intentionally NOT a NEXT_PUBLIC_ var — keeping it out of
|
||||
// the client bundle.
|
||||
// 2. VERCEL_GIT_COMMIT_SHA — Vercel injects this at build time when the
|
||||
// canvas is deployed via Vercel rather than the Docker image.
|
||||
// 3. "dev" — local `next dev` / test harness, where neither is set. Same
|
||||
// sentinel workspace-server uses pre-ldflags-injection, so both
|
||||
// surfaces speak the same vocabulary and an unconfigured deploy
|
||||
// fails the SHA comparison closed instead of round-tripping "".
|
||||
//
|
||||
// force-dynamic so the response is evaluated at request time against the
|
||||
// runtime env of the standalone server (where ENV BUILD_SHA lives), not
|
||||
// frozen into a static asset at `next build`.
|
||||
export const dynamic = "force-dynamic";
|
||||
|
||||
// Vercel injects VERCEL_GIT_COMMIT_SHA / _REF / VERCEL_ENV at build time
|
||||
// from the deploying commit; outside Vercel (local `next dev`, harness)
|
||||
// these are unset and the endpoint reports `git_sha: "dev"`. Same sentinel
|
||||
// the workspace-server uses pre-ldflags-injection so both surfaces speak
|
||||
// the same vocabulary.
|
||||
export async function GET() {
|
||||
const sha =
|
||||
process.env.BUILD_SHA ?? process.env.VERCEL_GIT_COMMIT_SHA ?? "dev";
|
||||
return NextResponse.json({
|
||||
git_sha: sha,
|
||||
git_sha: process.env.VERCEL_GIT_COMMIT_SHA ?? "dev",
|
||||
git_ref: process.env.VERCEL_GIT_COMMIT_REF ?? "",
|
||||
vercel_env: process.env.VERCEL_ENV ?? "local",
|
||||
});
|
||||
|
||||
@@ -8,13 +8,9 @@ import { ExternalConnectModal, type ExternalConnectionInfo } from "./ExternalCon
|
||||
import {
|
||||
ProviderModelSelector,
|
||||
buildProviderCatalog,
|
||||
buildProviderCatalogFromRegistry,
|
||||
findProviderForModel,
|
||||
isPlatformManagedProvider,
|
||||
type SelectorModel,
|
||||
type SelectorValue,
|
||||
type RegistryProvider,
|
||||
type RegistryModel,
|
||||
} from "./ProviderModelSelector";
|
||||
|
||||
interface WorkspaceOption {
|
||||
@@ -36,16 +32,6 @@ interface TemplateSpec {
|
||||
model?: string;
|
||||
models?: SelectorModel[];
|
||||
providers?: string[];
|
||||
// internal#718 P3 registry-served fields (additive; absent on older
|
||||
// backends and for non-registry runtimes). When registry_backed is true the
|
||||
// provider→model catalog is built from registry_providers/registry_models so
|
||||
// each model's DERIVED provider (e.g. moonshot/kimi-k2.6 → "platform") drives
|
||||
// the dropdown bucket and the create payload's llm_provider — instead of the
|
||||
// legacy inferVendor heuristic that slash-splits the id into "moonshot".
|
||||
// Mirrors ConfigTab's RuntimeOption loader (RFC#340 Fix C).
|
||||
registry_backed?: boolean;
|
||||
registry_providers?: RegistryProvider[];
|
||||
registry_models?: RegistryModel[];
|
||||
}
|
||||
|
||||
const DEFAULT_RUNTIME = "claude-code";
|
||||
@@ -182,53 +168,15 @@ export function CreateWorkspaceButton() {
|
||||
}),
|
||||
[runtime, templateSpecs],
|
||||
);
|
||||
// The /templates row backing the LLM picker: an explicitly-selected
|
||||
// workspace template wins, else the base runtime template row.
|
||||
const llmSourceSpec = useMemo<TemplateSpec | null>(
|
||||
() => selectedTemplateSpec ?? selectedRuntimeTemplateSpec,
|
||||
const llmModels = useMemo(
|
||||
() => {
|
||||
const sourceSpec = selectedTemplateSpec ?? selectedRuntimeTemplateSpec;
|
||||
if (!sourceSpec?.models?.length) return [];
|
||||
return sourceSpec.models;
|
||||
},
|
||||
[selectedRuntimeTemplateSpec, selectedTemplateSpec],
|
||||
);
|
||||
// internal#718 P3 / RFC#340 Fix C: a runtime is registry-backed when the
|
||||
// /templates row says so AND it served a non-empty registry_models set.
|
||||
// Mirrors ConfigTab's `registryBacked` derivation exactly.
|
||||
const registryBacked = useMemo(
|
||||
() =>
|
||||
llmSourceSpec?.registry_backed === true &&
|
||||
(llmSourceSpec.registry_models?.length ?? 0) > 0,
|
||||
[llmSourceSpec],
|
||||
);
|
||||
// Models fed to the selector dropdown. For a registry-backed runtime use the
|
||||
// registry-served native set, carrying each model's DERIVED provider so the
|
||||
// selector buckets it correctly (moonshot/kimi-k2.6 → "platform", not the
|
||||
// inferVendor "moonshot"). Otherwise fall back to the template-served
|
||||
// models[] + the legacy heuristic — same fallback ConfigTab keeps.
|
||||
const llmModels = useMemo<SelectorModel[]>(
|
||||
() => {
|
||||
if (registryBacked) {
|
||||
return (llmSourceSpec?.registry_models ?? []).map((m) => ({
|
||||
id: m.id,
|
||||
name: m.name,
|
||||
...(m.provider ? { provider: m.provider } : {}),
|
||||
}));
|
||||
}
|
||||
return llmSourceSpec?.models?.length ? llmSourceSpec.models : [];
|
||||
},
|
||||
[registryBacked, llmSourceSpec],
|
||||
);
|
||||
// Registry-backed path: build the catalog from registry_providers/
|
||||
// registry_models so dropdown labels + billing + the derived provider come
|
||||
// from the provider-registry SSOT (restores the "Platform" bucket). Legacy
|
||||
// path: re-infer from models[] via buildProviderCatalog (inferVendor).
|
||||
const llmCatalog = useMemo(
|
||||
() =>
|
||||
registryBacked
|
||||
? buildProviderCatalogFromRegistry(
|
||||
llmSourceSpec?.registry_providers ?? [],
|
||||
llmSourceSpec?.registry_models ?? [],
|
||||
)
|
||||
: buildProviderCatalog(llmModels),
|
||||
[registryBacked, llmSourceSpec, llmModels],
|
||||
);
|
||||
const llmCatalog = useMemo(() => buildProviderCatalog(llmModels), [llmModels]);
|
||||
const selectedLLMProvider = useMemo(
|
||||
() => llmCatalog.find((p) => p.id === llmSelection.providerId) ?? llmCatalog[0],
|
||||
[llmCatalog, llmSelection.providerId],
|
||||
@@ -236,7 +184,7 @@ export function CreateWorkspaceButton() {
|
||||
|
||||
useEffect(() => {
|
||||
if (llmCatalog.length === 0) return;
|
||||
const sourceDefault = llmSourceSpec?.model?.trim();
|
||||
const sourceDefault = (selectedTemplateSpec ?? selectedRuntimeTemplateSpec)?.model?.trim();
|
||||
const platformProvider = llmCatalog.find((p) => p.vendor === "platform");
|
||||
const matched = sourceDefault ? findProviderForModel(llmCatalog, sourceDefault) : null;
|
||||
const next = platformProvider ?? matched ?? llmCatalog[0];
|
||||
@@ -249,7 +197,7 @@ export function CreateWorkspaceButton() {
|
||||
envVars: next.envVars,
|
||||
});
|
||||
setLLMSecret("");
|
||||
}, [llmCatalog, llmSourceSpec]);
|
||||
}, [llmCatalog, selectedRuntimeTemplateSpec, selectedTemplateSpec]);
|
||||
|
||||
// Reset form and load workspaces whenever dialog opens
|
||||
useEffect(() => {
|
||||
@@ -291,15 +239,7 @@ export function CreateWorkspaceButton() {
|
||||
setError("Model is required");
|
||||
return;
|
||||
}
|
||||
// Platform-managed providers need NO user credential — the platform injects
|
||||
// its own usage token (MOLECULE_LLM_USAGE_TOKEN = tenant admin_token) at
|
||||
// provision time. Only BYOK providers require a user-supplied key. (#2245)
|
||||
if (
|
||||
!isExternal &&
|
||||
!isPlatformManagedProvider(selectedLLMProvider) &&
|
||||
selectedLLMProvider?.envVars.length &&
|
||||
!llmSecret.trim()
|
||||
) {
|
||||
if (!isExternal && selectedLLMProvider?.envVars.length && !llmSecret.trim()) {
|
||||
setError("Provider credential is required");
|
||||
return;
|
||||
}
|
||||
@@ -334,11 +274,7 @@ export function CreateWorkspaceButton() {
|
||||
? {
|
||||
model: llmSelection.model.trim(),
|
||||
llm_provider: nativeProvider.vendor,
|
||||
// Only BYOK providers carry a user secret. For platform-managed
|
||||
// the token is provisioner-injected; sending an (empty) secret
|
||||
// here would clobber it — so omit it entirely. (#2245)
|
||||
...(nativeProvider.envVars.length > 0 &&
|
||||
!isPlatformManagedProvider(nativeProvider)
|
||||
...(nativeProvider.envVars.length > 0
|
||||
? { secrets: { [nativeProvider.envVars[0]]: llmSecret.trim() } }
|
||||
: {}),
|
||||
}
|
||||
@@ -525,7 +461,6 @@ export function CreateWorkspaceButton() {
|
||||
</div>
|
||||
<ProviderModelSelector
|
||||
models={llmModels}
|
||||
catalog={registryBacked ? llmCatalog : undefined}
|
||||
value={llmSelection}
|
||||
onChange={(next) => {
|
||||
setLLMSelection(next);
|
||||
@@ -534,26 +469,20 @@ export function CreateWorkspaceButton() {
|
||||
idPrefix="create-workspace-llm"
|
||||
variant="stack"
|
||||
/>
|
||||
{isPlatformManagedProvider(selectedLLMProvider) ? (
|
||||
<div className="text-[11px] text-ink-soft">
|
||||
Platform-managed — no API key required.
|
||||
{selectedLLMProvider.envVars.length > 0 && (
|
||||
<div>
|
||||
<label htmlFor="llm-secret-input" className="text-[11px] text-ink-mid block mb-1">
|
||||
{selectedLLMProvider.envVars[0]}
|
||||
</label>
|
||||
<input
|
||||
id="llm-secret-input"
|
||||
type="password"
|
||||
value={llmSecret}
|
||||
onChange={(e) => setLLMSecret(e.target.value)}
|
||||
autoComplete="off"
|
||||
className="w-full bg-surface-card/60 border border-line/50 rounded-lg px-3 py-2 text-sm text-ink placeholder-ink-soft focus:outline-none focus:border-accent/60 focus:ring-1 focus:ring-accent/20 transition-colors font-mono"
|
||||
/>
|
||||
</div>
|
||||
) : (
|
||||
selectedLLMProvider.envVars.length > 0 && (
|
||||
<div>
|
||||
<label htmlFor="llm-secret-input" className="text-[11px] text-ink-mid block mb-1">
|
||||
{selectedLLMProvider.envVars[0]}
|
||||
</label>
|
||||
<input
|
||||
id="llm-secret-input"
|
||||
type="password"
|
||||
value={llmSecret}
|
||||
onChange={(e) => setLLMSecret(e.target.value)}
|
||||
autoComplete="off"
|
||||
className="w-full bg-surface-card/60 border border-line/50 rounded-lg px-3 py-2 text-sm text-ink placeholder-ink-soft focus:outline-none focus:border-accent/60 focus:ring-1 focus:ring-accent/20 transition-colors font-mono"
|
||||
/>
|
||||
</div>
|
||||
)
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
@@ -55,21 +55,6 @@ export interface ProviderEntry {
|
||||
billingMode?: "platform_managed" | "byok";
|
||||
}
|
||||
|
||||
/** A provider is "platform-managed" when the Molecule platform proxies the LLM
|
||||
* call and injects its own usage credential — the tenant admin_token, surfaced
|
||||
* to the workspace as MOLECULE_LLM_USAGE_TOKEN by the CP provisioner
|
||||
* (controlplane ec2.go: `MOLECULE_LLM_USAGE_TOKEN="$ADMIN_TOKEN"`). The user
|
||||
* supplies NO key for these: the credential is internal plumbing, not a user
|
||||
* input. Detected by vendor==="platform" (the platform proxy provider, which
|
||||
* declares MOLECULE_LLM_USAGE_TOKEN in its AuthEnv) OR
|
||||
* billingMode==="platform_managed" (registry-backed, internal#718 P3). BYOK
|
||||
* providers return false and DO require a user-supplied credential. */
|
||||
export function isPlatformManagedProvider(
|
||||
p?: Pick<ProviderEntry, "vendor" | "billingMode"> | null,
|
||||
): boolean {
|
||||
return p?.vendor === "platform" || p?.billingMode === "platform_managed";
|
||||
}
|
||||
|
||||
/** RegistryProvider mirrors one entry of GET /templates `registry_providers`
|
||||
* (workspace-server registryProviderView): the registry's native provider for
|
||||
* a runtime, with its display label, auth-env NAMES, and billing mode. This is
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
||||
import { render, screen, fireEvent, waitFor, cleanup } from "@testing-library/react";
|
||||
import { CreateWorkspaceButton } from "../CreateWorkspaceDialog";
|
||||
import { isPlatformManagedProvider } from "../ProviderModelSelector";
|
||||
|
||||
vi.mock("@/lib/api", () => ({
|
||||
api: {
|
||||
@@ -66,34 +65,6 @@ const SAMPLE_TEMPLATES = [
|
||||
{ id: "moonshot/kimi-k2.6", name: "Kimi K2.6", provider: "platform", required_env: [] },
|
||||
],
|
||||
},
|
||||
// #2245 fixtures. The real registry `platform` provider declares
|
||||
// MOLECULE_LLM_USAGE_TOKEN in its auth_env — the default mock above masks the
|
||||
// bug by using required_env:[]. This template gives the platform provider a
|
||||
// non-empty auth env (matching production) so the credential-suppression
|
||||
// logic is actually exercised.
|
||||
{
|
||||
id: "platform-managed-test",
|
||||
name: "Platform Managed Test",
|
||||
runtime: "claude-code",
|
||||
model: "moonshot/kimi-k2.6",
|
||||
providers: ["platform", "minimax"],
|
||||
models: [
|
||||
{ id: "moonshot/kimi-k2.6", name: "Kimi K2.6", provider: "platform", required_env: ["MOLECULE_LLM_USAGE_TOKEN"] },
|
||||
{ id: "MiniMax-M2.7", name: "MiniMax M2.7", required_env: ["MINIMAX_API_KEY"] },
|
||||
],
|
||||
},
|
||||
// BYOK-only template (no platform provider) — the credential requirement
|
||||
// MUST still hold for these (no-regression guard).
|
||||
{
|
||||
id: "byok-only-test",
|
||||
name: "BYOK Only Test",
|
||||
runtime: "claude-code",
|
||||
model: "openai/gpt-4o",
|
||||
providers: ["openai"],
|
||||
models: [
|
||||
{ id: "openai/gpt-4o", name: "GPT-4o", required_env: ["OPENAI_API_KEY"] },
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
beforeEach(() => {
|
||||
@@ -483,182 +454,6 @@ describe("CreateWorkspaceDialog — dynamic runtime provider picker", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Registry-backed provider catalog (RFC#340 Fix C)
|
||||
//
|
||||
// Regression guard for the mis-bucketing bug: when a registry-backed
|
||||
// claude-code template serves `moonshot/kimi-k2.6` whose DERIVED provider is
|
||||
// `platform`, the dialog must build the dropdown from registry_providers/
|
||||
// registry_models (buildProviderCatalogFromRegistry) — NOT the legacy
|
||||
// inferVendor heuristic which slash-splits the id into "moonshot". The
|
||||
// distinguishing trait of this fixture: the plain `models[]` array does NOT
|
||||
// carry an explicit `provider` field, so the LEGACY path would bucket the
|
||||
// model under "moonshot" and send llm_provider:"moonshot". Only the
|
||||
// registry-backed path yields the Platform bucket + llm_provider:"platform".
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// claude-code template whose plain models[] is UN-annotated (no explicit
|
||||
// provider). The derived-provider annotation lives ONLY in registry_models.
|
||||
const REGISTRY_TEMPLATE = {
|
||||
id: "claude-code-default",
|
||||
name: "Claude Code Agent",
|
||||
runtime: "claude-code",
|
||||
model: "moonshot/kimi-k2.6",
|
||||
// Legacy fields — note: NO explicit provider on the platform model, so the
|
||||
// legacy inferVendor path would slash-split it into "moonshot".
|
||||
providers: ["platform", "minimax", "anthropic"],
|
||||
models: [
|
||||
{ id: "moonshot/kimi-k2.6", name: "Kimi K2.6", required_env: [] },
|
||||
{ id: "MiniMax-M2.7", name: "MiniMax M2.7", required_env: ["MINIMAX_API_KEY"] },
|
||||
{ id: "claude-sonnet-4-6", name: "Claude Sonnet 4.6", required_env: ["ANTHROPIC_API_KEY"] },
|
||||
],
|
||||
// Registry-served SSOT (internal#718 P3). DeriveProvider resolved
|
||||
// moonshot/kimi-k2.6 → "platform"; MiniMax-M2.7 → "minimax".
|
||||
registry_backed: true,
|
||||
registry_providers: [
|
||||
{ name: "platform", display_name: "Platform", auth_env: [], billing_mode: "platform_managed" },
|
||||
{ name: "minimax", display_name: "MiniMax", auth_env: ["MINIMAX_API_KEY"], billing_mode: "byok" },
|
||||
{ name: "anthropic", display_name: "Anthropic API", auth_env: ["ANTHROPIC_API_KEY"], billing_mode: "byok" },
|
||||
],
|
||||
registry_models: [
|
||||
{ id: "moonshot/kimi-k2.6", name: "Kimi K2.6", provider: "platform", billing_mode: "platform_managed" },
|
||||
{ id: "MiniMax-M2.7", name: "MiniMax M2.7", provider: "minimax", billing_mode: "byok" },
|
||||
{ id: "claude-sonnet-4-6", name: "Claude Sonnet 4.6", provider: "anthropic", billing_mode: "byok" },
|
||||
],
|
||||
};
|
||||
|
||||
// Registry-backed platform provider WITH a non-empty auth_env — this matches
|
||||
// the PRODUCTION provider view, which ships the raw AuthEnv
|
||||
// ([MOLECULE_LLM_USAGE_TOKEN]). REGISTRY_TEMPLATE above uses auth_env:[] so it
|
||||
// never exercises suppression; this one drives the billingMode==="platform_
|
||||
// managed" branch end-to-end through buildProviderCatalogFromRegistry. (#2245)
|
||||
const REGISTRY_TEMPLATE_PLATFORM_AUTHENV = {
|
||||
...REGISTRY_TEMPLATE,
|
||||
registry_providers: [
|
||||
{
|
||||
name: "platform",
|
||||
display_name: "Platform",
|
||||
auth_env: ["MOLECULE_LLM_USAGE_TOKEN"],
|
||||
billing_mode: "platform_managed",
|
||||
},
|
||||
{ name: "minimax", display_name: "MiniMax", auth_env: ["MINIMAX_API_KEY"], billing_mode: "byok" },
|
||||
{ name: "anthropic", display_name: "Anthropic API", auth_env: ["ANTHROPIC_API_KEY"], billing_mode: "byok" },
|
||||
],
|
||||
};
|
||||
|
||||
describe("CreateWorkspaceDialog — registry-backed provider catalog (RFC#340 Fix C)", () => {
|
||||
beforeEach(() => {
|
||||
mockGet.mockImplementation(async (url: string) => {
|
||||
if (url === "/templates") {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
return [REGISTRY_TEMPLATE] as any;
|
||||
}
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
return SAMPLE_WORKSPACES as any;
|
||||
});
|
||||
});
|
||||
|
||||
it("shows the Platform provider bucket for the registry-backed claude-code runtime", async () => {
|
||||
await openDialog();
|
||||
const providerSelect = await waitFor(() => {
|
||||
const sel = document.querySelector("[data-testid='provider-select']") as HTMLSelectElement;
|
||||
expect(sel).toBeTruthy();
|
||||
return sel;
|
||||
});
|
||||
const labels = Array.from(providerSelect.options).map((o) => o.text.trim());
|
||||
// Registry display_name "Platform" appears — NOT "moonshot" from the
|
||||
// legacy slash-split heuristic.
|
||||
expect(labels).toContain("Platform");
|
||||
expect(labels).not.toContain("moonshot");
|
||||
// Bucket id is the registry-keyed id, vendor is the bare provider name.
|
||||
const values = Array.from(providerSelect.options).map((o) => o.value);
|
||||
expect(values).toContain("registry|platform");
|
||||
});
|
||||
|
||||
it("sends llm_provider: platform (not moonshot) for moonshot/kimi-k2.6", async () => {
|
||||
await openDialog();
|
||||
fireEvent.change(screen.getByPlaceholderText("e.g. SEO Agent"), {
|
||||
target: { value: "Kimi Agent" },
|
||||
});
|
||||
// Wait for the registry default to settle on the Platform bucket + model.
|
||||
await waitFor(() => {
|
||||
const modelSelect = document.querySelector("[data-testid='model-select']") as HTMLSelectElement;
|
||||
expect(modelSelect?.value).toBe("moonshot/kimi-k2.6");
|
||||
});
|
||||
|
||||
const createBtn = screen.getAllByRole("button").find((b) => b.textContent === "Create");
|
||||
fireEvent.click(createBtn!);
|
||||
|
||||
await waitFor(() => expect(mockPost).toHaveBeenCalled());
|
||||
const body = mockPost.mock.calls[0][1] as Record<string, unknown>;
|
||||
expect(body.model).toBe("moonshot/kimi-k2.6");
|
||||
expect(body.llm_provider).toBe("platform");
|
||||
// Platform is auth-env-free → no BYOK secret.
|
||||
expect(body.secrets).toBeUndefined();
|
||||
});
|
||||
|
||||
it("buckets MiniMax-M2.7 under its derived provider and sends llm_provider: minimax", async () => {
|
||||
await openDialog();
|
||||
fireEvent.change(screen.getByPlaceholderText("e.g. SEO Agent"), {
|
||||
target: { value: "MiniMax Agent" },
|
||||
});
|
||||
await waitFor(() => {
|
||||
const sel = document.querySelector("[data-testid='provider-select']") as HTMLSelectElement;
|
||||
expect(Array.from(sel.options).map((o) => o.value)).toContain("registry|minimax");
|
||||
});
|
||||
fireEvent.change(document.querySelector("[data-testid='provider-select']") as HTMLSelectElement, {
|
||||
target: { value: "registry|minimax" },
|
||||
});
|
||||
fireEvent.change(document.getElementById("llm-secret-input") as HTMLInputElement, {
|
||||
target: { value: "sk-minimax-test" },
|
||||
});
|
||||
|
||||
const createBtn = screen.getAllByRole("button").find((b) => b.textContent === "Create");
|
||||
fireEvent.click(createBtn!);
|
||||
|
||||
await waitFor(() => expect(mockPost).toHaveBeenCalled());
|
||||
const body = mockPost.mock.calls[0][1] as Record<string, unknown>;
|
||||
expect(body.model).toBe("MiniMax-M2.7");
|
||||
expect(body.llm_provider).toBe("minimax");
|
||||
expect(body.secrets).toEqual({ MINIMAX_API_KEY: "sk-minimax-test" });
|
||||
});
|
||||
|
||||
it("suppresses the credential for a registry-backed platform provider that declares an auth_env — billingMode path (#2245)", async () => {
|
||||
// Override the default REGISTRY_TEMPLATE (auth_env:[]) with the production-
|
||||
// shaped one whose platform provider declares MOLECULE_LLM_USAGE_TOKEN.
|
||||
mockGet.mockImplementation(async (url: string) => {
|
||||
if (url === "/templates") {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
return [REGISTRY_TEMPLATE_PLATFORM_AUTHENV] as any;
|
||||
}
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
return SAMPLE_WORKSPACES as any;
|
||||
});
|
||||
await openDialog();
|
||||
fireEvent.change(screen.getByPlaceholderText("e.g. SEO Agent"), {
|
||||
target: { value: "Registry Platform Agent" },
|
||||
});
|
||||
// Platform is the default bucket; even with a non-empty auth_env the key
|
||||
// field must NOT render (suppressed via billingMode==="platform_managed").
|
||||
await waitFor(() => {
|
||||
const sel = document.querySelector("[data-testid='provider-select']") as HTMLSelectElement;
|
||||
expect(sel?.value).toBe("registry|platform");
|
||||
});
|
||||
expect(screen.getByText("Platform-managed — no API key required.")).toBeTruthy();
|
||||
expect(document.getElementById("llm-secret-input")).toBeNull();
|
||||
|
||||
const createBtn = screen.getAllByRole("button").find((b) => b.textContent === "Create");
|
||||
fireEvent.click(createBtn!);
|
||||
|
||||
await waitFor(() => expect(mockPost).toHaveBeenCalled());
|
||||
expect(screen.queryByText("Provider credential is required")).toBeNull();
|
||||
const body = mockPost.mock.calls[0][1] as Record<string, unknown>;
|
||||
expect(body.llm_provider).toBe("platform");
|
||||
// The provisioner-injected MOLECULE_LLM_USAGE_TOKEN must NOT be clobbered.
|
||||
expect(body.secrets).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// budget_limit field tests (#541)
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -740,70 +535,3 @@ describe("CreateWorkspaceDialog — budget_limit field", () => {
|
||||
expect(budgetInput.value).toBe("");
|
||||
});
|
||||
});
|
||||
|
||||
describe("CreateWorkspaceDialog — platform-managed credential suppression (#2245)", () => {
|
||||
describe("isPlatformManagedProvider", () => {
|
||||
it("is true for the platform proxy vendor", () => {
|
||||
expect(isPlatformManagedProvider({ vendor: "platform" })).toBe(true);
|
||||
});
|
||||
it("is true for a registry billingMode of platform_managed", () => {
|
||||
expect(
|
||||
isPlatformManagedProvider({ vendor: "minimax", billingMode: "platform_managed" }),
|
||||
).toBe(true);
|
||||
});
|
||||
it("is false for a BYOK provider", () => {
|
||||
expect(isPlatformManagedProvider({ vendor: "anthropic", billingMode: "byok" })).toBe(false);
|
||||
expect(isPlatformManagedProvider({ vendor: "minimax" })).toBe(false);
|
||||
});
|
||||
it("is false for null/undefined", () => {
|
||||
expect(isPlatformManagedProvider(null)).toBe(false);
|
||||
expect(isPlatformManagedProvider(undefined)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
it("platform-managed provider with a declared auth env requires NO credential, hides the key field, and sends NO secret", async () => {
|
||||
await openDialog();
|
||||
await setTemplate("platform-managed-test");
|
||||
fireEvent.change(screen.getByPlaceholderText("e.g. SEO Agent"), {
|
||||
target: { value: "Platform Agent" },
|
||||
});
|
||||
|
||||
// The credential input must NOT render for platform-managed; a "no key
|
||||
// required" note appears instead.
|
||||
await waitFor(() =>
|
||||
expect(screen.getByText("Platform-managed — no API key required.")).toBeTruthy(),
|
||||
);
|
||||
expect(screen.queryByLabelText("MOLECULE_LLM_USAGE_TOKEN")).toBeNull();
|
||||
|
||||
const createBtn = screen.getAllByRole("button").find((b) => b.textContent === "Create");
|
||||
fireEvent.click(createBtn!);
|
||||
|
||||
await waitFor(() => expect(mockPost).toHaveBeenCalled());
|
||||
// No validation error, and the provisioner-injected token is NOT clobbered
|
||||
// by an empty secret.
|
||||
expect(screen.queryByText("Provider credential is required")).toBeNull();
|
||||
const body = mockPost.mock.calls[0][1] as Record<string, unknown>;
|
||||
expect(body.llm_provider).toBe("platform");
|
||||
expect(body.secrets).toBeUndefined();
|
||||
});
|
||||
|
||||
it("BYOK provider still requires a credential and renders the key field (no-regression)", async () => {
|
||||
await openDialog();
|
||||
await setTemplate("byok-only-test");
|
||||
fireEvent.change(screen.getByPlaceholderText("e.g. SEO Agent"), {
|
||||
target: { value: "BYOK Agent" },
|
||||
});
|
||||
|
||||
// The credential field IS rendered for BYOK...
|
||||
await waitFor(() => expect(screen.getByLabelText("OPENAI_API_KEY")).toBeTruthy());
|
||||
|
||||
const createBtn = screen.getAllByRole("button").find((b) => b.textContent === "Create");
|
||||
fireEvent.click(createBtn!);
|
||||
|
||||
// ...and create stays blocked until it's filled.
|
||||
await waitFor(() =>
|
||||
expect(screen.getByText("Provider credential is required")).toBeTruthy(),
|
||||
);
|
||||
expect(mockPost).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"use client";
|
||||
|
||||
import { useCallback, useEffect, useRef, useState } from "react";
|
||||
import { useEffect, useRef, useState } from "react";
|
||||
import { api } from "@/lib/api";
|
||||
import type RFB from "@novnc/novnc";
|
||||
|
||||
@@ -33,11 +33,6 @@ export function DisplayTab({ workspaceId }: Props) {
|
||||
const [controlBusy, setControlBusy] = useState(false);
|
||||
const [sessionUrl, setSessionUrl] = useState<string | null>(null);
|
||||
const requestGeneration = useRef(0);
|
||||
// Freshest signed session URL (token bound to the lease's expires_at). The
|
||||
// renewal timer keeps this current WITHOUT swapping the live stream's
|
||||
// sessionUrl (which would needlessly reconnect the desktop); the stream uses
|
||||
// it only when it has to reconnect after an unclean drop.
|
||||
const latestSessionUrlRef = useRef<string | null>(null);
|
||||
|
||||
useEffect(() => {
|
||||
const generation = requestGeneration.current + 1;
|
||||
@@ -46,7 +41,6 @@ export function DisplayTab({ workspaceId }: Props) {
|
||||
setStatus(null);
|
||||
setControl(null);
|
||||
setSessionUrl(null);
|
||||
latestSessionUrlRef.current = null;
|
||||
setError(null);
|
||||
setControlError(null);
|
||||
setControlBusy(false);
|
||||
@@ -75,41 +69,6 @@ export function DisplayTab({ workspaceId }: Props) {
|
||||
};
|
||||
}, [workspaceId]);
|
||||
|
||||
// Acquire (or re-acquire) the display-control lease as the current holder.
|
||||
// Re-acquiring extends the 300s server-side lock AND returns a freshly-signed
|
||||
// session URL (token bound to the new expires_at). Used both to renew the
|
||||
// lease on a timer and to mint a non-stale token for each reconnect — a
|
||||
// cached URL can be past its ~300s expiry, which would make a reconnect 401.
|
||||
const reacquireSession = useCallback(async (): Promise<string | null> => {
|
||||
const generation = requestGeneration.current;
|
||||
try {
|
||||
const next = await api.post<DisplayControlStatus>(
|
||||
`/workspaces/${workspaceId}/display/control/acquire`,
|
||||
{ controller: "user", ttl_seconds: 300 },
|
||||
);
|
||||
if (requestGeneration.current !== generation) return null;
|
||||
setControl(next);
|
||||
if (next.session_url) latestSessionUrlRef.current = next.session_url;
|
||||
return next.session_url ?? null;
|
||||
} catch {
|
||||
// Transient failure, or another holder took over: the live stream keeps
|
||||
// running on its existing connection; a reconnect re-evaluates control.
|
||||
return null;
|
||||
}
|
||||
}, [workspaceId]);
|
||||
|
||||
// Renew the lease while we hold it. The lock is a 300s lease with no
|
||||
// server-side auto-renewal, so without this the control (and the session
|
||||
// token) silently expire mid-session — the user appears "kicked" every ~5
|
||||
// minutes. We renew well inside the TTL and do not touch the live stream.
|
||||
useEffect(() => {
|
||||
if (!sessionUrl) return;
|
||||
const timer = setInterval(() => {
|
||||
void reacquireSession();
|
||||
}, 120_000);
|
||||
return () => clearInterval(timer);
|
||||
}, [sessionUrl, reacquireSession]);
|
||||
|
||||
const acquireControl = async () => {
|
||||
const generation = requestGeneration.current;
|
||||
const controlPath = `/workspaces/${workspaceId}/display/control`;
|
||||
@@ -123,7 +82,6 @@ export function DisplayTab({ workspaceId }: Props) {
|
||||
if (requestGeneration.current !== generation) return;
|
||||
setControl(next);
|
||||
setSessionUrl(next.session_url || null);
|
||||
latestSessionUrlRef.current = next.session_url || null;
|
||||
} catch (err) {
|
||||
if (requestGeneration.current !== generation) return;
|
||||
setControlError("Failed to take control");
|
||||
@@ -150,7 +108,6 @@ export function DisplayTab({ workspaceId }: Props) {
|
||||
if (requestGeneration.current !== generation) return;
|
||||
setControl(next);
|
||||
setSessionUrl(null);
|
||||
latestSessionUrlRef.current = null;
|
||||
} catch (err) {
|
||||
if (requestGeneration.current !== generation) return;
|
||||
setControlError("Failed to release control");
|
||||
@@ -278,11 +235,7 @@ export function DisplayTab({ workspaceId }: Props) {
|
||||
/>
|
||||
</div>
|
||||
{sessionUrl ? (
|
||||
<DesktopStream
|
||||
sessionUrl={sessionUrl}
|
||||
latestSessionUrlRef={latestSessionUrlRef}
|
||||
reacquireSession={reacquireSession}
|
||||
/>
|
||||
<DesktopStream sessionUrl={sessionUrl} />
|
||||
) : (
|
||||
<div className="flex flex-1 items-center justify-center p-8 text-center">
|
||||
<div>
|
||||
@@ -358,15 +311,7 @@ function DisplayControlBar({
|
||||
);
|
||||
}
|
||||
|
||||
function DesktopStream({
|
||||
sessionUrl,
|
||||
latestSessionUrlRef,
|
||||
reacquireSession,
|
||||
}: {
|
||||
sessionUrl: string;
|
||||
latestSessionUrlRef: { current: string | null };
|
||||
reacquireSession: () => Promise<string | null>;
|
||||
}) {
|
||||
function DesktopStream({ sessionUrl }: { sessionUrl: string }) {
|
||||
const containerRef = useRef<HTMLDivElement | null>(null);
|
||||
const rfbRef = useRef<RFB | null>(null);
|
||||
const [streamError, setStreamError] = useState<string | null>(null);
|
||||
@@ -384,37 +329,20 @@ function DesktopStream({
|
||||
clipboardTimer = setTimeout(() => setClipboardStatus(null), 2500);
|
||||
};
|
||||
|
||||
let attempts = 0;
|
||||
let retryTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
const maxAttempts = 10;
|
||||
|
||||
async function connect(reacquire = false) {
|
||||
async function connect() {
|
||||
setStreamError(null);
|
||||
try {
|
||||
// On a reconnect, mint a fresh lease + token first — the original token
|
||||
// is only ~300s, so a cached URL can be expired and would 401. The
|
||||
// initial connect already holds a fresh token from acquireControl.
|
||||
if (reacquire) await reacquireSession();
|
||||
const mod = await import("@novnc/novnc");
|
||||
if (cancelled || !containerRef.current) return;
|
||||
const stream = displayWebSocketConnection(latestSessionUrlRef.current || sessionUrl);
|
||||
const stream = displayWebSocketConnection(sessionUrl);
|
||||
rfb = new mod.default(containerRef.current, stream.url, {
|
||||
wsProtocols: ["binary", `molecule-display-token.${stream.token}`],
|
||||
});
|
||||
rfbRef.current = rfb;
|
||||
rfb.scaleViewport = true;
|
||||
// Do NOT request a server-side resize: the workspace display runs a
|
||||
// fixed Xorg modeline and x11vnc rejects SetDesktopSize ("Resize is
|
||||
// administratively prohibited"), which spams the console on every
|
||||
// (re)connect. scaleViewport already fits the fixed framebuffer to the
|
||||
// container client-side, so we don't need the server to resize.
|
||||
rfb.resizeSession = false;
|
||||
rfb.resizeSession = true;
|
||||
rfb.focusOnClick = true;
|
||||
rfb.focus({ preventScroll: true });
|
||||
rfb.addEventListener("connect", () => {
|
||||
attempts = 0;
|
||||
if (!cancelled) setStreamError(null);
|
||||
});
|
||||
rfb.addEventListener("clipboard", (event: Event) => {
|
||||
const text = (event as CustomEvent<{ text?: string }>).detail?.text ?? "";
|
||||
if (!text) return;
|
||||
@@ -425,20 +353,7 @@ function DesktopStream({
|
||||
});
|
||||
rfb.addEventListener("disconnect", (event: Event) => {
|
||||
const detail = (event as CustomEvent<{ clean?: boolean }>).detail;
|
||||
rfbRef.current = null;
|
||||
if (cancelled || detail?.clean) return;
|
||||
// Auto-reconnect after an unclean drop (idle/network blip, brief
|
||||
// agent hiccup); bounded backoff so a genuinely-dead session still
|
||||
// surfaces an error instead of looping forever.
|
||||
if (attempts < maxAttempts) {
|
||||
attempts += 1;
|
||||
setStreamError(`Reconnecting to desktop… (attempt ${attempts})`);
|
||||
retryTimer = setTimeout(() => {
|
||||
if (!cancelled) void connect(true);
|
||||
}, Math.min(1000 * attempts, 5000));
|
||||
} else {
|
||||
setStreamError("Desktop stream disconnected.");
|
||||
}
|
||||
if (!cancelled && !detail?.clean) setStreamError("Desktop stream disconnected.");
|
||||
});
|
||||
} catch {
|
||||
if (!cancelled) setStreamError("Desktop stream could not be opened.");
|
||||
@@ -448,12 +363,11 @@ function DesktopStream({
|
||||
connect();
|
||||
return () => {
|
||||
cancelled = true;
|
||||
if (retryTimer) clearTimeout(retryTimer);
|
||||
if (clipboardTimer) clearTimeout(clipboardTimer);
|
||||
rfbRef.current = null;
|
||||
rfb?.disconnect();
|
||||
};
|
||||
}, [sessionUrl, reacquireSession, latestSessionUrlRef]);
|
||||
}, [sessionUrl]);
|
||||
|
||||
useEffect(() => {
|
||||
const onPaste = (event: ClipboardEvent) => {
|
||||
|
||||
@@ -2,13 +2,12 @@
|
||||
import { describe, it, expect, vi, beforeEach } from "vitest";
|
||||
import { cleanup, fireEvent, render, screen, waitFor } from "@testing-library/react";
|
||||
|
||||
const { mockGet, mockPost, mockRFBConstructor, mockRFBClipboardPasteFrom, mockRFBFocus, rfbInstances } = vi.hoisted(() => ({
|
||||
const { mockGet, mockPost, mockRFBConstructor, mockRFBClipboardPasteFrom, mockRFBFocus } = vi.hoisted(() => ({
|
||||
mockGet: vi.fn(),
|
||||
mockPost: vi.fn(),
|
||||
mockRFBConstructor: vi.fn(),
|
||||
mockRFBClipboardPasteFrom: vi.fn(),
|
||||
mockRFBFocus: vi.fn(),
|
||||
rfbInstances: [] as EventTarget[],
|
||||
}));
|
||||
|
||||
vi.mock("@/lib/api", () => ({
|
||||
@@ -32,7 +31,6 @@ vi.mock("@novnc/novnc", () => ({
|
||||
this.url = url;
|
||||
this.options = options;
|
||||
mockRFBConstructor(target, url, options);
|
||||
rfbInstances.push(this);
|
||||
}
|
||||
clipboardPasteFrom(text: string) {
|
||||
mockRFBClipboardPasteFrom(text);
|
||||
@@ -54,7 +52,6 @@ describe("DisplayTab", () => {
|
||||
mockRFBConstructor.mockReset();
|
||||
mockRFBClipboardPasteFrom.mockReset();
|
||||
mockRFBFocus.mockReset();
|
||||
rfbInstances.length = 0;
|
||||
});
|
||||
|
||||
it("renders unavailable state for non-display workspaces", async () => {
|
||||
@@ -403,62 +400,6 @@ describe("DisplayTab", () => {
|
||||
});
|
||||
expect(screen.getByRole("button", { name: "Take control" })).toBeTruthy();
|
||||
});
|
||||
|
||||
it("auto-reconnects the desktop stream after an unclean disconnect but not a clean one", async () => {
|
||||
mockGet
|
||||
.mockResolvedValueOnce({
|
||||
available: true,
|
||||
mode: "desktop-control",
|
||||
protocol: "novnc",
|
||||
width: 1920,
|
||||
height: 1080,
|
||||
})
|
||||
.mockResolvedValueOnce({ controller: "none" });
|
||||
// Initial acquire returns token "signed"; the reconnect re-acquire mints a
|
||||
// FRESH token "signed2" (the lock/token is only ~300s — reconnecting with a
|
||||
// cached, possibly-expired token would 401 and never recover).
|
||||
mockPost
|
||||
.mockResolvedValueOnce({
|
||||
controller: "user",
|
||||
controlled_by: "admin-token",
|
||||
expires_at: "2026-05-23T08:48:27Z",
|
||||
session_url: "/workspaces/ws-display/display/session/websockify#token=signed",
|
||||
})
|
||||
.mockResolvedValue({
|
||||
controller: "user",
|
||||
controlled_by: "admin-token",
|
||||
expires_at: "2026-05-23T08:53:27Z",
|
||||
session_url: "/workspaces/ws-display/display/session/websockify#token=signed2",
|
||||
});
|
||||
|
||||
render(<DisplayTab workspaceId="ws-display" />);
|
||||
await waitFor(() => {
|
||||
expect(screen.getByRole("button", { name: "Take control" })).toBeTruthy();
|
||||
});
|
||||
fireEvent.click(screen.getByRole("button", { name: "Take control" }));
|
||||
await waitFor(() => {
|
||||
expect(rfbInstances.length).toBe(1);
|
||||
});
|
||||
expect(mockRFBConstructor.mock.calls[0][2].wsProtocols).toContain("molecule-display-token.signed");
|
||||
|
||||
// An idle/network drop closes the websocket uncleanly. The client must
|
||||
// re-acquire a fresh token and reconnect instead of giving up — this is the
|
||||
// "disconnects every ~5 min and stays dead" report.
|
||||
rfbInstances[0].dispatchEvent(new CustomEvent("disconnect", { detail: { clean: false } }));
|
||||
await waitFor(
|
||||
() => {
|
||||
expect(rfbInstances.length).toBe(2);
|
||||
},
|
||||
{ timeout: 3000 },
|
||||
);
|
||||
// Reconnect dialed with the FRESH token, not the stale original.
|
||||
expect(mockRFBConstructor.mock.calls[1][2].wsProtocols).toContain("molecule-display-token.signed2");
|
||||
|
||||
// A clean disconnect (the user released control) must NOT reconnect.
|
||||
rfbInstances[1].dispatchEvent(new CustomEvent("disconnect", { detail: { clean: true } }));
|
||||
await new Promise((resolve) => setTimeout(resolve, 1100));
|
||||
expect(rfbInstances.length).toBe(2);
|
||||
});
|
||||
});
|
||||
|
||||
function deferred<T>() {
|
||||
|
||||
+8
-25
@@ -159,28 +159,15 @@ services:
|
||||
|
||||
# --- Canvas ---
|
||||
canvas:
|
||||
# The publish-canvas-image CI workflow runs an ORDERED deploy (core#2226):
|
||||
# build → push :staging-<sha> + :staging-latest → (after green main CI)
|
||||
# re-point :latest to the verified :staging-<sha> by digest. So both tags
|
||||
# below resolve to a CI-green, reproducible build, never a raw/red one.
|
||||
#
|
||||
# Reproducible deploy: pin CANVAS_IMAGE_TAG to the immutable per-commit tag
|
||||
# the ordered deploy produced, e.g.
|
||||
# CANVAS_IMAGE_TAG=staging-<sha> docker compose pull canvas && docker compose up -d canvas
|
||||
# This makes a tenant/host deploy reproducible (resolves the standing
|
||||
# `TODO: pin canvas ECR image digest`). Unset it and the default `latest`
|
||||
# is the prod-blessed tag the ordered deploy keeps pointed at the last
|
||||
# green build — still deterministic vs. the old raw `:latest`.
|
||||
#
|
||||
# To pin by content digest instead of tag (fully immutable):
|
||||
# aws ecr describe-images --repository-name molecule-ai/canvas \
|
||||
# --image-tags staging-<sha> --region us-east-2 \
|
||||
# --query 'imageDetails[0].imageDigest' --output text
|
||||
# then set CANVAS_IMAGE_TAG=staging-<sha>@<digest> (compose passes it through).
|
||||
#
|
||||
# The publish-canvas-image CI workflow pushes a fresh image to GHCR on
|
||||
# every canvas/** merge to main. To update the running container:
|
||||
# docker compose pull canvas && docker compose up -d canvas
|
||||
# First-time local setup or testing unreleased changes — build from source:
|
||||
# docker compose build canvas && docker compose up -d canvas
|
||||
# Note: ECR images require AWS auth — `aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 153263036946.dkr.ecr.us-east-2.amazonaws.com` before pull.
|
||||
# Local dev keeps working via the `build:` context below (docker compose build canvas).
|
||||
image: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas:${CANVAS_IMAGE_TAG:-latest}
|
||||
# Digest-pin requires: aws ecr describe-images --repository-name molecule-ai/canvas --image-tags latest --query 'imageDetails[0].imageDigest'
|
||||
# TODO: pin canvas ECR image digest once AWS creds are available in CI.
|
||||
image: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas:latest
|
||||
build:
|
||||
context: ./canvas
|
||||
dockerfile: Dockerfile
|
||||
@@ -188,10 +175,6 @@ services:
|
||||
NEXT_PUBLIC_PLATFORM_URL: ${NEXT_PUBLIC_PLATFORM_URL:-http://localhost:${PLATFORM_PUBLISH_PORT:-8080}}
|
||||
NEXT_PUBLIC_WS_URL: ${NEXT_PUBLIC_WS_URL:-ws://localhost:${PLATFORM_PUBLISH_PORT:-8080}/ws}
|
||||
NEXT_PUBLIC_ADMIN_TOKEN: ${ADMIN_TOKEN:-}
|
||||
# SHA surfaced at /api/buildinfo (core#2235). CI passes the real merge
|
||||
# SHA via the publish-canvas-image workflow build-args; local compose
|
||||
# builds default to "dev" (the route's unwired sentinel).
|
||||
BUILD_SHA: ${BUILD_SHA:-dev}
|
||||
depends_on:
|
||||
platform:
|
||||
condition: service_healthy
|
||||
|
||||
@@ -114,7 +114,7 @@ Opt-in pattern: when `idle_prompt` is non-empty in `config.yaml`, the workspace
|
||||
|
||||
Three Gin middleware classes gate server-side routes. Full contract in `docs/runbooks/admin-auth.md`.
|
||||
|
||||
- **`middleware.AdminAuth(db.DB)`** — strict bearer-only and **fail-closed in every environment** (harden/no-fail-open-auth). Used for any route where a forged request could leak prompts/memory, create/mutate workspaces, or leak ops intel. The former lazy-bootstrap fail-open (pass when `HasAnyLiveTokenGlobal` returns 0) and the dev-mode escape hatch have both been removed — a fresh install must provision `ADMIN_TOKEN` to reach admin routes.
|
||||
- **`middleware.AdminAuth(db.DB)`** — strict bearer-only. Used for any route where a forged request could leak prompts/memory, create/mutate workspaces, or leak ops intel. Lazy-bootstrap fail-open when `HasAnyLiveTokenGlobal` returns 0.
|
||||
- **`middleware.CanvasOrBearer(db.DB)`** — accepts a bearer token OR an Origin matching `CORS_ORIGINS`. Used **only** for cosmetic routes where a forged request has zero data/security impact. Currently only on `PUT /canvas/viewport`. Do not extend this to any route that leaks data or creates resources — see the runbook.
|
||||
- **`middleware.WorkspaceAuth(db.DB)`** — binds a bearer token to `:id`. Workspace A's token cannot hit workspace B's sub-routes. Used for the entire `/workspaces/:id/*` group except the A2A proxy (which has its own `CanCommunicate` layer).
|
||||
|
||||
|
||||
+3
-9
@@ -24,7 +24,7 @@ cd molecule-core
|
||||
|
||||
That single script:
|
||||
|
||||
1. Generates an `ADMIN_TOKEN` into `.env` (first run only — preserved on re-runs) and exports the matching `NEXT_PUBLIC_ADMIN_TOKEN` so the canvas authenticates with it. Auth is **fail-closed in every environment** (including local dev) — there is no dev-mode fail-open; the canvas reaches admin/workspace routes only because it sends this bearer.
|
||||
1. Generates an `ADMIN_TOKEN` into `.env` (first run only — preserved on re-runs)
|
||||
2. Brings up Postgres, Redis, Langfuse, ClickHouse, and Temporal via `infra/scripts/setup.sh`
|
||||
3. Populates the workspace template + plugin registry from `manifest.json`
|
||||
4. Builds and starts the platform on `http://localhost:8080`
|
||||
@@ -62,17 +62,11 @@ If you only want the raw compose flow:
|
||||
docker compose -f docker-compose.infra.yml up -d
|
||||
```
|
||||
|
||||
> **Auth is fail-closed even in local dev.** Pick any local admin token and
|
||||
> set it on *both* sides — the platform (`ADMIN_TOKEN`) and the canvas
|
||||
> (`NEXT_PUBLIC_ADMIN_TOKEN`, same value). Without it the canvas 401s on every
|
||||
> admin/workspace call. (`scripts/dev-start.sh` does this for you; the manual
|
||||
> steps below set it explicitly.)
|
||||
|
||||
### Step 3: Start the platform
|
||||
|
||||
```bash
|
||||
cd workspace-server
|
||||
ADMIN_TOKEN=dev-local-admin-token MOLECULE_ENV=development go run ./cmd/server
|
||||
go run ./cmd/server
|
||||
```
|
||||
|
||||
The control plane listens on `http://localhost:8080`.
|
||||
@@ -84,7 +78,7 @@ In a new terminal:
|
||||
```bash
|
||||
cd canvas
|
||||
npm install
|
||||
NEXT_PUBLIC_ADMIN_TOKEN=dev-local-admin-token npm run dev # MUST match ADMIN_TOKEN above
|
||||
npm run dev
|
||||
```
|
||||
|
||||
Open `http://localhost:3000`.
|
||||
|
||||
@@ -1,29 +1,5 @@
|
||||
# Admin Authentication Runbook
|
||||
|
||||
## Auth is fail-CLOSED in every environment — `ADMIN_TOKEN` is the bootstrap credential
|
||||
|
||||
Per the CTO "nothing should be fail-open" directive, **every** auth path on the
|
||||
workspace-server fails closed — there is no dev-mode / zero-token / DB-outage
|
||||
hatch that grants access. This includes:
|
||||
|
||||
- `AdminAuth` and `WorkspaceAuth` (admin + per-workspace routes),
|
||||
- `CanvasOrBearer` (the cosmetic `PUT /canvas/viewport` route), and
|
||||
- `validateDiscoveryCaller` (`/registry/:id/peers`, `/registry/discover/:id`).
|
||||
|
||||
Consequence for **bootstrap**: a brand-new self-hosted / dev install has **no
|
||||
DB-backed tokens yet**, and there is no longer a fail-open that lets the first
|
||||
request through. The **only** way to reach admin routes (and to mint the first
|
||||
workspace token via `POST /admin/workspaces/:id/tokens`) is to set `ADMIN_TOKEN`
|
||||
in the platform environment and present it as the bearer. This is the "local
|
||||
mimics production" principle: there is no zero-config bootstrap.
|
||||
|
||||
- **Local dev:** `scripts/dev-start.sh` provisions a deterministic
|
||||
`ADMIN_TOKEN` into `.env` (and exports the matching `NEXT_PUBLIC_ADMIN_TOKEN`
|
||||
so the canvas authenticates with it). See `docs/quickstart.md`.
|
||||
- **Self-hosted / SaaS:** set `ADMIN_TOKEN` to a strong random secret
|
||||
(`openssl rand -base64 32`) in the platform env and bake the matching
|
||||
`NEXT_PUBLIC_ADMIN_TOKEN` into the canvas bundle.
|
||||
|
||||
## Required: set `MOLECULE_ENV` in all non-dev environments
|
||||
|
||||
```bash
|
||||
@@ -31,10 +7,8 @@ mimics production" principle: there is no zero-config bootstrap.
|
||||
MOLECULE_ENV=production
|
||||
```
|
||||
|
||||
This matches the production tenant default. NOTE: `MOLECULE_ENV` no longer gates
|
||||
any auth decision — it only drives NON-security local-dev conveniences (loopback
|
||||
bind, relaxed rate limit). Setting it to `dev`/`development` does **not** relax
|
||||
authentication. Staging and production smoke tests should use the real user/API
|
||||
This matches the production tenant default and disables development-only
|
||||
shortcuts. Staging and production smoke tests should use the real user/API
|
||||
workflow: create a workspace, then mint a one-time displayed workspace bearer
|
||||
with `POST /admin/workspaces/:id/tokens`.
|
||||
|
||||
@@ -49,7 +23,5 @@ The platform uses `ADMIN_TOKEN` as the bearer credential for admin-gated endpoin
|
||||
| `POST /org/import` | `Authorization: Bearer <ADMIN_TOKEN>` |
|
||||
| `POST /admin/workspaces/:id/tokens` | `Authorization: Bearer <ADMIN_TOKEN>`; plaintext token returned once |
|
||||
|
||||
Missing or invalid bearer → **401 in every environment** (fail-closed; no
|
||||
dev-mode fail-open). If the auth datastore is unreachable, auth-gated routes
|
||||
return **503** (`platform_unavailable`) — an availability tradeoff that grants no
|
||||
access — rather than allowing the request through.
|
||||
Missing or invalid `ADMIN_TOKEN` → AdminAuth fails open in dev mode (no token set), or
|
||||
returns 401 in production mode (token set but invalid).
|
||||
|
||||
+5
-1
@@ -28,10 +28,14 @@
|
||||
{"name": "claude-code-default", "repo": "molecule-ai/molecule-ai-workspace-template-claude-code", "ref": "main"},
|
||||
{"name": "hermes", "repo": "molecule-ai/molecule-ai-workspace-template-hermes", "ref": "main"},
|
||||
{"name": "openclaw", "repo": "molecule-ai/molecule-ai-workspace-template-openclaw", "ref": "main"},
|
||||
{"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "main"}
|
||||
{"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "main"},
|
||||
{"name": "google-adk", "repo": "molecule-ai/molecule-ai-workspace-template-google-adk", "ref": "main"},
|
||||
{"name": "seo-agent", "repo": "molecule-ai/molecule-ai-workspace-template-seo-agent", "ref": "main"}
|
||||
],
|
||||
"org_templates": [
|
||||
{"name": "molecule-dev", "repo": "molecule-ai/molecule-ai-org-template-molecule-dev", "ref": "main"},
|
||||
{"name": "free-beats-all", "repo": "molecule-ai/molecule-ai-org-template-free-beats-all", "ref": "main"},
|
||||
{"name": "medo-smoke", "repo": "molecule-ai/molecule-ai-org-template-medo-smoke", "ref": "main"},
|
||||
{"name": "molecule-worker-gemini", "repo": "molecule-ai/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"},
|
||||
{"name": "ux-ab-lab", "repo": "molecule-ai/molecule-ai-org-template-ux-ab-lab", "ref": "main"}
|
||||
]
|
||||
|
||||
@@ -1,131 +0,0 @@
|
||||
# Developer SOP — PR review gate auto-fire and stale-head handling
|
||||
|
||||
> Last updated: 2026-06-03 (cp#2159 follow-up)
|
||||
>
|
||||
> Applies to: all core-PR authors and reviewers on `molecule-core` and sibling
|
||||
> repos using the `qa-review` + `security-review` branch-protection gates.
|
||||
|
||||
---
|
||||
|
||||
## 1. Gitea PR-head workflow-selection rule
|
||||
|
||||
**Rule:** For `pull_request_target` and `pull_request_review` events, Gitea
|
||||
loads the workflow definition from the **PR's HEAD branch**, not from the
|
||||
base (`main`) branch.
|
||||
|
||||
This is different from GitHub Actions, where `pull_request_target` always
|
||||
loads workflows from the base branch. Gitea's behaviour means:
|
||||
|
||||
- A PR that was opened **before** the `pull_request_review` trigger was added
|
||||
to `qa-review.yml` / `security-review.yml` will **NOT** auto-fire on review,
|
||||
because its HEAD still contains the old workflow YAML (no trigger).
|
||||
|
||||
- A PR that was opened **after** the trigger was added (or that has been
|
||||
rebased onto a commit containing the trigger) **WILL** auto-fire, because its
|
||||
HEAD contains the new workflow YAML.
|
||||
|
||||
### Ops implication
|
||||
|
||||
| PR head contains `pull_request_review` trigger? | Behaviour on APPROVED review |
|
||||
|---|---|
|
||||
| **Yes** (cut from current main, or rebased) | Workflows auto-queue, evaluate, and POST the `(pull_request_target)` context automatically. No slash-command needed. |
|
||||
| **No** (stale head, opened before #2157) | Nothing fires. Use `/qa-recheck` + `/security-recheck` slash-commands in a PR comment, OR rebase onto current main. |
|
||||
|
||||
---
|
||||
|
||||
## 2. Standard core-PR flow (post-#2157)
|
||||
|
||||
```
|
||||
1. Author opens PR from a branch based on current main
|
||||
→ qa-review + security-review workflows run on pull_request_target
|
||||
→ status contexts post (initial eval, usually red until reviews land)
|
||||
|
||||
2. Reviewers submit real APPROVED reviews
|
||||
→ If PR head has the trigger: workflows AUTO-FIRE on pull_request_review
|
||||
→ Contexts flip green (or stay red if reviewer is not in team)
|
||||
|
||||
3. [Optional] If contexts did not flip (stale head, event lost, etc.):
|
||||
→ Anyone can comment `/qa-recheck` or `/security-recheck`
|
||||
→ sop-checklist.yml refires the evaluator (read-only, idempotent)
|
||||
|
||||
4. Both qa-review + security-review contexts are green
|
||||
→ Plain Do:merge (no force-merge needed)
|
||||
```
|
||||
|
||||
### Key point
|
||||
|
||||
The `/qa-recheck` and `/security-recheck` commands are a **backstop**, not the
|
||||
primary path. PRs cut from current main should auto-fire without manual
|
||||
intervention.
|
||||
|
||||
---
|
||||
|
||||
## 3. Diagnosing a stale head
|
||||
|
||||
If a PR has real team-member APPROVED reviews but the qa/security contexts
|
||||
remain red and no workflow run appears on the PR's "Actions" tab for the
|
||||
review event, the PR head is likely stale.
|
||||
|
||||
### Quick check
|
||||
|
||||
```bash
|
||||
# From the PR page, look at the head commit SHA, then:
|
||||
curl -sS "https://git.moleculesai.app/api/v1/repos/molecule-ai/molecule-core/contents/.gitea/workflows/qa-review.yml?ref=<HEAD_SHA>" \
|
||||
| jq -r '.content' | base64 -d | grep -c 'pull_request_review'
|
||||
# 0 → stale head (no trigger in that version of the workflow)
|
||||
# >0 → trigger present; auto-fire SHOULD work (if it didn't, file a tracker)
|
||||
```
|
||||
|
||||
### Automated diagnostic
|
||||
|
||||
The test suite includes `test_gate_stale_head_diagnostic.py`, which reports
|
||||
"auto-fire impossible for this PR" when the head lacks the trigger. Run it
|
||||
in CI or locally with:
|
||||
|
||||
```bash
|
||||
PR_NUMBER=123 python -m pytest .gitea/scripts/tests/test_gate_stale_head_diagnostic.py -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Rebasing vs. slash-refire
|
||||
|
||||
| Approach | When to use | Trade-off |
|
||||
|---|---|---|
|
||||
| **Rebase onto current main** | PR is genuinely stale (head lacks trigger OR head is far behind main) | Clean history, gets all recent fixes, but requires force-push and re-approval if the branch was protected |
|
||||
| **`/qa-recheck` + `/security-recheck`** | PR head is recent but the review event was missed, or you want to avoid rebase churn | Quick, no force-push, but does NOT fix a missing trigger in the head |
|
||||
|
||||
**Do not** use slash-refire as a substitute for rebasing a stale head. If the
|
||||
workflow YAML in the PR head does not contain `pull_request_review`, no amount
|
||||
of rechecking will make auto-fire work.
|
||||
|
||||
---
|
||||
|
||||
## 5. Live-fire verification
|
||||
|
||||
The `test_gate_auto_fire_live.py` regression test exercises the full runtime
|
||||
path: it submits an APPROVED review to a test PR and polls for the
|
||||
`(pull_request_target)` status contexts. It is skipped when no API token is
|
||||
available, and is intended to catch runtime non-fire that static structural
|
||||
tests (e.g. `test_gate_review_auto_fire.py`) cannot detect.
|
||||
|
||||
Run manually with:
|
||||
|
||||
```bash
|
||||
export GITEA_HOST=git.moleculesai.app
|
||||
export GITEA_TOKEN=<your-token>
|
||||
export REPO=molecule-ai/molecule-core
|
||||
export LIVEFIRE_PR_NUMBER=<test-pr-number>
|
||||
python -m pytest .gitea/scripts/tests/test_gate_auto_fire_live.py -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- #2159 — gate auto-trigger not firing (root cause: stale PR heads lacking
|
||||
the `pull_request_review` trigger, NOT a workflow code defect)
|
||||
- #765 — static structural regression test for gate configuration
|
||||
- #2157 — merged trigger addition (`pull_request_review` types: [submitted])
|
||||
- #2020 — milestone confirming gate infrastructure is stable
|
||||
- RFC#324 — qa-review + security-review design
|
||||
@@ -1,76 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# check-manifest-repos-exist.sh — fail-fast guard: verify every repo listed in
|
||||
# manifest.json actually exists on Gitea before the expensive clone step runs.
|
||||
#
|
||||
# WHY: deleting an org-template/workspace-template repo that is still listed in
|
||||
# manifest.json breaks clone-manifest.sh with a generic git 404 error. The
|
||||
# failure is deep in the publish-workspace-server-image workflow and looks like
|
||||
# a transient network issue, wasting debug time. This script surfaces the
|
||||
# problem immediately with a per-entry ::error:: annotation naming the missing
|
||||
# repo (issue #2192).
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/check-manifest-repos-exist.sh <manifest.json>
|
||||
#
|
||||
# Exit:
|
||||
# 0 all repos exist
|
||||
# 1 one or more repos did not return HTTP 200 (each one is reported on stderr)
|
||||
# 2 bad usage / missing inputs
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
MANIFEST="${1:-manifest.json}"
|
||||
GITEA_API="${GITEA_API:-https://git.moleculesai.app/api/v1/repos}"
|
||||
|
||||
if [ ! -f "$MANIFEST" ]; then
|
||||
echo "::error::manifest not found: $MANIFEST" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Strip JSON5-style // comments before parsing (same as clone-manifest.sh)
|
||||
_strip_comments() {
  # Print the manifest file named by the global $MANIFEST to stdout, blanking
  # every line whose first non-whitespace characters are `//`. Only whole-line
  # comments are affected: a `//` that follows other content on a line (for
  # example inside a URL value) is left untouched. Blanked lines are emitted
  # as empty lines, so line numbering of the output matches the input.
  sed -e 's|^[[:space:]]*//.*||' "$MANIFEST"
}
|
||||
|
||||
MANIFEST_JSON="$(_strip_comments)"
|
||||
|
||||
MISSING=0
|
||||
TOTAL=0
|
||||
|
||||
# Categories to check — must match clone-manifest.sh categories
|
||||
check_category() {
  # Probe every entry of one manifest category against the Gitea API.
  #
  # Args:
  #   $1  top-level manifest key whose value is an array of objects carrying
  #       `name` and `repo` fields (e.g. "plugins").
  #
  # Globals read:    MANIFEST_JSON (comment-stripped manifest text), GITEA_API
  # Globals updated: TOTAL   (+1 per entry examined)
  #                  MISSING (+1 per entry whose probe was not HTTP 200)
  # Output: one ::error:: annotation on stderr per failing entry.
  local category="$1"
  # All working variables are function-local so nothing leaks into the
  # caller's scope. They are declared separately from their assignments so a
  # failing jq still aborts the script under `set -e` instead of being masked
  # by the exit status of `local`.
  local count name repo http_code
  local i=0

  count=$(jq -r ".${category} | length" <<<"$MANIFEST_JSON")

  while [ "$i" -lt "$count" ]; do
    name=$(jq -r ".${category}[$i].name" <<<"$MANIFEST_JSON")
    repo=$(jq -r ".${category}[$i].repo" <<<"$MANIFEST_JSON")
    TOTAL=$((TOTAL + 1))

    # Check repo existence via Gitea API (public endpoint, no auth needed).
    # `|| true` keeps a curl transport failure (timeout, DNS, refused) from
    # aborting the whole script; the captured code is then not "200", so the
    # entry is reported below like any other non-200 response.
    http_code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "${GITEA_API}/${repo}" 2>/dev/null || true)

    if [ "$http_code" != "200" ]; then
      echo "::error::manifest.json ${category} entry '${name}' → repo '${repo}' returned HTTP ${http_code} (expected 200). Delete the manifest entry BEFORE deleting the repo." >&2
      MISSING=$((MISSING + 1))
    fi

    i=$((i + 1))
  done
}
|
||||
|
||||
echo "==> Checking manifest repo existence against ${GITEA_API} ..."

# Walk the manifest categories in a fixed order; check_category accumulates
# its findings in the TOTAL and MISSING counters.
for _category in plugins workspace_templates org_templates; do
  check_category "$_category"
done

# Success path first: nothing missing means the guard passes.
if [ "$MISSING" -eq 0 ]; then
  echo "✓ All ${TOTAL} manifest entries resolved (HTTP 200)."
  exit 0
fi

# At least one entry failed its probe — summarise and fail the job.
echo "::error::${MISSING}/${TOTAL} manifest entries are missing — fix manifest.json before publishing." >&2
exit 1
|
||||
+22
-51
@@ -46,67 +46,46 @@ cleanup() {
|
||||
trap cleanup EXIT INT TERM
|
||||
|
||||
# ─────────────────────────────────────────────── 1. dev-mode auth posture
|
||||
|
||||
# The AdminAuth middleware closes its fail-open the moment the first
|
||||
# workspace token lands in the DB — at which point /workspaces and
|
||||
# other admin routes 401 unless the caller has either ADMIN_TOKEN or
|
||||
# the dev-mode escape hatch. The canvas at localhost:3000 has no
|
||||
# bearer token to send, so without one of those two paths it can't
|
||||
# call admin endpoints after a workspace exists.
|
||||
#
|
||||
# SECURITY (harden/no-fail-open-auth): the workspace-server auth chain is
|
||||
# now fail-CLOSED in EVERY environment, dev included. There is NO dev-mode
|
||||
# fail-open escape hatch anymore — AdminAuth / WorkspaceAuth / discovery all
|
||||
# require a real credential. So local dev must AUTHENTICATE, not run open.
|
||||
# For local dev the right posture is the dev-mode escape hatch:
|
||||
#
|
||||
# The clean way to keep the canvas working locally is to provision a
|
||||
# deterministic ADMIN_TOKEN and hand the matching NEXT_PUBLIC_ADMIN_TOKEN to
|
||||
# the canvas bundle. The canvas already attaches `Authorization: Bearer
|
||||
# $NEXT_PUBLIC_ADMIN_TOKEN` on every platform call (canvas/src/lib/api.ts),
|
||||
# and next.config.ts warns if the pair is half-set. We set BOTH here.
|
||||
# MOLECULE_ENV=development AND ADMIN_TOKEN unset
|
||||
#
|
||||
# MOLECULE_ENV=development — dev conveniences (loopback bind, relaxed
|
||||
# rate limit). NOT an auth lever.
|
||||
# ADMIN_TOKEN=<dev value> — server-side bearer AdminAuth/WorkspaceAuth
|
||||
# enforce (Tier-2b). Real credential.
|
||||
# NEXT_PUBLIC_ADMIN_TOKEN — same value, baked into the canvas bundle so
|
||||
# the browser sends the matching bearer.
|
||||
# That makes middleware.isDevModeFailOpen() return true and lets the
|
||||
# canvas keep working without a bearer. Setting ADMIN_TOKEN here
|
||||
# would BREAK the canvas (it has no way to read that token in dev).
|
||||
#
|
||||
# For SaaS the platform is provisioned with a random ADMIN_TOKEN + the
|
||||
# canvas image baked with the matching NEXT_PUBLIC_ADMIN_TOKEN, plus
|
||||
# MOLECULE_ENV=production. Same shape, stronger secret.
|
||||
# For SaaS the platform is provisioned with ADMIN_TOKEN set AND
|
||||
# MOLECULE_ENV=production — either one closes the hatch. So the dev
|
||||
# mode signal here is safe (it's only active when both other knobs
|
||||
# are absent).
|
||||
if [ -f "$ENV_FILE" ] && grep -q '^MOLECULE_ENV=' "$ENV_FILE"; then
|
||||
echo "==> Reusing MOLECULE_ENV from existing .env"
|
||||
else
|
||||
echo "==> Setting MOLECULE_ENV=development in .env"
|
||||
echo "==> Setting MOLECULE_ENV=development in .env (dev-mode auth hatch)"
|
||||
{
|
||||
if [ -f "$ENV_FILE" ]; then
|
||||
cat "$ENV_FILE"
|
||||
echo ""
|
||||
fi
|
||||
echo "# Generated by scripts/dev-start.sh on $(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
||||
echo "# Local-dev conveniences (loopback bind, relaxed rate limit)."
|
||||
echo "# Auth is fail-closed even in dev — see ADMIN_TOKEN below."
|
||||
echo "# Local-dev auth posture: dev-mode fail-open lets the canvas at"
|
||||
echo "# localhost:3000 call admin endpoints without a bearer token."
|
||||
echo "# DO NOT set ADMIN_TOKEN here in dev — it would close the hatch"
|
||||
echo "# and the canvas would 401 on every admin call."
|
||||
echo "MOLECULE_ENV=development"
|
||||
} > "$ENV_FILE.tmp"
|
||||
mv "$ENV_FILE.tmp" "$ENV_FILE"
|
||||
echo " Saved to $ENV_FILE"
|
||||
fi
|
||||
|
||||
# Provision a deterministic dev ADMIN_TOKEN (idempotent — preserved across
|
||||
# re-runs). This is the credential the canvas authenticates with locally; it
|
||||
# is NOT a secret (it only guards your own localhost stack), so a fixed,
|
||||
# well-known value is fine and keeps re-runs reproducible.
|
||||
DEV_ADMIN_TOKEN="dev-local-admin-token"
|
||||
if [ -f "$ENV_FILE" ] && grep -q '^ADMIN_TOKEN=' "$ENV_FILE"; then
|
||||
echo "==> Reusing ADMIN_TOKEN from existing .env"
|
||||
else
|
||||
echo "==> Provisioning dev ADMIN_TOKEN in .env (fail-closed auth, authenticated canvas)"
|
||||
{
|
||||
cat "$ENV_FILE"
|
||||
echo ""
|
||||
echo "# Dev ADMIN_TOKEN — the canvas authenticates with this locally."
|
||||
echo "# Auth is fail-closed; without a matching bearer the canvas 401s."
|
||||
echo "# Fixed value is fine: it only guards your localhost stack."
|
||||
echo "ADMIN_TOKEN=$DEV_ADMIN_TOKEN"
|
||||
} > "$ENV_FILE.tmp"
|
||||
mv "$ENV_FILE.tmp" "$ENV_FILE"
|
||||
echo " Saved to $ENV_FILE"
|
||||
fi
|
||||
|
||||
# Source .env so the platform inherits ADMIN_TOKEN (and anything else
|
||||
# the user has added — e.g. ANTHROPIC_API_KEY for skipping the canvas
|
||||
# Secrets UI). `set -a` exports every assignment in the sourced file
|
||||
@@ -116,12 +95,6 @@ set -a
|
||||
. "$ENV_FILE"
|
||||
set +a
|
||||
|
||||
# The canvas reads NEXT_PUBLIC_ADMIN_TOKEN at build/dev time and attaches it
|
||||
# as the bearer on every platform call. Mirror the server-side ADMIN_TOKEN
|
||||
# into it so the matched-pair guard in canvas/next.config.ts is satisfied and
|
||||
# the browser authenticates. Exported for the `npm run dev` child below.
|
||||
export NEXT_PUBLIC_ADMIN_TOKEN="$ADMIN_TOKEN"
|
||||
|
||||
# ─────────────────────────────────────────────── 2. infra + templates
|
||||
|
||||
# Use setup.sh (not raw docker-compose) so the template registry gets
|
||||
@@ -222,9 +195,7 @@ cat <<EOF
|
||||
Molecule AI dev environment ready
|
||||
|
||||
Canvas: http://localhost:3000
|
||||
Platform: http://localhost:8080 (bound to loopback in dev)
|
||||
Auth: fail-closed — canvas authenticates with the dev ADMIN_TOKEN
|
||||
(ADMIN_TOKEN + NEXT_PUBLIC_ADMIN_TOKEN, see .env)
|
||||
Platform: http://localhost:8080
|
||||
Logs: /tmp/molecule-platform.log
|
||||
/tmp/molecule-canvas.log
|
||||
|
||||
|
||||
+3
-49
@@ -17,33 +17,6 @@ e2e_extract_token() {
|
||||
python3 "$(dirname "${BASH_SOURCE[0]}")/_extract_token.py"
|
||||
}
|
||||
|
||||
# Populate a curl-args array with the platform admin bearer, IF one is set.
|
||||
#
|
||||
# AdminAuth (workspace-server/internal/middleware/wsauth_middleware.go:161)
|
||||
# fail-opens ONLY while ADMIN_TOKEN is unset AND no workspace token exists yet
|
||||
# (devmode.go:50). The e2e-api CI job now sets ADMIN_TOKEN on the platform and
|
||||
# exports the matching MOLECULE_ADMIN_TOKEN here, which flips fail-open OFF — so
|
||||
# every admin-gated route (GET/POST/DELETE /workspaces, /events, /bundles,
|
||||
# /org/import, …) now requires the EXACT ADMIN_TOKEN as bearer (Tier-2b rejects
|
||||
# workspace bearers, wsauth_middleware.go:250). Helpers that hit admin routes
|
||||
# (e2e_cleanup_all_workspaces, e2e_delete_workspace's default path) must send it.
|
||||
#
|
||||
# Guarded if-set so a bootstrap/dev platform with no admin token (fail-open)
|
||||
# still works with zero auth. Mirrors e2e_mint_workspace_token's admin_auth.
|
||||
#
|
||||
# Usage:
|
||||
# local admin_auth=(); e2e_admin_auth_args admin_auth
|
||||
# curl -s "$BASE/workspaces" ${admin_auth[@]+"${admin_auth[@]}"}
|
||||
e2e_admin_auth_args() {
  # Fill the caller's array (named by $1) with curl arguments carrying the
  # platform admin bearer, or with nothing when no admin token is configured.
  #
  # Args:
  #   $1  NAME of the array variable to assign in the caller's scope.
  #
  # Token lookup order: MOLECULE_ADMIN_TOKEN, then ADMIN_TOKEN; an unset or
  # empty value in both yields an empty array.
  #
  # The array is assigned through eval on the variable name; $_bearer is
  # escaped inside the eval string so its value is expanded only when the
  # assignment itself runs, never re-parsed as shell syntax.
  local _outname="$1"
  local _bearer="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"

  # No token available: hand back an empty array so callers send no auth.
  if [ -z "$_bearer" ]; then
    eval "$_outname=()"
    return 0
  fi

  eval "$_outname=(-H \"Authorization: Bearer \$_bearer\")"
}
|
||||
|
||||
# Delete every workspace currently on the platform. Use at the top of a
|
||||
# script so count-based assertions are reproducible across runs.
|
||||
# Mint a fresh workspace auth token via the real admin endpoint.
|
||||
@@ -80,38 +53,19 @@ e2e_delete_workspace() {
|
||||
if [ -z "$wid" ]; then
|
||||
return 0
|
||||
fi
|
||||
# DELETE /workspaces/:id and GET /workspaces/:id-for-name are both behind
|
||||
# AdminAuth (router.go:155 GET single is public, but List/Delete are gated at
|
||||
# router.go:165-167). Callers that already pass a per-workspace bearer (e.g.
|
||||
# test_api.sh's NEW_TOKEN) authenticate themselves; the cleanup-trap callers
|
||||
# in poll-mode/notify/priority pass NO curl args and rely on this fallback to
|
||||
# the platform admin bearer so the DELETE doesn't 401 once ADMIN_TOKEN is set.
|
||||
if [ "${#curl_args[@]}" -eq 0 ]; then
|
||||
e2e_admin_auth_args curl_args
|
||||
fi
|
||||
# ${curl_args[@]+"…"} guard: under `set -u` an empty array expands to an
|
||||
# "unbound variable" error on bash <4.4 (macOS 3.2, some Linux). This form
|
||||
# expands to nothing when the array is empty. Callers from the priority-
|
||||
# runtimes EXIT trap pass no extra curl args, so the array IS empty there —
|
||||
# without the guard the trap aborts non-zero AFTER the gate already passed,
|
||||
# turning a validated run RED. (Same idiom already used for CREATED_WSIDS.)
|
||||
if [ -z "$name" ]; then
|
||||
name=$(curl -s "$BASE/workspaces/$wid" ${curl_args[@]+"${curl_args[@]}"} | python3 -c "import json,sys
|
||||
name=$(curl -s "$BASE/workspaces/$wid" "${curl_args[@]}" | python3 -c "import json,sys
|
||||
try:
|
||||
print(json.load(sys.stdin).get('name',''))
|
||||
except Exception:
|
||||
pass" 2>/dev/null || true)
|
||||
fi
|
||||
curl -s -X DELETE "$BASE/workspaces/$wid?confirm=true" \
|
||||
-H "X-Confirm-Name: $name" ${curl_args[@]+"${curl_args[@]}"} > /dev/null || true
|
||||
-H "X-Confirm-Name: $name" "${curl_args[@]}" > /dev/null || true
|
||||
}
|
||||
|
||||
e2e_cleanup_all_workspaces() {
|
||||
# GET /workspaces (list) is AdminAuth-gated (router.go:165). Send the platform
|
||||
# admin bearer if one is set so the list doesn't 401 → empty → no cleanup.
|
||||
local _admin_auth=()
|
||||
e2e_admin_auth_args _admin_auth
|
||||
curl -s "$BASE/workspaces" ${_admin_auth[@]+"${_admin_auth[@]}"} | python3 -c "import json,sys
|
||||
curl -s "$BASE/workspaces" | python3 -c "import json,sys
|
||||
try:
|
||||
[print(f\"{w.get('id','')}\\t{w.get('name','')}\") for w in json.load(sys.stdin)]
|
||||
except Exception:
|
||||
|
||||
@@ -11,10 +11,7 @@
|
||||
# default + 401, see PR #1714.)
|
||||
#
|
||||
# claude-code → auth-aware:
|
||||
# E2E_MINIMAX_API_KEY → "minimax:MiniMax-M2.7"
|
||||
# (colon-namespaced BYOK id; bare
|
||||
# "MiniMax-M2" 400s on a deploy-skewed
|
||||
# staging registry — #2263)
|
||||
# E2E_MINIMAX_API_KEY → "MiniMax-M2"
|
||||
# E2E_ANTHROPIC_API_KEY → "claude-sonnet-4-6"
|
||||
# otherwise → "sonnet"
|
||||
#
|
||||
@@ -26,76 +23,28 @@
|
||||
# their provider entries, otherwise the workspace boots
|
||||
# reachable but the first A2A call hits the wrong auth path.
|
||||
#
|
||||
# PLATFORM-MANAGED path (E2E_LLM_PATH=platform) — the moonshot/kimi
|
||||
# NOT_CONFIGURED regression (RFC#340 Fix A #2187):
|
||||
#
|
||||
# The branches above all exercise BYOK: a tenant key (MINIMAX/ANTHROPIC/
|
||||
# OPENAI) is injected as a workspace secret and the model id resolves to that
|
||||
# vendor's *BYOK* provider entry. That path NEVER exercises the platform arm —
|
||||
# the exact arm that booted "moonshot/kimi-k2.6" into NOT_CONFIGURED in prod,
|
||||
# because the generated config.yaml lacked the derived `provider: platform`.
|
||||
#
|
||||
# E2E_LLM_PATH=platform selects a platform-managed model id (slash-namespaced,
|
||||
# no tenant key — Molecule owns billing via the CP LLM proxy). The default is
|
||||
# "moonshot/kimi-k2.6", the headline incident combo. Override the specific
|
||||
# platform model with E2E_MODEL_SLUG. The provision branch in
|
||||
# test_staging_full_saas.sh sends NO secrets for this path (platform-managed
|
||||
# needs none), so the workspace must boot online purely on the proxy env the
|
||||
# control plane injects + the manifest-derived `provider: platform` that Fix A
|
||||
# stamps. That is the REAL boot-path assertion the deterministic unit test
|
||||
# (workspace_provision_platform_boot_test.go) cannot make.
|
||||
#
|
||||
# When E2E_MODEL_SLUG is set, it overrides this dispatch entirely — useful when
|
||||
# an operator dispatches the workflow to test a specific slug (or a specific
|
||||
# platform model id).
|
||||
# When E2E_MODEL_SLUG is set, it overrides this dispatch — useful when an
|
||||
# operator dispatches the workflow to test a specific slug.
|
||||
#
|
||||
# Unit tested by tests/e2e/test_model_slug.sh — every branch must stay
|
||||
# pinned because regressions silently mask as "Could not resolve
|
||||
# authentication method" + the synth-E2E gate goes red without naming
|
||||
# the slug-format mismatch.
|
||||
|
||||
# Default platform-managed model for the platform-boot regression path. The
|
||||
# exact id that booted NOT_CONFIGURED in prod. Must stay a member of the
|
||||
# claude-code `platform` arm in workspace-server/internal/providers/providers.yaml
|
||||
# (the deterministic suite TestEnsureDefaultConfig_StampsProviderForEverySSOTPlatformModel
|
||||
# enforces every member of that arm derives provider=platform). Resolved INSIDE
|
||||
# pick_model_slug via ${E2E_DEFAULT_PLATFORM_MODEL:-...} so callers can override
|
||||
# it (or unset it) without tripping `set -u`.
|
||||
E2E_DEFAULT_PLATFORM_MODEL_FALLBACK="moonshot/kimi-k2.6"
|
||||
|
||||
# Usage: pick_model_slug <runtime>
|
||||
# stdout: the slug string
|
||||
# E2E_MODEL_SLUG (env): if set + non-empty, used as-is (operator override)
|
||||
# E2E_LLM_PATH=platform (env): select the platform-managed model id
|
||||
# (E2E_DEFAULT_PLATFORM_MODEL) instead of a BYOK slug. Takes precedence over
|
||||
# the per-key BYOK branches; E2E_MODEL_SLUG still wins over everything.
|
||||
pick_model_slug() {
|
||||
local runtime="${1:-}"
|
||||
if [ -n "${E2E_MODEL_SLUG:-}" ]; then
|
||||
printf '%s' "$E2E_MODEL_SLUG"
|
||||
return 0
|
||||
fi
|
||||
# Platform-managed path: the slash-namespaced platform model, no tenant key.
|
||||
# Exercises the arm the moonshot/kimi NOT_CONFIGURED bug shipped on.
|
||||
if [ "${E2E_LLM_PATH:-}" = "platform" ]; then
|
||||
printf '%s' "${E2E_DEFAULT_PLATFORM_MODEL:-$E2E_DEFAULT_PLATFORM_MODEL_FALLBACK}"
|
||||
return 0
|
||||
fi
|
||||
case "$runtime" in
|
||||
hermes) printf 'openai/gpt-4o' ;;
|
||||
claude-code)
|
||||
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
||||
# Namespaced (colon) BYOK id, not bare "MiniMax-M2" (#2263 deploy-skew):
|
||||
# bare ids can lag the deployed staging ws-server's compiled registry,
|
||||
# so workspace-create's validateRegisteredModelForRuntime 400s the bare
|
||||
# form on an older image. The colon-namespaced `minimax:MiniMax-M2.7`
|
||||
# resolves the same way the proven-working sibling `moonshot/kimi-k2.6`
|
||||
# does. It stays in the BYOK `minimax` arm (providers.yaml:851), so
|
||||
# DeriveProvider -> provider_selection=minimax (BYOK) and the #1994
|
||||
# byok-not-platform guard (test_staging_full_saas.sh:1000) still passes —
|
||||
# unlike the slash/platform form `minimax/MiniMax-M2.7`, which resolves
|
||||
# to provider=platform and would trip that guard.
|
||||
printf 'minimax:MiniMax-M2.7'
|
||||
printf 'MiniMax-M2'
|
||||
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
|
||||
printf 'claude-sonnet-4-6'
|
||||
else
|
||||
|
||||
+41
-61
@@ -15,27 +15,18 @@ SUM_AUTH=()
|
||||
ECHO_URL="https://example.com/echo-agent"
|
||||
SUM_URL="https://example.com/summarizer-agent"
|
||||
|
||||
# AdminAuth-gated calls (GET/POST/DELETE /workspaces, /events, /bundles)
|
||||
# require the platform admin bearer once ADMIN_TOKEN is set on the server.
|
||||
# Tier-2b (wsauth_middleware.go:250) REJECTS workspace bearer tokens on admin
|
||||
# routes when ADMIN_TOKEN is set, so admin calls MUST send the exact ADMIN_TOKEN
|
||||
# value — which the e2e-api CI job exports here as MOLECULE_ADMIN_TOKEN. acurl =
|
||||
# "admin curl": it always sends the platform admin bearer (if one is set).
|
||||
#
|
||||
# Guarded if-set: a fresh self-hosted/dev platform with no ADMIN_TOKEN fail-opens
|
||||
# (devmode.go:50), so sending no bearer still works there.
|
||||
ADMIN_BEARER="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
|
||||
ADMIN_AUTH=()
|
||||
[ -n "$ADMIN_BEARER" ] && ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_BEARER")
|
||||
# AdminAuth-gated calls need a bearer token once any workspace token
|
||||
# exists in the DB. ADMIN_TOKEN is populated after the first workspace
|
||||
# create + real token mint. acurl = "authenticated curl".
|
||||
ADMIN_TOKEN=""
|
||||
acurl() {
|
||||
curl -s ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} "$@"
|
||||
if [ -n "$ADMIN_TOKEN" ]; then
|
||||
curl -s -H "Authorization: Bearer $ADMIN_TOKEN" "$@"
|
||||
else
|
||||
curl -s "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
# WORKSPACE_TOKEN holds a per-workspace bearer for the WorkspaceAuth-gated
|
||||
# routes (PATCH /workspaces/:id, /activity, …). It is set after the first
|
||||
# create+mint and is NOT interchangeable with the admin bearer.
|
||||
WORKSPACE_TOKEN=""
|
||||
|
||||
# Pre-test cleanup: remove any workspaces left over from prior runs so
|
||||
# count-based assertions ("empty", "count=2") are reproducible.
|
||||
e2e_cleanup_all_workspaces
|
||||
@@ -66,22 +57,19 @@ check "GET /health" '"status":"ok"' "$R"
|
||||
R=$(acurl "$BASE/workspaces")
|
||||
check "GET /workspaces (empty)" '[]' "$R"
|
||||
|
||||
# Test 3: Create workspace A. POST /workspaces is AdminAuth-gated (router.go:166);
|
||||
# send the admin bearer (acurl). On a fail-open dev platform acurl sends nothing
|
||||
# and the create still works.
|
||||
R=$(acurl -X POST "$BASE/workspaces" -H "Content-Type: application/json" -d '{"name":"Echo Agent","tier":1,"runtime":"external","external":true}')
|
||||
# Test 3: Create workspace A (AdminAuth fail-open — no tokens exist yet)
|
||||
R=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" -d '{"name":"Echo Agent","tier":1,"runtime":"external","external":true}')
|
||||
check "POST /workspaces (create echo)" '"status":"awaiting_agent"' "$R"
|
||||
ECHO_ID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])")
|
||||
|
||||
# Per-workspace token for Echo, for the WorkspaceAuth-gated routes below.
|
||||
WORKSPACE_TOKEN=$(echo "$R" | e2e_extract_token)
|
||||
if [ -z "$WORKSPACE_TOKEN" ]; then
|
||||
WORKSPACE_TOKEN=$(e2e_mint_workspace_token "$ECHO_ID" 2>/dev/null || echo "")
|
||||
ADMIN_TOKEN=$(echo "$R" | e2e_extract_token)
|
||||
if [ -z "$ADMIN_TOKEN" ]; then
|
||||
ADMIN_TOKEN=$(e2e_mint_workspace_token "$ECHO_ID" 2>/dev/null || echo "")
|
||||
fi
|
||||
if [ -n "$WORKSPACE_TOKEN" ]; then
|
||||
echo " (acquired Echo workspace token: ${WORKSPACE_TOKEN:0:8}...)"
|
||||
if [ -n "$ADMIN_TOKEN" ]; then
|
||||
echo " (acquired admin token: ${ADMIN_TOKEN:0:8}...)"
|
||||
else
|
||||
echo " WARNING: no Echo workspace token acquired — WorkspaceAuth calls will fail"
|
||||
echo " WARNING: no admin token acquired — subsequent AdminAuth calls will fail"
|
||||
fi
|
||||
|
||||
# Test 4: Create workspace B (needs bearer — tokens now exist in DB)
|
||||
@@ -110,7 +98,7 @@ check "GET /workspaces/:id (agent_card null)" '"agent_card":null' "$R"
|
||||
# Test 7: Register echo — use workspace-specific token (from real admin
|
||||
# endpoint), not the admin token. C18 requires a token issued TO THIS
|
||||
# workspace, not just any valid token.
|
||||
ECHO_WS_TOKEN="$WORKSPACE_TOKEN"
|
||||
ECHO_WS_TOKEN="$ADMIN_TOKEN"
|
||||
[ -n "$ECHO_WS_TOKEN" ] && ECHO_AUTH=(-H "Authorization: Bearer $ECHO_WS_TOKEN")
|
||||
R=$(curl -s -X POST "$BASE/registry/register" -H "Content-Type: application/json" \
|
||||
"${ECHO_AUTH[@]}" \
|
||||
@@ -171,29 +159,26 @@ R=$(curl -s -X POST "$BASE/registry/check-access" -H "Content-Type: application/
|
||||
-d "{\"caller_id\":\"$ECHO_ID\",\"target_id\":\"$SUM_ID\"}")
|
||||
check "POST /registry/check-access (same-org allowed)" '"allowed":true' "$R"
|
||||
|
||||
# Test 15: PATCH workspace (update position). PATCH /workspaces/:id is
|
||||
# WorkspaceAuth-gated (router.go:227 — #680 IDOR fix), so it needs Echo's OWN
|
||||
# bearer, NOT the admin bearer (WorkspaceAuth rejects the admin token).
|
||||
R=$(curl -s "${ECHO_AUTH[@]}" -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"x":100,"y":200}')
|
||||
# Test 15: PATCH workspace (update position)
|
||||
R=$(acurl -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"x":100,"y":200}')
|
||||
check "PATCH /workspaces/:id (position)" '"status":"updated"' "$R"
|
||||
|
||||
R=$(acurl "$BASE/workspaces/$ECHO_ID")
|
||||
check "Position saved (x=100)" '"x":100' "$R"
|
||||
check "Position saved (y=200)" '"y":200' "$R"
|
||||
|
||||
# Test 16: PATCH workspace (update name) — WorkspaceAuth-gated; use Echo's token.
|
||||
R=$(curl -s "${ECHO_AUTH[@]}" -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"name":"Echo Agent v2"}')
|
||||
# Test 16: PATCH workspace (update name)
|
||||
R=$(acurl -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"name":"Echo Agent v2"}')
|
||||
check "PATCH /workspaces/:id (name)" '"status":"updated"' "$R"
|
||||
|
||||
R=$(acurl "$BASE/workspaces/$ECHO_ID")
|
||||
check "Name updated" '"name":"Echo Agent v2"' "$R"
|
||||
|
||||
# Test 17: Events (#165 / PR #167 — admin-gated; the admin bearer is required,
|
||||
# and Tier-2b rejects a workspace bearer here, so use acurl's admin token alone).
|
||||
R=$(acurl "$BASE/events")
|
||||
# Test 17: Events (#165 / PR #167 — now admin-gated, bearer required)
|
||||
R=$(acurl "$BASE/events" -H "Authorization: Bearer $ECHO_TOKEN")
|
||||
check "GET /events (has events)" 'WORKSPACE_ONLINE' "$R"
|
||||
|
||||
R=$(acurl "$BASE/events/$ECHO_ID")
|
||||
R=$(acurl "$BASE/events/$ECHO_ID" -H "Authorization: Bearer $ECHO_TOKEN")
|
||||
check "GET /events/:id (has events for echo)" 'WORKSPACE_ONLINE' "$R"
|
||||
|
||||
# Test 18: Update card
|
||||
@@ -310,7 +295,7 @@ check "active_tasks cleared" '"active_tasks":0' "$R"
|
||||
# endpoint is admin-auth gated and keeps the full record, so operators
|
||||
# can still see task progress from the dashboard without exposing it
|
||||
# over the public per-workspace GET.
|
||||
R=$(acurl "$BASE/workspaces")
|
||||
R=$(curl -s "$BASE/workspaces" -H "Authorization: Bearer $ECHO_TOKEN")
|
||||
check "current_task in list response" '"current_task"' "$R"
|
||||
|
||||
# Test 21: Delete
|
||||
@@ -321,20 +306,18 @@ check "current_task in list response" '"current_task"' "$R"
|
||||
# Delete the CHILD (Summarizer) here instead: a child delete does NOT cascade
|
||||
# upward, so the parent Echo survives and count=1 holds. The bundle round-trip
|
||||
# below needs Summarizer's exported config, so capture it BEFORE this delete.
|
||||
# GET /bundles/export/:id is admin-gated (router.go:741) — use the admin bearer.
|
||||
BUNDLE=$(acurl "$BASE/bundles/export/$SUM_ID")
|
||||
BUNDLE=$(curl -s "$BASE/bundles/export/$SUM_ID" -H "Authorization: Bearer $SUM_TOKEN")
|
||||
check "GET /bundles/export/:id" '"name":"Summarizer Agent"' "$BUNDLE"
|
||||
ORIG_NAME=$(echo "$BUNDLE" | python3 -c "import sys,json; print(json.load(sys.stdin)['name'])")
|
||||
ORIG_TIER=$(echo "$BUNDLE" | python3 -c "import sys,json; print(json.load(sys.stdin)['tier'])")
|
||||
|
||||
# DELETE /workspaces/:id is admin-gated (router.go:167). X-Confirm-Name must
|
||||
# still match the workspace name even with admin auth.
|
||||
R=$(acurl -X DELETE "$BASE/workspaces/$SUM_ID?confirm=true" \
|
||||
-H "Authorization: Bearer $SUM_TOKEN" \
|
||||
-H "X-Confirm-Name: Summarizer Agent")
|
||||
check "DELETE /workspaces/:id" '"status":"removed"' "$R"
|
||||
|
||||
# Parent Echo must survive a child delete — list (admin) and expect count=1.
|
||||
R=$(acurl "$BASE/workspaces")
|
||||
# Parent Echo must survive a child delete — list as Echo and expect count=1.
|
||||
R=$(curl -s "$BASE/workspaces" -H "Authorization: Bearer $ECHO_TOKEN")
|
||||
COUNT=$(echo "$R" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))")
|
||||
check "List after delete (count=1)" "1" "$COUNT"
|
||||
|
||||
@@ -345,21 +328,21 @@ check "List after delete (count=1)" "1" "$COUNT"
|
||||
echo ""
|
||||
echo "--- Bundle Round-Trip Test ---"
|
||||
|
||||
# Delete the remaining parent Echo — DELETE is admin-gated (router.go:167);
|
||||
# the platform admin bearer (acurl) authorizes it. X-Confirm-Name still required.
|
||||
# Delete the remaining parent Echo — use ECHO_TOKEN (per-workspace) for
|
||||
# WorkspaceAuth and ADMIN_TOKEN for the AdminAuth layer.
|
||||
R=$(acurl -X DELETE "$BASE/workspaces/$ECHO_ID?confirm=true" \
|
||||
-H "Authorization: Bearer $ECHO_TOKEN" \
|
||||
-H "X-Confirm-Name: Echo Agent v2")
|
||||
check "Delete before re-import" '"status":"removed"' "$R"
|
||||
|
||||
# Both workspaces are now deleted. The platform-level ADMIN_TOKEN env is still
|
||||
# set, so admin routes still require the admin bearer (fail-open does NOT
|
||||
# re-engage just because the token table emptied) — keep using acurl's bearer.
|
||||
# After deleting both workspaces, all per-workspace tokens are revoked.
|
||||
# Clear the now-revoked admin bearer so acurl can use fresh-install fail-open.
|
||||
ADMIN_TOKEN=""
|
||||
R=$(acurl "$BASE/workspaces")
|
||||
COUNT=$(echo "$R" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))")
|
||||
check "All workspaces deleted (count=0)" "0" "$COUNT"
|
||||
|
||||
# Re-import from the exported bundle. POST /bundles/import is admin-gated
|
||||
# (router.go:742) — acurl sends the admin bearer.
|
||||
# Re-import from the exported bundle (AdminAuth fail-open — no live tokens)
|
||||
R=$(acurl -X POST "$BASE/bundles/import" -H "Content-Type: application/json" -d "$BUNDLE")
|
||||
check "POST /bundles/import" '"status":"provisioning"' "$R"
|
||||
NEW_ID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin)['workspace_id'])")
|
||||
@@ -415,15 +398,12 @@ check "Register re-imported workspace" '"status":"registered"' "$R"
|
||||
REG_NEW_TOKEN=$(echo "$R" | e2e_extract_token)
|
||||
[ -n "$REG_NEW_TOKEN" ] && NEW_TOKEN="$REG_NEW_TOKEN"
|
||||
|
||||
# Re-export and verify agent_card survives the round-trip (#165 / PR #167 —
|
||||
# GET /bundles/export/:id is admin-gated; use the admin bearer).
|
||||
REBUNDLE=$(acurl "$BASE/bundles/export/$NEW_ID")
|
||||
# Re-export and verify agent_card survives the round-trip (#165 / PR #167 — admin-gated)
|
||||
REBUNDLE=$(curl -s "$BASE/bundles/export/$NEW_ID" -H "Authorization: Bearer $NEW_TOKEN")
|
||||
check "Re-exported bundle has agent_card" '"agent_card"' "$REBUNDLE"
|
||||
|
||||
# Clean up — DELETE /workspaces/:id is admin-gated; pass no per-call auth so
|
||||
# e2e_delete_workspace falls back to the platform admin bearer (a workspace
|
||||
# bearer would be rejected by Tier-2b).
|
||||
e2e_delete_workspace "$NEW_ID" "$ORIG_NAME"
|
||||
# Clean up — use the token just issued to the re-imported workspace
|
||||
e2e_delete_workspace "$NEW_ID" "$ORIG_NAME" -H "Authorization: Bearer $NEW_TOKEN"
|
||||
|
||||
echo ""
|
||||
echo "=== Results: $PASS passed, $FAIL failed ==="
|
||||
|
||||
+45
-72
@@ -1,30 +1,24 @@
|
||||
#!/usr/bin/env bash
|
||||
# E2E regression suite asserting that "dev mode" is fail-CLOSED.
|
||||
# E2E regression suite for the local-dev escape hatches added in
|
||||
# fix/quickstart-bugless. These cover the exact user-facing breakages
|
||||
# that dropped out of the partial squash-merge of PR #1871:
|
||||
#
|
||||
# History: this file used to assert the local-dev fail-open escape hatches
|
||||
# (GET /workspaces 200 with NO bearer, /workspaces/:id/activity 200 with no
|
||||
# bearer) added in fix/quickstart-bugless. Under the CTO "nothing should be
|
||||
# fail-open" directive (harden/no-fail-open-auth) those hatches were REMOVED:
|
||||
# auth is fail-CLOSED in EVERY environment, local dev included. This suite now
|
||||
# pins the inverse contract — bearer-less admin/workspace requests 401, and the
|
||||
# SAME requests with the dev ADMIN_TOKEN bearer succeed.
|
||||
# 1. GET /workspaces returns 200 with no bearer after tokens exist in
|
||||
# the DB — exercises the AdminAuth Tier-1b dev-mode hatch
|
||||
# (middleware/devmode.go::isDevModeFailOpen).
|
||||
# 2. GET /workspaces/:id/activity returns 200 with no bearer — the
|
||||
# same hatch applied to WorkspaceAuth.
|
||||
# 3. POST /workspaces/:id/a2a doesn't 502-SSRF on a loopback workspace
|
||||
# URL — exercises handlers/ssrf.go::devModeAllowsLoopback.
|
||||
# 4. GET /org/templates returns the curated set populated by
|
||||
# clone-manifest.sh — exercises infra/scripts/setup.sh + the
|
||||
# ListTemplates failure logging in handlers/org.go.
|
||||
#
|
||||
# What it verifies:
|
||||
# 1. GET /workspaces 401s with NO bearer once tokens exist (was: 200 via the
|
||||
# removed AdminAuth Tier-1b dev-mode hatch); 200 WITH the admin bearer.
|
||||
# 2. GET /workspaces/:id/activity (and /delegations, /approvals/pending) 401
|
||||
# with no bearer (was: 200 via the WorkspaceAuth hatch); 200 WITH bearer.
|
||||
# 3. GET /org/templates returns the curated set populated by clone-manifest.sh
|
||||
# (unauth-readable bootstrap surface — unchanged).
|
||||
#
|
||||
# Requires: platform running on :8080 with MOLECULE_ENV=development AND
|
||||
# ADMIN_TOKEN set (the dev value), with MOLECULE_ADMIN_TOKEN (or
|
||||
# ADMIN_TOKEN) exported here so the suite can present the bearer.
|
||||
# scripts/dev-start.sh provisions ADMIN_TOKEN locally; the e2e-api CI
|
||||
# job sets it on the platform and exports the matching bearer.
|
||||
# Requires: platform running on :8080 with MOLECULE_ENV=development and
|
||||
# ADMIN_TOKEN unset. Matches the README quickstart env.
|
||||
#
|
||||
# Usage:
|
||||
# MOLECULE_ADMIN_TOKEN=dev-local-admin-token bash tests/e2e/test_dev_mode.sh
|
||||
# bash tests/e2e/test_dev_mode.sh
|
||||
set -euo pipefail
|
||||
|
||||
# shellcheck source=_lib.sh
|
||||
@@ -52,44 +46,35 @@ check_http() {
|
||||
fi
|
||||
}
|
||||
|
||||
echo "=== Dev-mode fail-CLOSED regression tests ==="
|
||||
echo "=== Dev-mode escape-hatch regression tests ==="
|
||||
echo ""
|
||||
|
||||
# The platform is fail-closed in every environment now, so the suite MUST have
|
||||
# the admin bearer to drive the authenticated (200) assertions. Without it we
|
||||
# cannot create / clean up workspaces — bail loudly rather than silently skip.
|
||||
ADMIN_BEARER="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
|
||||
if [ -z "$ADMIN_BEARER" ]; then
|
||||
echo "FAIL: MOLECULE_ADMIN_TOKEN/ADMIN_TOKEN not set — auth is fail-closed in"
|
||||
echo " every environment, so this suite needs the dev ADMIN_TOKEN bearer."
|
||||
echo " e.g. MOLECULE_ADMIN_TOKEN=dev-local-admin-token bash $0"
|
||||
exit 1
|
||||
fi
|
||||
ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_BEARER")
|
||||
# Pre-test: ensure MOLECULE_ENV=development and no ADMIN_TOKEN are in the
|
||||
# platform's env. The request path doesn't let us read the platform's
|
||||
# env directly, but we can verify the hatch is active by confirming the
|
||||
# expected behaviour under the conditions the test otherwise sets up.
|
||||
|
||||
e2e_cleanup_all_workspaces
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Section 1 — AdminAuth is fail-CLOSED (dev-mode hatch removed)
|
||||
# Section 1 — AdminAuth dev-mode hatch
|
||||
# ----------------------------------------------------------------------
|
||||
echo "--- Section 1: AdminAuth fail-closed ---"
|
||||
# Before fix: once any workspace had tokens in the DB, GET /workspaces
|
||||
# closed to unauthenticated callers and the Canvas broke. The hatch
|
||||
# keeps it open specifically in dev mode.
|
||||
|
||||
echo "--- Section 1: AdminAuth dev-mode hatch ---"
|
||||
|
||||
# No bearer → 401 in dev mode (the removed hatch used to return 200).
|
||||
R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/workspaces")
|
||||
check_http "GET /workspaces (no bearer) is fail-CLOSED" "401" "$R"
|
||||
check_http "GET /workspaces (empty DB)" "200" "$R"
|
||||
|
||||
# With the dev admin bearer → 200.
|
||||
R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/workspaces" "${ADMIN_AUTH[@]}")
|
||||
check_http "GET /workspaces (with admin bearer)" "200" "$R"
|
||||
|
||||
# Create a workspace (authenticated) so tokens land in the DB.
|
||||
# Create a workspace so tokens land in the DB.
|
||||
R=$(curl -s -w "\n%{http_code}" -X POST "$BASE/workspaces" \
|
||||
"${ADMIN_AUTH[@]}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"name":"Dev-Mode-Test","tier":1,"runtime":"external","external":true}')
|
||||
CODE=$(echo "$R" | tail -n1)
|
||||
BODY=$(echo "$R" | sed '$d')
|
||||
check_http "POST /workspaces (create, with admin bearer)" "201" "$CODE"
|
||||
check_http "POST /workspaces (create)" "201" "$CODE"
|
||||
|
||||
WS_ID=$(echo "$BODY" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || true)
|
||||
if [ -z "$WS_ID" ]; then
|
||||
@@ -98,55 +83,43 @@ if [ -z "$WS_ID" ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Ensure a real workspace token exists so AdminAuth sees a live token globally.
|
||||
# Ensure a real workspace token exists so AdminAuth now sees a live token. On
|
||||
# pre-fix builds the next /workspaces call would 401 — on post-fix it
|
||||
# must stay 200 because MOLECULE_ENV=development + ADMIN_TOKEN unset.
|
||||
TOKEN=$(echo "$BODY" | e2e_extract_token)
|
||||
if [ -z "$TOKEN" ]; then
|
||||
e2e_mint_workspace_token "$WS_ID" >/dev/null
|
||||
fi
|
||||
|
||||
# With tokens now in the DB, the bearer-less call STILL 401s (no lazy-bootstrap
|
||||
# / dev-mode fall-through), and the authenticated call still 200s.
|
||||
R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/workspaces")
|
||||
check_http "GET /workspaces (after token minted, no bearer) is fail-CLOSED" "401" "$R"
|
||||
|
||||
R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/workspaces" "${ADMIN_AUTH[@]}")
|
||||
check_http "GET /workspaces (after token minted, with admin bearer)" "200" "$R"
|
||||
check_http "GET /workspaces (after token minted, no bearer)" "200" "$R"
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Section 2 — WorkspaceAuth is fail-CLOSED (dev-mode hatch removed)
|
||||
# Section 2 — WorkspaceAuth dev-mode hatch
|
||||
# ----------------------------------------------------------------------
|
||||
# Before fix: /workspaces/:id/activity 401'd once tokens existed —
|
||||
# the Canvas side panel's chat history load broke.
|
||||
|
||||
echo ""
|
||||
echo "--- Section 2: WorkspaceAuth fail-closed ---"
|
||||
echo "--- Section 2: WorkspaceAuth dev-mode hatch ---"
|
||||
|
||||
# No bearer → 401 (the removed hatch used to return 200).
|
||||
R=$(curl -s -o /dev/null -w "%{http_code}" \
|
||||
"$BASE/workspaces/$WS_ID/activity?type=a2a_receive&limit=50")
|
||||
check_http "GET /workspaces/:id/activity (no bearer) is fail-CLOSED" "401" "$R"
|
||||
check_http "GET /workspaces/:id/activity (no bearer)" "200" "$R"
|
||||
|
||||
R=$(curl -s -o /dev/null -w "%{http_code}" \
|
||||
"$BASE/workspaces/$WS_ID/delegations")
|
||||
check_http "GET /workspaces/:id/delegations (no bearer) is fail-CLOSED" "401" "$R"
|
||||
check_http "GET /workspaces/:id/delegations (no bearer)" "200" "$R"
|
||||
|
||||
R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/approvals/pending")
|
||||
check_http "GET /approvals/pending (no bearer) is fail-CLOSED" "401" "$R"
|
||||
|
||||
# Same requests WITH the admin bearer → 200.
|
||||
R=$(curl -s -o /dev/null -w "%{http_code}" \
|
||||
"$BASE/workspaces/$WS_ID/activity?type=a2a_receive&limit=50" "${ADMIN_AUTH[@]}")
|
||||
check_http "GET /workspaces/:id/activity (with admin bearer)" "200" "$R"
|
||||
|
||||
R=$(curl -s -o /dev/null -w "%{http_code}" \
|
||||
"$BASE/workspaces/$WS_ID/delegations" "${ADMIN_AUTH[@]}")
|
||||
check_http "GET /workspaces/:id/delegations (with admin bearer)" "200" "$R"
|
||||
|
||||
R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/approvals/pending" "${ADMIN_AUTH[@]}")
|
||||
check_http "GET /approvals/pending (with admin bearer)" "200" "$R"
|
||||
check_http "GET /approvals/pending (no bearer)" "200" "$R"
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Section 3 — Template registry populated by setup.sh
|
||||
# ----------------------------------------------------------------------
|
||||
# GET /org/templates is an unauthenticated bootstrap surface (the template
|
||||
# palette must render before the user has a credential) — unchanged.
|
||||
# Before fix: setup.sh didn't run clone-manifest.sh so the template
|
||||
# palette was empty and the molecule-dev in-tree copy was broken.
|
||||
|
||||
echo ""
|
||||
echo "--- Section 3: Template registry ---"
|
||||
|
||||
|
||||
@@ -1,332 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -uo pipefail
|
||||
#
|
||||
# test_keyless_feature_contracts_e2e.sh — REQUIRED-lane (E2E API Smoke Test)
|
||||
# keyless HTTP-contract coverage for feature endpoints that ship WITHOUT an
|
||||
# LLM key and had NO e2e assertion before (coverage-audit gap list).
|
||||
#
|
||||
# Why a NEW script (not added to test_api.sh): PR #2286 is concurrently
|
||||
# rewriting test_api.sh's auth helpers + _lib.sh (e2e_admin_auth_args) and the
|
||||
# test_priority_runtimes mock arm. Keeping these assertions in a standalone
|
||||
# file avoids a merge conflict with that in-flight PR and keeps the new feature
|
||||
# coverage independently reviewable. The mock-runtime A2A canned round-trip is
|
||||
# OWNED by #2286's `mock` arm (run_mock) — intentionally NOT duplicated here.
|
||||
#
|
||||
# Every endpoint below is exercised against a runtime=external workspace so NO
|
||||
# LLM key is needed. For each we assert the real HTTP contract: the happy path
|
||||
# AND a meaningful failure mode (401 without auth, 400 on bad input, or the
|
||||
# documented fail-closed status) so the test catches REAL regressions, not
|
||||
# just 200s.
|
||||
#
|
||||
# Auth model (matches workspace-server/internal/middleware/wsauth_middleware.go):
|
||||
# * WorkspaceAuth (/workspaces/:id/*) is STRICT once a token exists — a
|
||||
# bearer-less request 401s (devmode fail-open needs MOLECULE_ENV=development AND
|
||||
# ADMIN_TOKEN unset, neither of which the e2e-api job sets).
|
||||
# * AdminAuth routes accept the platform ADMIN_TOKEN (post-#2286) OR, when no
|
||||
# ADMIN_TOKEN is configured, any valid workspace bearer (Tier-3 fallback) —
|
||||
# so the workspace token we mint authenticates admin routes in BOTH the
|
||||
# pre-#2286 (no ADMIN_TOKEN) and post-#2286 (ADMIN_TOKEN set) CI shapes.
|
||||
#
|
||||
# Local-run shape (mirrors the e2e-api job — real PG+Redis+platform):
|
||||
# DATABASE_URL=... REDIS_URL=... ADMIN_TOKEN=... ./platform-server &
|
||||
# BASE=http://127.0.0.1:$PORT bash tests/e2e/test_keyless_feature_contracts_e2e.sh
|
||||
|
||||
source "$(dirname "$0")/_lib.sh" # sets BASE default
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
pass() { echo "PASS: $1"; PASS=$((PASS + 1)); }
|
||||
fail() { echo "FAIL: $1"; echo " $2"; FAIL=$((FAIL + 1)); }
|
||||
|
||||
# assert_contains DESC EXPECTED_SUBSTRING ACTUAL
|
||||
assert_contains() {
|
||||
if printf '%s' "$3" | grep -qF "$2"; then
|
||||
pass "$1"
|
||||
else
|
||||
fail "$1" "expected to contain [$2] — got: $3"
|
||||
fi
|
||||
}
|
||||
|
||||
# http_code METHOD URL [curl-args...] → prints the HTTP status code only.
|
||||
http_code() {
|
||||
local method="$1" url="$2"; shift 2
|
||||
curl -s -o /dev/null -w "%{http_code}" -X "$method" "$url" "$@"
|
||||
}
|
||||
|
||||
# body_and_code METHOD URL [curl-args...] → prints "<body>\n<code>".
|
||||
body_and_code() {
|
||||
local method="$1" url="$2"; shift 2
|
||||
curl -s -w $'\n%{http_code}' -X "$method" "$url" "$@"
|
||||
}
|
||||
|
||||
echo "=== Keyless feature HTTP-contract E2E (required lane) ==="
|
||||
echo ""
|
||||
|
||||
# Platform admin bearer when the job set one (#2286 shape). When ADMIN_TOKEN is
|
||||
# configured, AdminAuth's Tier-1 fail-open is OFF even before the first token
|
||||
# exists, so admin-gated create / list / delete must carry it from the start.
|
||||
# Pre-#2286 (no ADMIN_TOKEN) this is empty → fail-open create works bare.
|
||||
ENV_ADMIN="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
|
||||
ENV_ADMIN_AUTH=()
|
||||
[ -n "$ENV_ADMIN" ] && ENV_ADMIN_AUTH=(-H "Authorization: Bearer $ENV_ADMIN")
|
||||
|
||||
# Reproducible counts across reruns. e2e_cleanup_all_workspaces hits the
|
||||
# admin-gated list/delete; the platform admin bearer (if set) goes via the
|
||||
# MOLECULE_ADMIN_TOKEN/ADMIN_TOKEN env the helper already reads.
|
||||
e2e_cleanup_all_workspaces
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixture: one external workspace, registered → online. Keyless (external=true
|
||||
# means no container is provisioned and no LLM key is consulted).
|
||||
# ---------------------------------------------------------------------------
|
||||
R=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
|
||||
${ENV_ADMIN_AUTH[@]+"${ENV_ADMIN_AUTH[@]}"} \
|
||||
-d '{"name":"Keyless Fixture","tier":1,"runtime":"external","external":true}')
|
||||
WS_ID=$(printf '%s' "$R" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")
|
||||
if [ -z "$WS_ID" ]; then
|
||||
echo "FATAL: could not create fixture workspace — got: $R" >&2
|
||||
exit 2
|
||||
fi
|
||||
assert_contains "POST /workspaces (external fixture created)" '"status":"awaiting_agent"' "$R"
|
||||
|
||||
# Workspace token: the create response may already carry one; else mint via the admin endpoint.
|
||||
WS_TOKEN=$(printf '%s' "$R" | e2e_extract_token)
|
||||
if [ -z "$WS_TOKEN" ]; then
|
||||
WS_TOKEN=$(e2e_mint_workspace_token "$WS_ID" 2>/dev/null || echo "")
|
||||
fi
|
||||
if [ -z "$WS_TOKEN" ]; then
|
||||
echo "FATAL: could not obtain workspace token for $WS_ID" >&2
|
||||
exit 2
|
||||
fi
|
||||
AUTH=(-H "Authorization: Bearer $WS_TOKEN")
|
||||
|
||||
# Admin bearer: explicit platform ADMIN_TOKEN if the job set one (#2286 shape),
|
||||
# else the workspace token (AdminAuth Tier-3 accepts it pre-#2286).
|
||||
ADMIN_BEARER="${ENV_ADMIN:-$WS_TOKEN}"
|
||||
ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_BEARER")
|
||||
|
||||
# Bring the fixture online so lifecycle (hibernate) has a hibernatable state.
|
||||
curl -s -X POST "$BASE/registry/register" -H "Content-Type: application/json" "${AUTH[@]}" \
|
||||
-d "{\"id\":\"$WS_ID\",\"url\":\"https://example.com/keyless\",\"agent_card\":{\"name\":\"Keyless Fixture\",\"skills\":[{\"id\":\"noop\",\"name\":\"Noop\"}]}}" >/dev/null
|
||||
|
||||
# ===========================================================================
|
||||
# 1. Terminal diagnose — GET /workspaces/:id/terminal/diagnose (wsAuth)
|
||||
# External workspace has no instance_id → diagnoseLocal path → 200 with a
|
||||
# deterministic report (ok=false, first_failure on docker/container). The
|
||||
# /terminal endpoint itself is a WebSocket upgrade (not HTTP-assertable
|
||||
# keyless); diagnose is its pure-HTTP sibling and the real contract surface.
|
||||
# ===========================================================================
|
||||
echo "--- /terminal/diagnose ---"
|
||||
BC=$(body_and_code GET "$BASE/workspaces/$WS_ID/terminal/diagnose" "${AUTH[@]}")
|
||||
DIAG_CODE=$(printf '%s' "$BC" | tail -n1)
|
||||
DIAG_BODY=$(printf '%s' "$BC" | sed '$d')
|
||||
assert_contains "GET /terminal/diagnose (200 report)" "200" "$DIAG_CODE"
|
||||
assert_contains "GET /terminal/diagnose (carries workspace_id)" "\"workspace_id\":\"$WS_ID\"" "$DIAG_BODY"
|
||||
assert_contains "GET /terminal/diagnose (has steps[])" '"steps"' "$DIAG_BODY"
|
||||
# Failure mode: no bearer → 401 (WorkspaceAuth strict once a token exists).
|
||||
assert_contains "GET /terminal/diagnose (no auth → 401)" "401" \
|
||||
"$(http_code GET "$BASE/workspaces/$WS_ID/terminal/diagnose")"
|
||||
|
||||
# ===========================================================================
|
||||
# 2. Webhooks (public) — POST /webhooks/:type
|
||||
# Public, no auth. telegram adapter: empty update body → (nil,nil) → 200
|
||||
# ignored; non-JSON → parse error → 400; unknown type → 404.
|
||||
# ===========================================================================
|
||||
echo "--- /webhooks/:type ---"
|
||||
BC=$(body_and_code POST "$BASE/webhooks/telegram" -H "Content-Type: application/json" -d '{}')
|
||||
WH_CODE=$(printf '%s' "$BC" | tail -n1)
|
||||
WH_BODY=$(printf '%s' "$BC" | sed '$d')
|
||||
assert_contains "POST /webhooks/telegram (non-message update → 200)" "200" "$WH_CODE"
|
||||
assert_contains "POST /webhooks/telegram (status ignored)" '"status":"ignored"' "$WH_BODY"
|
||||
assert_contains "POST /webhooks/telegram (bad JSON → 400)" "400" \
|
||||
"$(http_code POST "$BASE/webhooks/telegram" -H 'Content-Type: application/json' -d 'not-json')"
|
||||
assert_contains "POST /webhooks/<unknown> (→ 404)" "404" \
|
||||
"$(http_code POST "$BASE/webhooks/nope-not-a-channel" -H 'Content-Type: application/json' -d '{}')"
|
||||
|
||||
# ===========================================================================
|
||||
# 3. Budget — GET /workspaces/:id/budget (wsAuth) + PATCH (admin)
|
||||
# GET: fresh workspace → multi-period view, no limits, zero spend.
|
||||
# PATCH: set monthly limit (admin) → reflected; bad input → 400.
|
||||
# ===========================================================================
|
||||
echo "--- /budget ---"
|
||||
BUD=$(curl -s "$BASE/workspaces/$WS_ID/budget" "${AUTH[@]}")
|
||||
assert_contains "GET /budget (has periods map)" '"periods"' "$BUD"
|
||||
assert_contains "GET /budget (monthly_spend 0 on fresh ws)" '"monthly_spend":0' "$BUD"
|
||||
# PATCH is admin-gated (router.go:419). Set a monthly limit and verify echo.
|
||||
PB=$(curl -s -X PATCH "$BASE/workspaces/$WS_ID/budget" -H "Content-Type: application/json" "${ADMIN_AUTH[@]}" \
|
||||
-d '{"budget_limits":{"monthly":2000}}')
|
||||
assert_contains "PATCH /budget (monthly limit set → echoed)" '"budget_limit":2000' "$PB"
|
||||
# Re-read confirms persistence.
|
||||
assert_contains "GET /budget (limit persisted)" '"budget_limit":2000' \
|
||||
"$(curl -s "$BASE/workspaces/$WS_ID/budget" "${AUTH[@]}")"
|
||||
# Failure: empty body → 400 "budget_limits or budget_limit field is required".
|
||||
assert_contains "PATCH /budget (empty body → 400)" "400" \
|
||||
"$(http_code PATCH "$BASE/workspaces/$WS_ID/budget" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{}')"
|
||||
# Failure: unknown period → 400.
|
||||
assert_contains "PATCH /budget (unknown period → 400)" "400" \
|
||||
"$(http_code PATCH "$BASE/workspaces/$WS_ID/budget" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{"budget_limits":{"yearly":1}}')"
|
||||
# Failure: GET without bearer → 401.
|
||||
assert_contains "GET /budget (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/budget")"
|
||||
|
||||
# ===========================================================================
|
||||
# 4. Checkpoints — POST/GET/DELETE /workspaces/:id/checkpoints* (wsAuth)
|
||||
# Fully self-contained CRUD over workflow_checkpoints (#788). Upsert → latest
|
||||
# → list-by-wfid → delete → 404. Failure modes: missing workflow_id → 400,
|
||||
# empty latest → 404.
|
||||
# ===========================================================================
|
||||
echo "--- /checkpoints ---"
|
||||
WFID="kl-wf-$$"
|
||||
CP=$(curl -s -X POST "$BASE/workspaces/$WS_ID/checkpoints" -H "Content-Type: application/json" "${AUTH[@]}" \
|
||||
-d "{\"workflow_id\":\"$WFID\",\"step_name\":\"step-a\",\"step_index\":1,\"payload\":{\"k\":\"v\"}}")
|
||||
assert_contains "POST /checkpoints (upsert → id + workflow_id)" "\"workflow_id\":\"$WFID\"" "$CP"
|
||||
assert_contains "GET /checkpoints/latest (200 newest)" "\"workflow_id\":\"$WFID\"" \
|
||||
"$(curl -s "$BASE/workspaces/$WS_ID/checkpoints/latest" "${AUTH[@]}")"
|
||||
assert_contains "GET /checkpoints/:wfid (lists the step)" '"step_name":"step-a"' \
|
||||
"$(curl -s "$BASE/workspaces/$WS_ID/checkpoints/$WFID" "${AUTH[@]}")"
|
||||
DEL=$(curl -s -X DELETE "$BASE/workspaces/$WS_ID/checkpoints/$WFID" "${AUTH[@]}")
|
||||
assert_contains "DELETE /checkpoints/:wfid (deleted count)" '"deleted":1' "$DEL"
|
||||
assert_contains "GET /checkpoints/:wfid (after delete → 404)" "404" \
|
||||
"$(http_code GET "$BASE/workspaces/$WS_ID/checkpoints/$WFID" "${AUTH[@]}")"
|
||||
# Failure: missing workflow_id → 400 (binding:required).
|
||||
assert_contains "POST /checkpoints (missing workflow_id → 400)" "400" \
|
||||
"$(http_code POST "$BASE/workspaces/$WS_ID/checkpoints" -H 'Content-Type: application/json' "${AUTH[@]}" -d '{"step_name":"x"}')"
|
||||
# Failure: no bearer → 401.
|
||||
assert_contains "POST /checkpoints (no auth → 401)" "401" \
|
||||
"$(http_code POST "$BASE/workspaces/$WS_ID/checkpoints" -H 'Content-Type: application/json' -d '{"workflow_id":"x","step_name":"y"}')"
|
||||
|
||||
# ===========================================================================
|
||||
# 5. Audit — GET /workspaces/:id/audit (wsAuth)
|
||||
# EU AI Act ledger query (#594). Fresh ws → empty events, total 0,
|
||||
# chain_valid null (AUDIT_LEDGER_SALT unset). Failure: bad RFC3339 from → 400.
|
||||
# ===========================================================================
|
||||
echo "--- /audit ---"
|
||||
AUD=$(curl -s "$BASE/workspaces/$WS_ID/audit" "${AUTH[@]}")
|
||||
assert_contains "GET /audit (total 0 on fresh ws)" '"total":0' "$AUD"
|
||||
assert_contains "GET /audit (chain_valid null without salt)" '"chain_valid":null' "$AUD"
|
||||
assert_contains "GET /audit (bad 'from' → 400)" "400" \
|
||||
"$(http_code GET "$BASE/workspaces/$WS_ID/audit?from=not-a-date" "${AUTH[@]}")"
|
||||
assert_contains "GET /audit (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/audit")"
|
||||
|
||||
# ===========================================================================
|
||||
# 6. Traces — GET /workspaces/:id/traces (wsAuth)
|
||||
# Langfuse proxy (#590). No LANGFUSE_* configured → 200 [] (graceful empty),
|
||||
# never a 5xx. Failure: no auth → 401.
|
||||
# ===========================================================================
|
||||
echo "--- /traces ---"
|
||||
BC=$(body_and_code GET "$BASE/workspaces/$WS_ID/traces" "${AUTH[@]}")
|
||||
TR_CODE=$(printf '%s' "$BC" | tail -n1)
|
||||
TR_BODY=$(printf '%s' "$BC" | sed '$d')
|
||||
assert_contains "GET /traces (200 without Langfuse)" "200" "$TR_CODE"
|
||||
assert_contains "GET /traces (empty list)" '[]' "$TR_BODY"
|
||||
assert_contains "GET /traces (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/traces")"
|
||||
|
||||
# ===========================================================================
|
||||
# 7. Session search — GET /workspaces/:id/session-search (wsAuth)
|
||||
# Searches activity_logs. Seed one activity row, then assert q-filter finds
|
||||
# it and a non-matching q returns []. Failure: no auth → 401.
|
||||
# ===========================================================================
|
||||
echo "--- /session-search ---"
|
||||
curl -s -X POST "$BASE/workspaces/$WS_ID/activity" -H "Content-Type: application/json" "${AUTH[@]}" \
|
||||
-d '{"activity_type":"agent_log","method":"inference","summary":"keyless-needle marker"}' >/dev/null
|
||||
assert_contains "GET /session-search?q=keyless-needle (finds row)" 'keyless-needle' \
|
||||
"$(curl -s "$BASE/workspaces/$WS_ID/session-search?q=keyless-needle" "${AUTH[@]}")"
|
||||
assert_contains "GET /session-search?q=<no-match> (empty)" '[]' \
|
||||
"$(curl -s "$BASE/workspaces/$WS_ID/session-search?q=zzz-no-such-token-zzz" "${AUTH[@]}")"
|
||||
assert_contains "GET /session-search (no auth → 401)" "401" \
|
||||
"$(http_code GET "$BASE/workspaces/$WS_ID/session-search?q=x")"
|
||||
|
||||
# ===========================================================================
|
||||
# 8. Rescue — GET /workspaces/:id/rescue (wsAuth)
|
||||
# RFC internal#742. Fail-CLOSED contract: the e2e-api job has no
|
||||
# MOLECULE_ORG_ID, so the handler returns 503 platform_misconfigured rather
|
||||
# than leaking cross-org. That fail-closed behaviour IS the keyless contract
|
||||
# we gate here (a regression that drops the org guard would flip this to a
|
||||
# 200/404 and turn this assertion RED). Failure mode: no auth → 401.
|
||||
# ===========================================================================
|
||||
echo "--- /rescue ---"
|
||||
BC=$(body_and_code GET "$BASE/workspaces/$WS_ID/rescue" "${AUTH[@]}")
|
||||
RES_CODE=$(printf '%s' "$BC" | tail -n1)
|
||||
RES_BODY=$(printf '%s' "$BC" | sed '$d')
|
||||
if [ "$RES_CODE" = "404" ]; then
|
||||
# MOLECULE_ORG_ID was set in this environment → no-bundle path.
|
||||
assert_contains "GET /rescue (no bundle → 404, org configured)" 'no rescue bundle' "$RES_BODY"
|
||||
else
|
||||
# No MOLECULE_ORG_ID (the e2e-api default) → fail-closed 503.
|
||||
assert_contains "GET /rescue (fail-closed 503 without MOLECULE_ORG_ID)" "503" "$RES_CODE"
|
||||
assert_contains "GET /rescue (platform_misconfigured code)" 'platform_misconfigured' "$RES_BODY"
|
||||
fi
|
||||
assert_contains "GET /rescue (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/rescue")"
|
||||
|
||||
# ===========================================================================
|
||||
# 9. LLM billing-mode admin toggle — GET/PUT /admin/workspaces/:id/llm-billing-mode
|
||||
# (AdminAuth). Flip to byok → read back override; bad UUID → 400; missing
|
||||
# 'mode' key → 400; unknown mode → 400.
|
||||
# ===========================================================================
|
||||
echo "--- /admin/workspaces/:id/llm-billing-mode ---"
|
||||
assert_contains "GET llm-billing-mode (resolves a mode)" '"resolved_mode"' \
|
||||
"$(curl -s "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" "${ADMIN_AUTH[@]}")"
|
||||
PUTBM=$(curl -s -X PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H "Content-Type: application/json" "${ADMIN_AUTH[@]}" \
|
||||
-d '{"mode":"byok"}')
|
||||
assert_contains "PUT llm-billing-mode byok (override set)" '"workspace_override":"byok"' "$PUTBM"
|
||||
assert_contains "GET llm-billing-mode (byok persisted)" '"workspace_override":"byok"' \
|
||||
"$(curl -s "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" "${ADMIN_AUTH[@]}")"
|
||||
# Clear the override (null) so we don't leave fixture state skewed.
|
||||
curl -s -X PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H "Content-Type: application/json" "${ADMIN_AUTH[@]}" \
|
||||
-d '{"mode":null}' >/dev/null
|
||||
# Failure: malformed UUID → 400.
|
||||
assert_contains "PUT llm-billing-mode (bad UUID → 400)" "400" \
|
||||
"$(http_code PUT "$BASE/admin/workspaces/not-a-uuid/llm-billing-mode" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{"mode":"byok"}')"
|
||||
# Failure: missing 'mode' key → 400.
|
||||
assert_contains "PUT llm-billing-mode (missing mode → 400)" "400" \
|
||||
"$(http_code PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{}')"
|
||||
# Failure: unknown mode string → 400.
|
||||
assert_contains "PUT llm-billing-mode (unknown mode → 400)" "400" \
|
||||
"$(http_code PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{"mode":"bogus-mode"}')"
|
||||
|
||||
# ===========================================================================
|
||||
# 10. Lifecycle — Pause → Resume + Hibernate (wsAuth)
|
||||
# Pause works backend-agnostically (StopWorkspaceAuto no-ops on no backend)
|
||||
# → status=paused. Resume re-provisions: 200 provisioning when a provisioner
|
||||
# is wired (the e2e-api host has Docker), or 503 provisioner-not-available
|
||||
# otherwise — both are valid contracts, so accept either. Failure modes:
|
||||
# resume a non-paused ws → 404; hibernate a non-online ws → 404.
|
||||
# ===========================================================================
|
||||
echo "--- lifecycle (resume / hibernate) ---"
|
||||
# Pause the (online) fixture → status paused.
|
||||
PA=$(curl -s -X POST "$BASE/workspaces/$WS_ID/pause" "${AUTH[@]}")
|
||||
assert_contains "POST /pause (online → paused)" '"status":"paused"' "$PA"
|
||||
# Resume the paused fixture — accept 200 provisioning OR 503 (no provisioner).
|
||||
BC=$(body_and_code POST "$BASE/workspaces/$WS_ID/resume" "${AUTH[@]}")
|
||||
RSM_CODE=$(printf '%s' "$BC" | tail -n1)
|
||||
RSM_BODY=$(printf '%s' "$BC" | sed '$d')
|
||||
if [ "$RSM_CODE" = "200" ]; then
|
||||
assert_contains "POST /resume (paused → provisioning)" '"status":"provisioning"' "$RSM_BODY"
|
||||
elif [ "$RSM_CODE" = "503" ]; then
|
||||
assert_contains "POST /resume (no provisioner → 503 contract)" 'provisioner not available' "$RSM_BODY"
|
||||
else
|
||||
fail "POST /resume (expected 200 or 503)" "got HTTP $RSM_CODE — $RSM_BODY"
|
||||
fi
|
||||
# Failure: resume a workspace that is NOT paused → 404.
|
||||
# (After the resume above it is provisioning/online, not paused.)
|
||||
assert_contains "POST /resume (not-paused → 404)" "404" \
|
||||
"$(http_code POST "$BASE/workspaces/$WS_ID/resume" "${AUTH[@]}")"
|
||||
# Hibernate: bring the fixture back online first, then hibernate it.
|
||||
curl -s -X POST "$BASE/registry/register" -H "Content-Type: application/json" "${AUTH[@]}" \
|
||||
-d "{\"id\":\"$WS_ID\",\"url\":\"https://example.com/keyless\",\"agent_card\":{\"name\":\"Keyless Fixture\",\"skills\":[{\"id\":\"noop\",\"name\":\"Noop\"}]}}" >/dev/null
|
||||
HB=$(curl -s -X POST "$BASE/workspaces/$WS_ID/hibernate" "${AUTH[@]}")
|
||||
assert_contains "POST /hibernate (online → hibernated)" '"status":"hibernated"' "$HB"
|
||||
# Failure: hibernate again (now hibernated, not online/degraded) → 404.
|
||||
assert_contains "POST /hibernate (not-hibernatable → 404)" "404" \
|
||||
"$(http_code POST "$BASE/workspaces/$WS_ID/hibernate" "${AUTH[@]}")"
|
||||
# Failure: no bearer → 401.
|
||||
assert_contains "POST /resume (no auth → 401)" "401" "$(http_code POST "$BASE/workspaces/$WS_ID/resume")"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cleanup — delete the fixture (admin-gated DELETE; ADMIN_AUTH is the platform admin bearer when set, else the workspace token).
|
||||
# ---------------------------------------------------------------------------
|
||||
e2e_delete_workspace "$WS_ID" "Keyless Fixture" "${ADMIN_AUTH[@]}"
|
||||
|
||||
echo ""
|
||||
echo "=== Results: $PASS passed, $FAIL failed ==="
|
||||
[ "$FAIL" -eq 0 ]
|
||||
@@ -49,13 +49,13 @@ run_test "codex → slash-form fallback" codex
|
||||
run_test "claude-code → OAuth/default alias" claude-code "sonnet"
|
||||
|
||||
got=$(unset E2E_MODEL_SLUG E2E_ANTHROPIC_API_KEY; E2E_MINIMAX_API_KEY="mx-test" pick_model_slug claude-code)
|
||||
assert_eq "claude-code + MiniMax key → MiniMax model" "$got" "minimax:MiniMax-M2.7"
|
||||
assert_eq "claude-code + MiniMax key → MiniMax model" "$got" "MiniMax-M2"
|
||||
|
||||
got=$(unset E2E_MODEL_SLUG E2E_MINIMAX_API_KEY; E2E_ANTHROPIC_API_KEY="sk-ant-test" pick_model_slug claude-code)
|
||||
assert_eq "claude-code + Anthropic API key → Anthropic API model" "$got" "claude-sonnet-4-6"
|
||||
|
||||
got=$(unset E2E_MODEL_SLUG; E2E_MINIMAX_API_KEY="mx-priority" E2E_ANTHROPIC_API_KEY="sk-ant-loser" pick_model_slug claude-code)
|
||||
assert_eq "claude-code + both keys → MiniMax priority" "$got" "minimax:MiniMax-M2.7"
|
||||
assert_eq "claude-code + both keys → MiniMax priority" "$got" "MiniMax-M2"
|
||||
|
||||
# ── Fallback for unknown runtime ──
|
||||
# Picks slash-form (hermes-shaped) since hermes is the historical
|
||||
@@ -65,28 +65,6 @@ assert_eq "claude-code + both keys → MiniMax priority" "$got" "mini
|
||||
run_test "unknown runtime → slash-form fallback" gemini "openai/gpt-4o"
|
||||
run_test "empty runtime → slash-form fallback" "" "openai/gpt-4o"
|
||||
|
||||
# ── Platform-managed path (E2E_LLM_PATH=platform) ──
|
||||
# The moonshot/kimi NOT_CONFIGURED regression path (RFC#340 Fix A #2187).
|
||||
# Selects the slash-namespaced platform model (default moonshot/kimi-k2.6),
|
||||
# takes precedence over the per-key BYOK branches, and is itself overridden by
|
||||
# E2E_MODEL_SLUG. These pins guard the harness's ability to drive the platform
|
||||
# arm — the one the prod bug shipped on.
|
||||
echo
|
||||
echo "Test: pick_model_slug — platform-managed path (E2E_LLM_PATH=platform)"
|
||||
echo
|
||||
|
||||
got=$(unset E2E_MODEL_SLUG E2E_DEFAULT_PLATFORM_MODEL; E2E_LLM_PATH=platform pick_model_slug claude-code)
|
||||
assert_eq "claude-code + platform path → headline kimi model" "$got" "moonshot/kimi-k2.6"
|
||||
|
||||
got=$(unset E2E_MODEL_SLUG E2E_DEFAULT_PLATFORM_MODEL; E2E_LLM_PATH=platform E2E_MINIMAX_API_KEY="mx-stray" pick_model_slug claude-code)
|
||||
assert_eq "platform path beats a stray BYOK key (no mask)" "$got" "moonshot/kimi-k2.6"
|
||||
|
||||
got=$(unset E2E_MODEL_SLUG; E2E_LLM_PATH=platform E2E_DEFAULT_PLATFORM_MODEL="minimax/MiniMax-M3" pick_model_slug claude-code)
|
||||
assert_eq "platform path honours E2E_DEFAULT_PLATFORM_MODEL" "$got" "minimax/MiniMax-M3"
|
||||
|
||||
got=$(unset E2E_DEFAULT_PLATFORM_MODEL; E2E_MODEL_SLUG="anthropic/claude-opus-4-7" E2E_LLM_PATH=platform pick_model_slug claude-code)
|
||||
assert_eq "E2E_MODEL_SLUG still wins over platform path" "$got" "anthropic/claude-opus-4-7"
|
||||
|
||||
# ── Override via E2E_MODEL_SLUG ──
|
||||
# When the operator sets E2E_MODEL_SLUG, the per-runtime dispatch is
|
||||
# bypassed. Used during workflow_dispatch to A/B specific slugs.
|
||||
|
||||
@@ -28,13 +28,6 @@ PASS=0
|
||||
FAIL=0
|
||||
WSID=""
|
||||
|
||||
# GET /workspaces (list) and POST /workspaces (create) are AdminAuth-gated
|
||||
# (router.go:165-166). The e2e-api CI job sets ADMIN_TOKEN on the platform
|
||||
# (fail-open OFF) and exports MOLECULE_ADMIN_TOKEN here, so these calls need the
|
||||
# admin bearer. Guarded if-set so a fail-open dev platform still works.
|
||||
ADMIN_AUTH=()
|
||||
e2e_admin_auth_args ADMIN_AUTH
|
||||
|
||||
cleanup() {
|
||||
# Workspace teardown — best-effort, ignore errors so an unrelated CP
|
||||
# outage doesn't shadow a real test failure.
|
||||
@@ -87,7 +80,7 @@ echo "=== Setup ==="
|
||||
# canvas. Find and delete any with this exact name so the test is safe to
|
||||
# re-run from any state. Match by name (not tag) so this also catches
|
||||
# leftovers created by older script versions.
|
||||
PRIOR=$(curl -s "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} | python3 -c '
|
||||
PRIOR=$(curl -s "$BASE/workspaces" | python3 -c '
|
||||
import json, sys
|
||||
try:
|
||||
print(" ".join(w["id"] for w in json.load(sys.stdin) if w.get("name") == "Notify E2E"))
|
||||
@@ -103,7 +96,7 @@ done
|
||||
# feedback_workspace_model_required_no_platform_default_dynamic_credential_intake).
|
||||
# Body has no runtime → defaults to claude-code; pass the matching model
|
||||
# that the workspace-creation contract now requires.
|
||||
R=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
|
||||
R=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
|
||||
-d '{"name":"Notify E2E","tier":1,"runtime":"external","external":true,"model":"sonnet"}')
|
||||
WSID=$(echo "$R" | python3 -c 'import json,sys;print(json.load(sys.stdin)["id"])' 2>/dev/null || true)
|
||||
[ -n "$WSID" ] || { echo "Failed to create workspace: $R"; exit 1; }
|
||||
|
||||
@@ -234,30 +234,9 @@ elif [ -n "${E2E_OPENAI_API_KEY:-}" ]; then
|
||||
SECRETS_JSON=$(python3 -c "import json,os;k=os.environ['E2E_OPENAI_API_KEY'];print(json.dumps({'OPENAI_API_KEY':k,'OPENAI_BASE_URL':'https://api.openai.com/v1','MODEL_PROVIDER':'openai:gpt-4o','HERMES_INFERENCE_PROVIDER':'custom','HERMES_CUSTOM_BASE_URL':'https://api.openai.com/v1','HERMES_CUSTOM_API_KEY':k,'HERMES_CUSTOM_API_MODE':'chat_completions'}))")
|
||||
fi
|
||||
|
||||
# Workspace-create now enforces the MODEL_REQUIRED contract: there is NO
|
||||
# platform-side default model for a runtime (feedback_workspace_model_required_
|
||||
# no_platform_default). Every create MUST carry an explicit `model`, or the CP
|
||||
# rejects it with MODEL_REQUIRED before this gate's peer-visibility assertion
|
||||
# can run. We pick a PLATFORM-MANAGED id (Molecule owns billing — no tenant key
|
||||
# needed; this gate only needs the workspace to boot + list peers, not heavy
|
||||
# LLM work), validated against the controlplane providers SSOT
|
||||
# (internal/providers/providers.yaml runtimes.<rt>.providers[platform].models):
|
||||
# claude-code → anthropic/claude-sonnet-4-6 (platform claude model)
|
||||
# hermes/openclaw → moonshot/kimi-k2.6 (their only platform family)
|
||||
# E2E_MODEL_SLUG overrides for operator-dispatched runs.
|
||||
pv_platform_model_for_runtime() {
  # Print (no trailing newline) the model id to send on workspace-create
  # for the runtime named in $1.
  #
  #   $1              runtime name (e.g. claude-code, hermes, openclaw)
  #   E2E_MODEL_SLUG  when non-empty, printed verbatim for every runtime
  #
  # Without an override, claude-code maps to the anthropic sonnet id and
  # every other runtime (hermes and openclaw included) maps to the
  # moonshot kimi id.
  local override="${E2E_MODEL_SLUG:-}"
  if [ -n "$override" ]; then
    printf '%s' "$override"
    return 0
  fi

  if [ "$1" = "claude-code" ]; then
    printf 'anthropic/claude-sonnet-4-6'
  else
    printf 'moonshot/kimi-k2.6'
  fi
}
|
||||
|
||||
log "4/6 provisioning parent (claude-code) + one sibling per runtime under test..."
|
||||
PARENT_MODEL=$(pv_platform_model_for_runtime claude-code)
|
||||
P_RESP=$(tenant_call POST /workspaces \
|
||||
-d "{\"name\":\"pv-parent\",\"runtime\":\"claude-code\",\"model\":\"$PARENT_MODEL\",\"tier\":3,\"secrets\":$SECRETS_JSON}")
|
||||
-d "{\"name\":\"pv-parent\",\"runtime\":\"claude-code\",\"tier\":3,\"secrets\":$SECRETS_JSON}")
|
||||
PARENT_ID=$(echo "$P_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
|
||||
[ -n "$PARENT_ID" ] || fail "parent create failed: $(echo "$P_RESP" | head -c 300)"
|
||||
log " PARENT_ID=$PARENT_ID"
|
||||
@@ -266,9 +245,8 @@ log " PARENT_ID=$PARENT_ID"
|
||||
declare -A WS_IDS WS_TOKENS
|
||||
ALL_WS_IDS="$PARENT_ID"
|
||||
for rt in $PV_RUNTIMES; do
|
||||
RT_MODEL=$(pv_platform_model_for_runtime "$rt")
|
||||
R=$(tenant_call POST /workspaces \
|
||||
-d "{\"name\":\"pv-$rt\",\"runtime\":\"$rt\",\"model\":\"$RT_MODEL\",\"tier\":2,\"parent_id\":\"$PARENT_ID\",\"secrets\":$SECRETS_JSON}")
|
||||
-d "{\"name\":\"pv-$rt\",\"runtime\":\"$rt\",\"tier\":2,\"parent_id\":\"$PARENT_ID\",\"secrets\":$SECRETS_JSON}")
|
||||
WID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
|
||||
WTOK=$(echo "$R" | extract_auth_token)
|
||||
[ -n "$WID" ] || fail "$rt workspace create failed: $(printf '%s' "$R" | head -c 300)"
|
||||
|
||||
@@ -300,14 +300,7 @@ rows = json.load(sys.stdin)
|
||||
def text_of(r):
|
||||
body = r.get('request_body') or {}
|
||||
parts = (body.get('params') or {}).get('message', {}).get('parts') or []
|
||||
# A2A v0.3 keys the Part discriminator on 'kind'; legacy senders used
|
||||
# 'type'. ProxyA2A.normalizeA2APayload (#2251) rewrites 'type' -> 'kind'
|
||||
# on ingest, so the stored request_body carries 'kind' even when the
|
||||
# caller posted 'type'. Accept EITHER so this parser asserts on the text
|
||||
# payload, not on which discriminator field the server happened to store.
|
||||
def is_text(p):
|
||||
return p.get('kind') == 'text' or p.get('type') == 'text'
|
||||
return ''.join(p.get('text', '') for p in parts if is_text(p))
|
||||
return ''.join(p.get('text','') for p in parts if p.get('type')=='text')
|
||||
if len(rows) < 2:
|
||||
print('NEED2_GOT_'+str(len(rows)))
|
||||
else:
|
||||
@@ -316,29 +309,6 @@ else:
|
||||
check_eq "since_id feed orders ASC (oldest-new first, newest-new last)" \
|
||||
"hello-from-e2e-2|hello-from-e2e-3" "$ASC_FIRST"
|
||||
|
||||
# Wire-contract gate (#2251): the caller posted parts with the LEGACY "type"
|
||||
# discriminator, but ProxyA2A.normalizeA2APayload rewrites "type" -> "kind"
|
||||
# (A2A v0.3) BEFORE the row is durably logged. Assert the stored request_body
|
||||
# carries "kind" and no longer carries "type", so a regression that drops the
|
||||
# rename — or a feed that stops storing the normalized body — fails loudly here
|
||||
# instead of silently feeding the polling agent an untagged Part. This is the
|
||||
# end-to-end half of the Go unit tests in a2a_proxy_test.go (which assert the
|
||||
# rename in isolation); this proves it survives the durable activity_logs path.
|
||||
DISC=$(echo "$ASC_RESP" | python3 -c "
|
||||
import json, sys
|
||||
rows = json.load(sys.stdin)
|
||||
kinds, types = [], []
|
||||
for r in rows:
|
||||
body = r.get('request_body') or {}
|
||||
parts = (body.get('params') or {}).get('message', {}).get('parts') or []
|
||||
for p in parts:
|
||||
if 'kind' in p: kinds.append(p['kind'])
|
||||
if 'type' in p: types.append(p['type'])
|
||||
print(('kind' if kinds and not types else 'BAD') + ':' + ','.join(kinds) + '/' + ','.join(types))
|
||||
")
|
||||
check_eq "stored Part uses v0.3 'kind' discriminator, never legacy 'type' (#2251)" \
|
||||
"kind:text,text/" "$DISC"
|
||||
|
||||
# ---------- Phase 6: stale cursor returns 410 ----------
|
||||
echo ""
|
||||
echo "--- Phase 6: Stale / unknown cursor returns 410 ---"
|
||||
|
||||
@@ -24,73 +24,11 @@
|
||||
# Each phase skips cleanly when its prerequisite secret is absent so a
|
||||
# partially-keyed env (e.g. CI without an OpenAI key) doesn't false-fail.
|
||||
#
|
||||
# REQUIRE-LIVE (false-green guard, mirrors CP serving-e2e's
|
||||
# SERVING_E2E_REQUIRE_LIVE semantics)
|
||||
# ------------------------------------------------------------------
|
||||
# Without a guard, an env with NO live secrets makes every phase SKIP,
|
||||
# leaving PASS=0 FAIL=0 — and the historical `[ "$FAIL" -eq 0 ]` gate
|
||||
# exits 0 (GREEN) while validating ZERO runtimes. That made the REQUIRED
|
||||
# `E2E API Smoke Test` merge gate pass without exercising a single
|
||||
# runtime (false-green).
|
||||
#
|
||||
# Fix: a real "validated arm" counter (VALIDATED) tracks runtimes that
|
||||
# actually ran AND produced a non-error A2A reply. With E2E_REQUIRE_LIVE=1:
|
||||
# if zero arms validated, the run exits NON-zero with a loud message.
|
||||
# Without it (E2E_REQUIRE_LIVE unset/0), a fully-skipped run stays a LOUD
|
||||
# skip + exit 0 for dev convenience.
|
||||
#
|
||||
# This zero-validated→RED decision is the load-bearing logic. It is factored
|
||||
# into evaluate_require_live_gate() (a pure function of $FAIL/$VALIDATED/
|
||||
# $E2E_REQUIRE_LIVE, defined before any platform I/O) and is REGRESSION-GATED
|
||||
# on every PR by tests/e2e/test_require_live_priority_gate_unit.sh, which
|
||||
# sources this file (E2E_PRIORITY_UNIT_SOURCE=1), sets the counters, and
|
||||
# asserts the gate's exit code — no platform, no provisioning, no network.
|
||||
# So the false-green can't silently come back: a revert of the guard fails CI.
|
||||
#
|
||||
# CI POSTURE (REQUIRE-LIVE ON — see .gitea/workflows/e2e-api.yml):
|
||||
# The live e2e-api job SETS E2E_REQUIRE_LIVE=1. The `mock` arm is the
|
||||
# CI-provisionable live-completion arm: it org-imports a mock workspace
|
||||
# (→online→canned A2A reply) with NO external secret. The only thing that
|
||||
# previously blocked it in CI was admin auth — POST /org/import and POST
|
||||
# /admin/workspaces/:id/tokens are AdminAuth-gated, and the job set no admin
|
||||
# token, so every admin call 401'd ("admin auth required"). The job now sets
|
||||
# ADMIN_TOKEN on the platform AND exports the matching MOLECULE_ADMIN_TOKEN
|
||||
# the scripts send, so mock validates end-to-end and VALIDATED>=1 holds on a
|
||||
# healthy platform — the REQUIRED `E2E API Smoke Test` gate now HONESTLY
|
||||
# validates a runtime. If the mock plumbing or the admin-auth wiring breaks,
|
||||
# the gate goes RED (not false-green). The zero-validated→RED decision is also
|
||||
# regression-gated WITHOUT provisioning by the bash unit test above, so a
|
||||
# revert of that logic still fails CI.
|
||||
#
|
||||
# LIVE ARMS (run when their prerequisite is present; opportunistic):
|
||||
# - `mock` (run_mock) is the no-key REQUIRE-LIVE backbone: a virtual
|
||||
# workspace (no container, no EC2, no provider) whose org-import path
|
||||
# short-circuits to status='online' with a canned A2A reply. It validates
|
||||
# in CI now that the e2e-api job wires an admin token (org-import + token
|
||||
# mint are AdminAuth-gated), so it is the guaranteed >=1 validation.
|
||||
# - MiniMax (E2E_MINIMAX_API_KEY, from MOLECULE_STAGING_MINIMAX_API_KEY) is
|
||||
# an OPPORTUNISTIC best-effort real-LLM arm: registry-fragile in CI (422
|
||||
# UNREGISTERED_MODEL_FOR_RUNTIME — see run_minimax header), so a miss is
|
||||
# a best-effort MISS via bestfail() and does NOT red the gate.
|
||||
# The CI e2e-api job sets E2E_REQUIRE_LIVE=1: mock guarantees a validation, so
|
||||
# the REQUIRED gate is honest (RED if the mock plumbing/admin-auth breaks). The
|
||||
# zero-validated→RED logic is also regression-gated by the bash unit test above.
|
||||
#
|
||||
# Usage:
|
||||
# # Enforce REQUIRE-LIVE locally (need >=1 arm to actually validate):
|
||||
# E2E_REQUIRE_LIVE=1 E2E_MINIMAX_API_KEY=... \
|
||||
# tests/e2e/test_priority_runtimes_e2e.sh
|
||||
#
|
||||
# # Default (no enforcement): all-skip stays a LOUD skip + exit 0:
|
||||
# tests/e2e/test_priority_runtimes_e2e.sh
|
||||
#
|
||||
# # Other live arms (if their secrets are configured):
|
||||
# CLAUDE_CODE_OAUTH_TOKEN=... E2E_OPENAI_API_KEY=... \
|
||||
# tests/e2e/test_priority_runtimes_e2e.sh
|
||||
#
|
||||
# # Run only one runtime
|
||||
# E2E_RUNTIMES=mock tests/e2e/test_priority_runtimes_e2e.sh
|
||||
# E2E_RUNTIMES=minimax tests/e2e/test_priority_runtimes_e2e.sh
|
||||
# E2E_RUNTIMES=claude-code tests/e2e/test_priority_runtimes_e2e.sh
|
||||
# E2E_RUNTIMES=hermes tests/e2e/test_priority_runtimes_e2e.sh
|
||||
#
|
||||
@@ -103,81 +41,13 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
source "$(dirname "$0")/_lib.sh"
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
SKIP=0
|
||||
# VALIDATED counts runtimes that ACTUALLY ran end-to-end (provisioned,
|
||||
# reached online, AND returned a non-error A2A reply). Distinct from PASS,
|
||||
# which also counts sub-assertions like activity-log rows. This is the
|
||||
# signal the REQUIRE-LIVE gate keys off: VALIDATED==0 means we proved
|
||||
# nothing about any runtime, regardless of how many sub-asserts "passed".
|
||||
VALIDATED=0
|
||||
CREATED_WSIDS=()
|
||||
|
||||
# evaluate_require_live_gate — the SINGLE source of the final exit decision.
|
||||
# Pure function of $FAIL, $VALIDATED, and $E2E_REQUIRE_LIVE; performs NO I/O
|
||||
# beyond the loud messages. Returns the exit code the script should exit with:
|
||||
# - FAIL>0 → 1 (a real failure is always red)
|
||||
# - VALIDATED==0 + REQUIRE_LIVE → 1 (false-green trap: proved nothing → RED)
|
||||
# - VALIDATED==0 + !REQUIRE_LIVE → 0 (dev-convenience LOUD skip)
|
||||
# - VALIDATED>=1 → 0 (at least one arm validated end-to-end)
|
||||
# It is a function (not inline tail code) so test_require_live_priority_gate_unit.sh
|
||||
# can drive the REAL decision in isolation — set the counters, call this, assert
|
||||
# the return code — with no platform, no provisioning, no network. That makes the
|
||||
# zero-validated→RED logic a CI-gated regression contract: a future revert of it
|
||||
# fails the unit test on every PR. See that unit test for the fail-direction proof.
|
||||
evaluate_require_live_gate() {
  # Decide the script's final exit status from the global counters.
  #
  # Reads:   $FAIL, $VALIDATED, $E2E_REQUIRE_LIVE (unset is treated as 0).
  # Returns: 1 when FAIL is non-zero, or when VALIDATED is 0 and
  #          E2E_REQUIRE_LIVE is exactly "1" or "true"; 0 otherwise.
  # Output:  the zero-validated diagnostics go to stderr; the success
  #          summary goes to stdout.

  # A recorded failure is red regardless of anything else.
  [ "$FAIL" -ne 0 ] && return 1

  # At least one runtime validated end-to-end: green.
  if ! [ "$VALIDATED" -eq 0 ]; then
    echo "OK: $VALIDATED runtime(s) validated end-to-end."
    return 0
  fi

  # Nothing validated. In enforced mode that must be red so an all-skip
  # run cannot pass the required gate.
  case "${E2E_REQUIRE_LIVE:-0}" in
    1|true)
      printf '%s\n' \
        "::error::E2E_REQUIRE_LIVE is set but ZERO runtimes were validated end-to-end." \
        " Every runtime SKIPPED — no live secret was present, so this gate" \
        " validated nothing. Wire at least one live arm via Gitea secrets" \
        " (E2E_MINIMAX_API_KEY ← MOLECULE_STAGING_MINIMAX_API_KEY is the" \
        " default CI arm; CLAUDE_CODE_OAUTH_TOKEN / E2E_OPENAI_API_KEY also" \
        " work) so >=1 runtime actually provisions + replies. Failing RED" \
        " instead of false-green." >&2
      return 1
      ;;
  esac

  # Not enforced: loud skip, but still exit 0 for local convenience.
  printf '%s\n' \
    "SKIPPED: no live secrets present and E2E_REQUIRE_LIVE is not set — validated" \
    " zero runtimes. This is a dev-convenience pass; CI sets" \
    " E2E_REQUIRE_LIVE=1 to make zero-validated a hard failure." >&2
  return 0
}
|
||||
|
||||
# Source-guard: when sourced by the unit test (E2E_PRIORITY_UNIT_SOURCE=1) we
|
||||
# stop HERE — the counters + evaluate_require_live_gate are now defined, and we
|
||||
# must NOT fall through to _lib.sh's platform-dependent helpers or the live
|
||||
# pre-sweep curl below (there is no platform in the unit-test environment).
|
||||
if [ "${E2E_PRIORITY_UNIT_SOURCE:-0}" = "1" ]; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
source "$(dirname "$0")/_lib.sh"
|
||||
|
||||
# GET /workspaces (list, router.go:165) and POST /workspaces (create,
|
||||
# router.go:166) are AdminAuth-gated. The e2e-api CI job sets ADMIN_TOKEN on the
|
||||
# platform (fail-open OFF) and exports MOLECULE_ADMIN_TOKEN here, so the
|
||||
# pre-sweep list and every runtime-create must send the admin bearer or they
|
||||
# 401. run_mock uses POST /org/import (also admin-gated) and wires its own admin
|
||||
# auth inline. Guarded if-set so a fail-open dev platform still works.
|
||||
ADMIN_AUTH=()
|
||||
e2e_admin_auth_args ADMIN_AUTH
|
||||
|
||||
cleanup() {
|
||||
# `set -u` + empty array would error on "${CREATED_WSIDS[@]}"; the
|
||||
# ${VAR[@]+"…"} form expands to nothing when the array is unset/empty
|
||||
@@ -188,26 +58,14 @@ cleanup() {
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
pass() { echo " PASS — $1"; PASS=$((PASS + 1)); }
|
||||
fail() { echo " FAIL — $1"; echo " $2"; FAIL=$((FAIL + 1)); }
|
||||
skip() { echo " SKIP — $1"; SKIP=$((SKIP + 1)); }
|
||||
# Mark a runtime as having been validated end-to-end (online + non-error
|
||||
# A2A reply). Also emits a PASS line so it shows in the results tally.
|
||||
validated() {
  # Report $1 as a PASS line and bump both the PASS and VALIDATED counters.
  local label="$1"
  printf '%s\n' " PASS — $label"
  PASS=$((PASS + 1))
  VALIDATED=$((VALIDATED + 1))
}
|
||||
# bestfail() is for OPPORTUNISTIC (best-effort) arms whose failure must
|
||||
# NOT red the gate. It does NOT increment FAIL — it only logs + bumps
|
||||
# SKIP so the tally stays honest ("we tried, it didn't validate, but it
|
||||
# was never load-bearing"). Used by the MiniMax arm: MiniMax-create is
|
||||
# fragile in CI (registry-skewed model id, BYOK plumbing — see core#2263
|
||||
# and the run_minimax header), so a MiniMax miss is reported but never
|
||||
# fails the REQUIRED gate. The mock arm is the load-bearing validation
|
||||
# that keeps the gate honest; MiniMax is the real-LLM bonus on top.
|
||||
bestfail() {
  # Log a best-effort miss: $1 is the headline, $2 the detail line.
  # Only SKIP is incremented; FAIL is left untouched.
  printf '%s\n' " BEST-EFFORT MISS — $1" " $2"
  SKIP=$((SKIP + 1))
}
|
||||
pass() { echo " PASS — $1"; PASS=$((PASS + 1)); }
|
||||
fail() { echo " FAIL — $1"; echo " $2"; FAIL=$((FAIL + 1)); }
|
||||
skip() { echo " SKIP — $1"; SKIP=$((SKIP + 1)); }
|
||||
|
||||
# Pre-sweep any prior runs that left workspaces behind (same defence as
|
||||
# test_notify_attachments_e2e.sh: trap fires on normal exit, but a
|
||||
# SIGPIPE / kill -9 can bypass it).
|
||||
PRIOR=$(curl -s "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} | python3 -c '
|
||||
PRIOR=$(curl -s "$BASE/workspaces" | python3 -c '
|
||||
import json, sys
|
||||
try:
|
||||
print(" ".join(w["id"] for w in json.load(sys.stdin) if w.get("name","").startswith("Priority E2E ")))
|
||||
@@ -330,7 +188,7 @@ print(json.dumps({'CLAUDE_CODE_OAUTH_TOKEN': os.environ['CLAUDE_CODE_OAUTH_TOKEN
|
||||
")
|
||||
local resp wsid
|
||||
# model is required (CTO 2026-05-22 SSOT) — pass the value that the now-deleted DefaultModel("claude-code") used to supply.
|
||||
resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
|
||||
resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
|
||||
-d "{\"name\":\"Priority E2E (claude-code)\",\"runtime\":\"claude-code\",\"model\":\"sonnet\",\"tier\":1,\"secrets\":$secrets}")
|
||||
wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
|
||||
if [ -z "$wsid" ]; then
|
||||
@@ -362,9 +220,9 @@ print(json.dumps({'CLAUDE_CODE_OAUTH_TOKEN': os.environ['CLAUDE_CODE_OAUTH_TOKEN
|
||||
local reply
|
||||
if reply=$(send_test_prompt "$wsid" "$token"); then
|
||||
if echo "$reply" | grep -q "PONG"; then
|
||||
validated "claude-code reply contains PONG"
|
||||
pass "claude-code reply contains PONG"
|
||||
else
|
||||
validated "claude-code reply non-empty (first 80 chars: ${reply:0:80})"
|
||||
pass "claude-code reply non-empty (first 80 chars: ${reply:0:80})"
|
||||
fi
|
||||
assert_activity_logged "claude-code" "$wsid" "$token"
|
||||
else
|
||||
@@ -396,7 +254,7 @@ print(json.dumps({
|
||||
}))
|
||||
")
|
||||
local resp wsid
|
||||
resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
|
||||
resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
|
||||
-d "{\"name\":\"Priority E2E (hermes)\",\"runtime\":\"hermes\",\"tier\":1,\"model\":\"openai/gpt-4o\",\"secrets\":$secrets}")
|
||||
wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
|
||||
if [ -z "$wsid" ]; then
|
||||
@@ -430,9 +288,9 @@ print(json.dumps({
|
||||
local reply
|
||||
if reply=$(send_test_prompt "$wsid" "$token"); then
|
||||
if echo "$reply" | grep -q "PONG"; then
|
||||
validated "hermes reply contains PONG"
|
||||
pass "hermes reply contains PONG"
|
||||
else
|
||||
validated "hermes reply non-empty (first 80 chars: ${reply:0:80})"
|
||||
pass "hermes reply non-empty (first 80 chars: ${reply:0:80})"
|
||||
fi
|
||||
assert_activity_logged "hermes" "$wsid" "$token"
|
||||
else
|
||||
@@ -469,7 +327,7 @@ print(json.dumps({
|
||||
}))
|
||||
")
|
||||
local resp wsid
|
||||
resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
|
||||
resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
|
||||
-d "{\"name\":\"Priority E2E ($runtime)\",\"runtime\":\"$runtime\",\"tier\":1,\"model\":\"openai/gpt-4o-mini\",\"secrets\":$secrets}")
|
||||
wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
|
||||
if [ -z "$wsid" ]; then
|
||||
@@ -500,9 +358,9 @@ print(json.dumps({
|
||||
local reply
|
||||
if reply=$(send_test_prompt "$wsid" "$token"); then
|
||||
if echo "$reply" | grep -q "PONG"; then
|
||||
validated "$runtime reply contains PONG"
|
||||
pass "$runtime reply contains PONG"
|
||||
else
|
||||
validated "$runtime reply non-empty (first 80 chars: ${reply:0:80})"
|
||||
pass "$runtime reply non-empty (first 80 chars: ${reply:0:80})"
|
||||
fi
|
||||
assert_activity_logged "$runtime" "$wsid" "$token"
|
||||
else
|
||||
@@ -513,253 +371,18 @@ print(json.dumps({
|
||||
run_codex() {
  # codex arm: delegate to the shared OpenAI-backed runtime driver.
  run_openai_runtime "codex" "codex"
}
|
||||
run_openclaw() {
  # openclaw arm: delegate to the shared OpenAI-backed runtime driver.
  run_openai_runtime "openclaw" "openclaw"
}
|
||||
|
||||
####################################################################
|
||||
# Mock arm — the GUARANTEED, always-available REQUIRE-LIVE backbone.
|
||||
####################################################################
|
||||
# The mock runtime (workspace-server/internal/handlers/mock_runtime.go)
|
||||
# is a virtual workspace: NO container, NO EC2, NO LLM key. The org-import
|
||||
# path (createWorkspaceTree, org_import.go) short-circuits a runtime=mock
|
||||
# workspace straight to status='online' (no provisioner needed), and the
|
||||
# A2A proxy (a2a_proxy.go → handleMockA2A) synthesises a deterministic
|
||||
# canned JSON-RPC reply with logActivity=true (writes the activity_logs
|
||||
# row too). That makes mock the perfect REQUIRE-LIVE backbone: it
|
||||
# exercises the SAME plumbing every real runtime needs to pass —
|
||||
# provision-decision → status=online → A2A round-trip → activity_logs —
|
||||
# without depending on any external provider key or LLM availability. It
|
||||
# is GREEN on a healthy platform and RED only if that plumbing genuinely
|
||||
# breaks (DB insert, status flip, A2A proxy, activity logging). No more
|
||||
# false-green (zero-validated is impossible when mock works), and no more
|
||||
# can't-go-green (mock needs no secret, so it always runs in CI).
|
||||
#
|
||||
# Why org-import (POST /org/import) instead of POST /workspaces:
|
||||
# The mock→online short-circuit lives ONLY in createWorkspaceTree
|
||||
# (org_import.go). The single-workspace Create handler (workspace.go)
|
||||
# has no mock branch — it routes runtime=mock through
|
||||
# provisionWorkspaceAuto, which in CI's local-build mode has no mock
|
||||
# image and would never reach online. Org-import is the supported path
|
||||
# to a live mock workspace, so the arm drives it.
|
||||
#
|
||||
# The canned reply is one of the "On it!" variants (NOT "PONG"), so this
|
||||
# arm validates on the non-empty / non-error branch — that is the real
|
||||
# contract for mock (it proves the plumbing, not an LLM's instruction-
|
||||
# following).
|
||||
run_mock() {
  # Drive the mock runtime end-to-end and record the outcome in the
  # global counters. Steps, in order:
  #   1. POST $BASE/org/import with an inline one-workspace mock org
  #      (admin bearer attached when MOLECULE_ADMIN_TOKEN / ADMIN_TOKEN is set).
  #   2. Extract the created workspace id from the JSON response.
  #   3. Wait (60 budget passed to wait_for_status) for status "online".
  #   4. Mint a workspace token via e2e_mint_workspace_token.
  #   5. Send the test prompt; on success call validated() and
  #      assert_activity_logged, otherwise fail().
  # Takes no arguments. Every early exit records the problem with fail()
  # and then returns 0, leaving the final verdict to the FAIL counter.
  # Appends the workspace id to CREATED_WSIDS once it is known.
  echo ""
  echo "=== mock (no-key plumbing backbone) happy path ==="
  # No secret gate — mock ALWAYS runs. That is the whole point: it is the
  # required-validation arm that keeps E2E_REQUIRE_LIVE honest without a key.

  # Inline single-workspace mock org. model is a required field on the
  # org-import contract (createWorkspaceTree fails-closed without one);
  # mock never USES the model, so any non-empty value satisfies the
  # contract. The org-import path does not run the Create handler's
  # registry model-validation, so "mock" is accepted as-is.
  # POST /org/import is AdminAuth-gated (router.go:778). When the platform has
  # ADMIN_TOKEN set (as the e2e-api CI job now does), an unauthenticated import
  # 401s with {"error":"admin auth required"}. Send the same admin bearer the
  # mint helper uses (MOLECULE_ADMIN_TOKEN, ADMIN_TOKEN fallback) — guarded so a
  # bootstrap/dev platform with no admin token (fail-open) still works.
  local admin_bearer="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
  local admin_auth=()
  [ -n "$admin_bearer" ] && admin_auth=(-H "Authorization: Bearer $admin_bearer")
  local import_resp wsid
  # ${arr[@]+"${arr[@]}"} expands to nothing when the array is empty, so no
  # stray empty argument reaches curl when there is no bearer.
  import_resp=$(curl -s -X POST "$BASE/org/import" -H "Content-Type: application/json" \
    ${admin_auth[@]+"${admin_auth[@]}"} \
    -d '{
      "template": {
        "name": "Priority E2E Mock Org",
        "defaults": {"runtime": "mock", "model": "mock", "tier": 1},
        "workspaces": [
          {"name": "Priority E2E (mock)", "runtime": "mock", "model": "mock", "tier": 1}
        ]
      }
    }')
  # org-import returns {"org":..., "count":N, "workspaces":[{"id":...,
  # "name":...,"tier":...}, ...]} (handlers/org.go:898-901). Pull the id of
  # the single workspace we declared. (Older "results" key fallback kept for
  # forward/back compat in case the response shape is ever versioned.)
  # An unparseable body exits the parser quietly with no output, which
  # leaves wsid empty and is handled by the check below.
  wsid=$(echo "$import_resp" | python3 -c '
import json, sys
try:
    d = json.load(sys.stdin)
except Exception:
    sys.exit(0)
for r in (d.get("workspaces") or d.get("results") or []):
    if r.get("name") == "Priority E2E (mock)" and r.get("id"):
        print(r["id"]); break
') || true
  if [ -z "$wsid" ]; then
    # mock org-import is the REQUIRE-LIVE backbone and is EXPECTED to succeed in
    # CI now that the e2e-api job wires an admin token (ADMIN_TOKEN on the
    # platform + MOLECULE_ADMIN_TOKEN sent above). A missing id here is a REAL
    # break (admin-auth wiring, org-import create, or the mock short-circuit) and
    # MUST red the gate — so this is a hard fail(), not a best-effort miss. Under
    # E2E_REQUIRE_LIVE=1 a FAIL also forces a non-zero exit via
    # evaluate_require_live_gate. Surface the response so the break is visible
    # (e.g. {"error":"admin auth required"} would mean the token wiring regressed).
    fail "create mock workspace (org-import)" "$import_resp"
    return 0
  fi
  # Register the id so the EXIT-trap cleanup can delete the workspace.
  CREATED_WSIDS+=("$wsid")
  echo " workspace=$wsid"

  # Mock goes straight to online (no container boot) — a short budget is
  # plenty; if it is NOT online quickly the mock short-circuit in
  # createWorkspaceTree is genuinely broken and the gate SHOULD red.
  local final
  final=$(wait_for_status "$wsid" "online failed" 60) || true
  if [ "$final" != "online" ]; then
    fail "mock workspace reaches online" "final status: $final (mock should go online without provisioning)"
    return 0
  fi
  pass "mock workspace reaches online"

  # Mock workspaces are not created with an inline token; mint one via the
  # admin endpoint (same fallback every other arm uses).
  local token
  token=$(e2e_mint_workspace_token "$wsid") || true
  if [ -z "$token" ]; then
    fail "resolve mock workspace token" "no token returned from POST /admin/workspaces/:id/tokens"
    return 0
  fi

  # A2A round-trip. The mock proxy returns a canned non-error reply (one
  # of the "On it!" variants) — NOT "PONG" — so we validate on the
  # non-empty branch. A non-error, non-empty reply means the A2A proxy
  # short-circuit + reply-shape contract are intact end-to-end.
  local reply
  if reply=$(send_test_prompt "$wsid" "$token"); then
    validated "mock reply non-empty (canned; first 80 chars: ${reply:0:80})"
    assert_activity_logged "mock" "$wsid" "$token"
  else
    fail "mock reply" "${reply:-<empty or error>} (mock A2A short-circuit should always return a canned reply)"
  fi
}
|
||||
|
||||
####################################################################
|
||||
# MiniMax live arm — OPPORTUNISTIC (best-effort) real-LLM arm.
|
||||
####################################################################
|
||||
# NOTE: this is now a BEST-EFFORT arm, not the REQUIRE-LIVE backbone.
|
||||
# mock (run_mock above) is the guaranteed, no-key validation that keeps
|
||||
# the gate honest. MiniMax-create is fragile in CI: the namespaced model
|
||||
# id minimax:MiniMax-M2.7 is NOT in claude-code's native model set and
|
||||
# does NOT resolve via DeriveProvider (its only prefix-owner, byok-minimax,
|
||||
# is not wired as a claude-code runtime arm), so the create is rejected
|
||||
# 422 UNREGISTERED_MODEL_FOR_RUNTIME before any provisioning (RCA core
|
||||
# registry_gen.go Runtimes["claude-code"]). Rather than red the REQUIRED
|
||||
# gate on that registry-skew (or on any transient MiniMax provisioning /
|
||||
# model-registration issue), this arm reports a best-effort MISS via
|
||||
# bestfail() and lets mock carry the validation. If MiniMax DOES come up
|
||||
# it validates as a bonus real-LLM check.
|
||||
# Drives the claude-code runtime against MiniMax (BYOK) using the
|
||||
# already-present Gitea secret MOLECULE_STAGING_MINIMAX_API_KEY,
|
||||
# surfaced into the env as E2E_MINIMAX_API_KEY (same name + secret the
|
||||
# staging-smoke / continuous-synth canaries use — see staging-smoke.yml
|
||||
# and continuous-synth-e2e.yml). NO new credential is introduced.
|
||||
#
|
||||
# How this arm is wired. (Historical: it used to be the arm that kept the
# REQUIRED gate honest; per the NOTE above, mock now carries that role.)
|
||||
# - claude-code's `minimax` provider (providers.yaml / registry_gen.go)
|
||||
# is third_party_anthropic_compat: it reads MINIMAX_API_KEY at boot
|
||||
# and routes ANTHROPIC_BASE_URL → api.minimax.io/anthropic. So the
|
||||
# ONLY tenant secret needed is {"MINIMAX_API_KEY": <key>} — exactly
|
||||
# the SECRETS_JSON branch test_staging_full_saas.sh uses.
|
||||
# - Model id is the NAMESPACED colon-form `minimax:MiniMax-M2.7`, the
|
||||
# registered BYOK arm for claude-code (registry_gen.go Runtimes
|
||||
# ["claude-code"]["minimax"]). Per core#2263 the BARE `MiniMax-M2`
|
||||
# id can 400 on a registry-skewed ws-server build; the namespaced
|
||||
# form resolves the way kimi's `moonshot/…` does, so it's the
|
||||
# robust choice for the gate.
|
||||
# Best-effort live arm: drive the claude-code runtime against MiniMax (BYOK).
#
# Reads E2E_MINIMAX_API_KEY from the environment; when it is absent the arm is
# recorded via skip() and nothing is created. Every failure on the way
# (create, reaching online, token resolution, prompt) is reported through
# bestfail() rather than a hard failure; a successful prompt is recorded via
# validated() and followed by assert_activity_logged().
# Side effect: a created workspace id is appended to CREATED_WSIDS.
run_minimax() {
  echo ""
  echo "=== minimax (claude-code BYOK) happy path ==="

  # Without the MiniMax key there is nothing to exercise.
  if [[ -z ${E2E_MINIMAX_API_KEY:-} ]]; then
    skip "E2E_MINIMAX_API_KEY not set (MiniMax live arm needs the MiniMax key)"
    return 0
  fi

  # Serialise the single tenant secret as JSON (python handles the escaping).
  local secrets
  secrets=$(python3 -c "
import json, os
# claude-code's minimax provider (third_party_anthropic_compat) reads
# MINIMAX_API_KEY and points ANTHROPIC_BASE_URL at api.minimax.io/anthropic
# at boot — so the ONLY tenant secret needed is the MiniMax key itself.
print(json.dumps({'MINIMAX_API_KEY': os.environ['E2E_MINIMAX_API_KEY']}))
")

  # Create the workspace with the namespaced model id (core#2263): the bare
  # MiniMax-M2 id can 400 on a registry-skewed ws-server build.
  local resp wsid
  resp=$(
    curl -s -X POST "$BASE/workspaces" \
      ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} \
      -H "Content-Type: application/json" \
      -d "{\"name\":\"Priority E2E (minimax)\",\"runtime\":\"claude-code\",\"model\":\"minimax:MiniMax-M2.7\",\"tier\":1,\"secrets\":$secrets}"
  )
  wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true

  # No id back: report the raw create response as a best-effort miss so the
  # reason stays visible, and leave the gate to the other arms.
  if [[ -z $wsid ]]; then
    bestfail "create minimax workspace (best-effort; mock carries the gate)" "$resp"
    return 0
  fi

  CREATED_WSIDS+=("$wsid")
  echo " workspace=$wsid"

  # Wait up to 240s for a terminal status; a slow first model call is
  # covered separately by send_test_prompt's own time limit.
  local final
  final=$(wait_for_status "$wsid" "online failed" 240) || true
  if [[ $final != "online" ]]; then
    bestfail "minimax workspace reaches online (best-effort)" "final status: $final"
    return 0
  fi
  pass "minimax workspace reaches online"

  # Prefer the token embedded in the create response; mint one otherwise.
  local token
  token=$(echo "$resp" | e2e_extract_token)
  if [[ -z $token ]]; then
    token=$(e2e_mint_workspace_token "$wsid")
  fi
  if [[ -z $token ]]; then
    bestfail "resolve minimax workspace token (best-effort)" "no token returned"
    return 0
  fi

  # Send the test prompt. A failed prompt is a best-effort miss; the bare
  # `return` hands back bestfail's own status, as falling off the end would.
  local reply
  if ! reply=$(send_test_prompt "$wsid" "$token"); then
    bestfail "minimax reply (best-effort)" "${reply:-<empty or error>}"
    return
  fi

  if echo "$reply" | grep -q "PONG"; then
    validated "minimax reply contains PONG"
  else
    validated "minimax reply non-empty (first 80 chars: ${reply:0:80})"
  fi
  assert_activity_logged "minimax" "$wsid" "$token"
}
|
||||
|
||||
# `mock` runs FIRST and by default: it is the no-key REQUIRE-LIVE backbone
|
||||
# that guarantees >=1 validation on a healthy platform (see run_mock). The
|
||||
# real-LLM arms (claude-code/codex/hermes/openclaw/minimax) run if their
|
||||
# secrets are present and add real-provider coverage on top; minimax is
|
||||
# best-effort (never reds the gate).
|
||||
WANT="${E2E_RUNTIMES:-mock claude-code codex hermes openclaw minimax}"
|
||||
WANT="${E2E_RUNTIMES:-claude-code codex hermes openclaw}"
|
||||
for r in $WANT; do
|
||||
case "$r" in
|
||||
mock) run_mock ;;
|
||||
claude-code) run_claude_code ;;
|
||||
codex) run_codex ;;
|
||||
hermes) run_hermes ;;
|
||||
openclaw) run_openclaw ;;
|
||||
minimax) run_minimax ;;
|
||||
all) run_mock; run_claude_code; run_codex; run_hermes; run_openclaw; run_minimax ;;
|
||||
all) run_claude_code; run_codex; run_hermes; run_openclaw ;;
|
||||
*) echo "unknown runtime in E2E_RUNTIMES: $r" >&2; exit 2 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped, $VALIDATED runtime(s) validated end-to-end ==="
|
||||
|
||||
# Final exit decision lives in evaluate_require_live_gate (defined at the top of
|
||||
# this file, before any platform I/O) so the same logic is unit-tested in
|
||||
# isolation by test_require_live_priority_gate_unit.sh. Mirror its return code
|
||||
# into the process exit code.
|
||||
evaluate_require_live_gate
|
||||
exit $?
|
||||
echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped ==="
|
||||
[ "$FAIL" -eq 0 ]
|
||||
|
||||
@@ -1,535 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# Live staging E2E — the CP instance-state reconciler heals a terminated EC2.
|
||||
#
|
||||
# Real-infra complement to the deterministic unit tests for core#2261
|
||||
# (workspace-server/internal/registry/cp_instance_reconciler.go). Those unit
|
||||
# tests pin the reconcile logic against fakes; THIS script proves the loop
|
||||
# actually runs in a real tenant's workspace-server and drives the EXISTING
|
||||
# offline + auto-heal machinery against real AWS.
|
||||
#
|
||||
# Root regression (core#2247): a SaaS workspace whose EC2 is terminated out
|
||||
# from under the platform (manual AWS action, spot reclaim, CP reap) fell
|
||||
# through every existing liveness pass and kept reading status='online'
|
||||
# forever, pointing at a dead instance. The reconciler closes that gap with
|
||||
# CPProvisioner.IsRunning and feeds a clean "not running" into onOffline →
|
||||
# RestartByID (existing-volume reprovision).
|
||||
#
|
||||
# What this test does:
|
||||
# 1. Provision a fresh staging org + ONE workspace (same default
|
||||
# runtime/model as the full-saas harness, so it actually boots).
|
||||
# 2. Poll the tenant API until the workspace is status=online; capture its
|
||||
# instance_id.
|
||||
# 3. KILL it — terminate that exact EC2 via `aws ec2 terminate-instances`.
|
||||
# 4. Assert the reconciler heals it:
|
||||
# PRIMARY (gate) — within ~180s the workspace status LEAVES
|
||||
# 'online' (the reconciler detected the dead
|
||||
# instance via IsRunning and flipped it). This
|
||||
# is the core regression guard: a dead instance
|
||||
# must NOT keep reading 'online'.
|
||||
# SECONDARY (best-effort) — within ~10 min it auto-reprovisions:
|
||||
# status returns to 'online' with a NEW
|
||||
# instance_id (onOffline → RestartByID
|
||||
# existing-volume heal). If reprovision doesn't
|
||||
# finish in the bound we log it clearly but let
|
||||
# the PRIMARY assertion stand as the gate (see
|
||||
# the comment at the secondary block — a future
|
||||
# tightening that promotes this to a hard gate is
|
||||
# deliberately one edit away).
|
||||
# 5. Teardown ALWAYS (EXIT trap): delete the tenant + leak-sweep so no EC2
|
||||
# is orphaned, even on a mid-test failure.
|
||||
#
|
||||
# Auth model + provisioning conventions are copied verbatim from
|
||||
# test_staging_full_saas.sh (single MOLECULE_ADMIN_TOKEN → CP admin; per-
|
||||
# tenant admin token + X-Molecule-Org-Id header for tenant API). The kill
|
||||
# primitive + leak sweep reuse lib/aws_leak_check.sh.
|
||||
#
|
||||
# Required env:
|
||||
# MOLECULE_CP_URL default: https://staging-api.moleculesai.app
|
||||
# MOLECULE_ADMIN_TOKEN CP admin bearer — Railway staging CP_ADMIN_API_TOKEN
|
||||
#
|
||||
# Optional env (mirrors the full-saas harness where they overlap):
|
||||
# E2E_RUNTIME claude-code (default)
|
||||
# E2E_PROVISION_TIMEOUT_SECS default 900 (cold EC2 budget)
|
||||
# E2E_WORKSPACE_ONLINE_TIMEOUT_SECS default 900 (15min). A workspace that
|
||||
# cannot reach online in 15min is a staging/boot problem,
|
||||
# not slow cold-boot — fail fast so the trap tears down the
|
||||
# EC2 instead of hanging ~1h and leaking a running instance
|
||||
# (observed: run 216031 hung 32min with a live e2e-rec EC2).
|
||||
# E2E_RECONCILE_OFFLINE_TIMEOUT_SECS default 180 (PRIMARY: leave 'online'.
|
||||
# Reconciler cadence is 60s — 3 cycles +
|
||||
# AWS terminate-visibility slack.)
|
||||
# E2E_REPROVISION_TIMEOUT_SECS default 600 (SECONDARY: back to online
|
||||
# with a NEW instance_id)
|
||||
# E2E_MINIMAX_API_KEY / E2E_ANTHROPIC_API_KEY / E2E_OPENAI_API_KEY
|
||||
# LLM key (same priority chain as
|
||||
# full-saas; needed so the FIRST boot
|
||||
# reaches online). Empty → '{}' (the
|
||||
# workspace still boots online; the LLM
|
||||
# key only matters for a completion,
|
||||
# which this test never makes).
|
||||
# E2E_KEEP_ORG 1 → skip teardown (debugging only)
|
||||
# E2E_RUN_ID Slug suffix; CI: ${GITHUB_RUN_ID}
|
||||
# E2E_AWS_LEAK_CHECK auto (default) | required | off
|
||||
# E2E_AWS_TERMINATE_LEAKS 1 → terminate slug-tagged leaked EC2 at
|
||||
# teardown
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 happy path (PRIMARY assertion held; SECONDARY logged either way)
|
||||
# 1 generic failure (incl. PRIMARY assertion failed = regression)
|
||||
# 2 missing required env
|
||||
# 3 provisioning timed out
|
||||
# 4 teardown left orphan resources
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
|
||||
ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
|
||||
RUNTIME="${E2E_RUNTIME:-claude-code}"
|
||||
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
|
||||
WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-900}"
|
||||
# PRIMARY bound: the reconciler ticks every 60s; it needs one cycle to see
|
||||
# the dead instance after AWS makes the terminate visible to DescribeInstances
|
||||
# (typically seconds, but can lag). 180s = ~3 cycles + slack.
|
||||
RECONCILE_OFFLINE_TIMEOUT_SECS="${E2E_RECONCILE_OFFLINE_TIMEOUT_SECS:-180}"
|
||||
# SECONDARY bound: full existing-volume reprovision (new EC2 boot + agent
|
||||
# bootstrap) is a multi-minute cold path.
|
||||
REPROVISION_TIMEOUT_SECS="${E2E_REPROVISION_TIMEOUT_SECS:-600}"
|
||||
RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
|
||||
|
||||
# Slug MUST start with e2e- so sweep-stale-e2e-orgs.yml reaps any orphan this
|
||||
# run leaks (lint_cleanup_traps.sh enforces the e2e-/rt-e2e- prefix for any
|
||||
# staging tenant E2E; we honour it here too even though our filename isn't
|
||||
# *staging*).
|
||||
SLUG="e2e-rec-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
|
||||
SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32)
|
||||
|
||||
# Write a progress line to stdout, prefixed with the current HH:MM:SS.
log() {
  printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*"
}
|
||||
# Write a timestamped ❌ line to stderr and terminate the script with status 1.
fail() {
  printf '[%s] ❌ %s\n' "$(date +%H:%M:%S)" "$*" >&2
  exit 1
}
|
||||
# Write a timestamped ✅ success line to stdout.
ok() {
  printf '[%s] ✅ %s\n' "$(date +%H:%M:%S)" "$*"
}
|
||||
|
||||
# Per-runtime model slug dispatch — shared with the full-saas harness.
|
||||
# shellcheck disable=SC1091
|
||||
# shellcheck source=lib/model_slug.sh
|
||||
source "$(dirname "$0")/lib/model_slug.sh"
|
||||
# AWS kill primitive + leak sweep (e2e_aws_region / e2e_ec2_instances_for_slug /
|
||||
# e2e_terminate_instances / e2e_verify_no_ec2_leaks_for_slug).
|
||||
# shellcheck disable=SC1091
|
||||
# shellcheck source=lib/aws_leak_check.sh
|
||||
source "$(dirname "$0")/lib/aws_leak_check.sh"
|
||||
|
||||
CURL_COMMON=(-sS --fail-with-body --max-time 30)
|
||||
|
||||
# ─── cleanup trap ───────────────────────────────────────────────────────
|
||||
# Identical teardown contract to test_staging_full_saas.sh: delete the
|
||||
# tenant (synchronous GDPR cascade), poll for the org row to disappear, then
|
||||
# assert no slug-tagged EC2 survives. A leaked resource at teardown is a CI
|
||||
# failure (exit 4). The trap is installed UP-FRONT so a mid-test failure
|
||||
# (including a failed PRIMARY assertion) still cleans up.
|
||||
CLEANUP_DONE=0
|
||||
# Teardown handler: delete the tenant for $SLUG, wait for the org row to
# disappear (or read status 'purged'), then verify no slug-tagged EC2 survives.
#
# Runs at most once (guarded by CLEANUP_DONE) and is a no-op when
# E2E_KEEP_ORG=1. Exits 4 when the org or an EC2 is still present after
# teardown (2 when the EC2 leak check itself returns 2). When teardown is
# clean, an upstream exit status outside {0,1,2,3,4} is normalised to 1.
cleanup_org() {
  # Capture upstream exit code IMMEDIATELY — must be the first statement in
  # the trap, before any command (including the CLEANUP_DONE check) clobbers $?.
  local entry_rc=$?

  if [ "$CLEANUP_DONE" = "1" ]; then return 0; fi
  CLEANUP_DONE=1

  if [ "${E2E_KEEP_ORG:-0}" = "1" ]; then
    log "E2E_KEEP_ORG=1 — skipping teardown. Manually delete $SLUG when done."
    return 0
  fi

  log "🧹 Tearing down org $SLUG..."

  # 120s curl budget for the synchronous DELETE cascade (EC2 terminate alone
  # is 30-60s), then poll up to 60s for organizations.status='purged'/gone.
  if curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \
      -H "Authorization: Bearer $ADMIN_TOKEN" \
      -H "Content-Type: application/json" \
      -d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1; then
    ok "Teardown request accepted"
  else
    log "Teardown returned non-2xx (may already be gone)"
  fi

  # leak_count starts at 1 so an unreachable/unparseable org list is treated
  # as "still present" rather than as a clean teardown.
  local leak_count=1
  local elapsed=0
  while [ "$elapsed" -lt 60 ]; do
    leak_count=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/admin/orgs" \
      -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
      | python3 -c "import json,sys; d=json.load(sys.stdin); print(sum(1 for o in d.get('orgs', []) if o.get('slug')=='$SLUG' and o.get('status') != 'purged'))" \
      2>/dev/null || echo 1)
    if [ "$leak_count" = "0" ]; then
      break
    fi
    sleep 5
    elapsed=$((elapsed + 5))
  done

  if [ "$leak_count" != "0" ]; then
    echo "⚠️ LEAK: org $SLUG still present post-teardown after ${elapsed}s (count=$leak_count)" >&2
    exit 4
  fi

  local aws_leak_rc=0
  e2e_verify_no_ec2_leaks_for_slug "$SLUG" || aws_leak_rc=$?
  if [ "$aws_leak_rc" != "0" ]; then
    case "$aws_leak_rc" in
      2) exit 2 ;;
      *) exit 4 ;;
    esac
  fi
  ok "Teardown clean — no orphan org or EC2 resources for $SLUG (${elapsed}s)"

  # Normalize unexpected upstream exit codes to 1 — `set -e` propagates the
  # raw exit code of the failing command (e.g. curl exits 22 under
  # --fail-with-body), but this script's contract only emits {0,1,2,3,4}.
  case "$entry_rc" in
    0|1|2|3|4) ;;
    *) exit 1 ;;
  esac
}

# EXIT: run the teardown and keep the script's own exit status.
trap cleanup_org EXIT
# INT/TERM: a signal handler that merely returns lets bash RESUME the script
# after the interrupted command. cleanup_org returns normally whenever the
# status it captured is 0-4 (or E2E_KEEP_ORG=1), which would leave the test
# polling a tenant that was just deleted. Always exit after a signal; the
# EXIT trap then re-enters cleanup_org, which is a no-op via CLEANUP_DONE.
trap 'cleanup_org; exit 1' INT TERM
|
||||
|
||||
# ─── 0. Preflight ───────────────────────────────────────────────────────
|
||||
log "═══════════════════════════════════════════════════════════════════"
|
||||
log " Staging reconciler-heals-terminated-instance E2E (core#2261)"
|
||||
log " CP: $CP_URL"
|
||||
log " Slug: $SLUG"
|
||||
log " Runtime: $RUNTIME"
|
||||
log " Online timeout: ${WORKSPACE_ONLINE_TIMEOUT_SECS}s"
|
||||
log " PRIMARY (offline): ${RECONCILE_OFFLINE_TIMEOUT_SECS}s"
|
||||
log " SECONDARY (reprov): ${REPROVISION_TIMEOUT_SECS}s"
|
||||
log "═══════════════════════════════════════════════════════════════════"
|
||||
|
||||
log "0/6 Preflight: CP reachable?"
|
||||
curl "${CURL_COMMON[@]}" "$CP_URL/health" >/dev/null || fail "CP health check failed"
|
||||
ok "CP reachable"
|
||||
|
||||
# Send an authenticated JSON request to the control plane.
#   $1     HTTP method
#   $2     path appended to $CP_URL
#   $3...  extra curl arguments, passed through untouched
# Uses the shared CURL_COMMON options and the ADMIN_TOKEN bearer; the response
# body goes to stdout and curl's exit status is the function's status.
admin_call() {
  local method="$1" path="$2"
  shift
  shift
  curl "${CURL_COMMON[@]}" \
    -X "$method" \
    "$CP_URL$path" \
    -H "Authorization: Bearer $ADMIN_TOKEN" \
    -H "Content-Type: application/json" \
    "$@"
}
|
||||
|
||||
# ─── 1. Create org ──────────────────────────────────────────────────────
|
||||
log "1/6 Creating org $SLUG via /cp/admin/orgs..."
|
||||
CREATE_RESP=$(admin_call POST /cp/admin/orgs \
|
||||
-d "{\"slug\":\"$SLUG\",\"name\":\"E2E $SLUG\",\"owner_user_id\":\"e2e-runner:$SLUG\"}")
|
||||
echo "$CREATE_RESP" | python3 -m json.tool >/dev/null || fail "Org create returned non-JSON: $CREATE_RESP"
|
||||
ORG_ID=$(echo "$CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))")
|
||||
[ -z "$ORG_ID" ] && fail "Org create response missing 'id': $CREATE_RESP"
|
||||
ok "Org created (id=$ORG_ID)"
|
||||
|
||||
# ─── 2. Wait for tenant provisioning ────────────────────────────────────
|
||||
# Step 2: poll the control-plane org list until this slug's instance_status
# is 'running'. Both ways this step can fail — the instance reporting
# 'failed' and the wait exceeding PROVISION_TIMEOUT_SECS — exit with status 3,
# the code this script documents for provisioning problems.
log "2/6 Waiting for tenant provisioning (up to ${PROVISION_TIMEOUT_SECS}s)..."
DEADLINE=$(( $(date +%s) + PROVISION_TIMEOUT_SECS ))
LAST_STATUS=""
while true; do
  if [ "$(date +%s)" -gt "$DEADLINE" ]; then
    # Not fail(): fail() exits 1, but a provisioning timeout is documented
    # as exit 3 in the script header. Same message format as fail().
    echo "[$(date +%H:%M:%S)] ❌ Tenant provisioning timed out after ${PROVISION_TIMEOUT_SECS}s (last: $LAST_STATUS)" >&2
    exit 3
  fi
  # A failed list call degrades to an empty org list so the loop keeps polling.
  LIST_JSON=$(admin_call GET /cp/admin/orgs 2>/dev/null || echo '{"orgs":[]}')
  # /cp/admin/orgs exposes 'instance_status' (org_instances.status), NOT 'status'.
  STATUS=$(echo "$LIST_JSON" | python3 -c "
import json, sys
d = json.load(sys.stdin)
for o in d.get('orgs', []):
    if o.get('slug') == '$SLUG':
        print(o.get('instance_status', ''))
        sys.exit(0)
print('')
" 2>/dev/null || echo "")
  # Log only on transitions to keep the output readable.
  if [ "$STATUS" != "$LAST_STATUS" ]; then
    log " status → $STATUS"
    LAST_STATUS="$STATUS"
  fi
  case "$STATUS" in
    running) break ;;
    failed)
      log "── DIAGNOSTIC BURST (step 2 — tenant provisioning failed) ──"
      # Best-effort dump of the org row. `|| true` keeps a broken diagnostic
      # from aborting under `set -e`/pipefail before the intended `exit 3`.
      echo "$LIST_JSON" | python3 -c "
import json, sys
d = json.load(sys.stdin)
for o in d.get('orgs', []):
    if o.get('slug') == '$SLUG':
        print(json.dumps(o, indent=2))
        sys.exit(0)
print('(no org row found for slug=$SLUG — DB drift?)')
" 2>&1 | sed 's/^/ /' || true
      log "── END DIAGNOSTIC ──"
      # Tenant provisioning failures are a CP-side fault, not a reconciler
      # regression — exit 3 (provisioning) to keep the signal honest.
      echo "[$(date +%H:%M:%S)] ❌ Tenant provisioning failed for $SLUG (see diagnostic above)" >&2
      exit 3
      ;;
    *) sleep 15 ;;
  esac
done
ok "Tenant provisioning complete"
|
||||
|
||||
# Derive tenant domain from CP hostname (same logic as the full-saas harness).
|
||||
CP_HOST=$(echo "$CP_URL" | sed -E 's#^https?://##; s#/.*$##')
|
||||
case "$CP_HOST" in
|
||||
api.*) DERIVED_DOMAIN="${CP_HOST#api.}" ;;
|
||||
staging-api.*) DERIVED_DOMAIN="staging.${CP_HOST#staging-api.}" ;;
|
||||
*) DERIVED_DOMAIN="$CP_HOST" ;;
|
||||
esac
|
||||
TENANT_DOMAIN="${MOLECULE_TENANT_DOMAIN:-$DERIVED_DOMAIN}"
|
||||
TENANT_URL="https://$SLUG.$TENANT_DOMAIN"
|
||||
log " TENANT_URL=$TENANT_URL"
|
||||
|
||||
# ─── 3. Retrieve per-tenant admin token ────────────────────────────────
|
||||
log "3/6 Fetching per-tenant admin token..."
|
||||
TENANT_TOKEN_RESP=$(admin_call GET "/cp/admin/orgs/$SLUG/admin-token")
|
||||
TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('admin_token',''))" 2>/dev/null || echo "")
|
||||
[ -z "$TENANT_TOKEN" ] && fail "Could not retrieve per-tenant admin token for $SLUG"
|
||||
ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})"
|
||||
|
||||
# Wait for tenant TLS / DNS propagation before any tenant API call.
|
||||
log " Waiting for tenant TLS / DNS propagation..."
|
||||
TLS_DEADLINE=$(( $(date +%s) + 15 * 60 ))
|
||||
while true; do
|
||||
if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then
|
||||
break
|
||||
fi
|
||||
if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then
|
||||
fail "Tenant URL never responded 2xx on /health within 15m"
|
||||
fi
|
||||
sleep 5
|
||||
done
|
||||
ok "Tenant reachable at $TENANT_URL"
|
||||
|
||||
# Send an authenticated request to the tenant API.
#   $1     HTTP method
#   $2     path appended to $TENANT_URL
#   $3...  extra curl arguments, passed through untouched
# Always sends the tenant bearer and the X-Molecule-Org-Id header.
tenant_call() {
  local method="$1" path="$2"
  shift
  shift
  # X-Molecule-Org-Id is REQUIRED — the tenant guard 404s anything without it
  # (it does NOT 403, to hide tenant existence from org scanners).
  curl "${CURL_COMMON[@]}" \
    -X "$method" \
    "$TENANT_URL$path" \
    -H "Authorization: Bearer $TENANT_TOKEN" \
    -H "X-Molecule-Org-Id: $ORG_ID" \
    "$@"
}
|
||||
|
||||
# Helper: read a single field off GET /workspaces/<id>. Echoes '' on any
|
||||
# error so callers can poll without `set -e` aborting on a transient blip.
|
||||
# Print one top-level field of GET /workspaces/<id>.
#   $1  workspace id
#   $2  field name
# Prints an empty string when the request fails, the body is not a JSON
# object, or the field is missing/null/empty — so polling callers never trip
# `set -e` on a transient blip.
ws_field() {
  local wid="$1" field="$2"
  # The field name is handed to python as argv[1] instead of being spliced
  # into the program text, so quotes in it cannot break or alter the snippet.
  tenant_call GET "/workspaces/$wid" 2>/dev/null \
    | python3 -c 'import json, sys; print(json.load(sys.stdin).get(sys.argv[1]) or "")' "$field" 2>/dev/null \
    || echo ""
}
|
||||
|
||||
# ─── 4. Provision ONE workspace ─────────────────────────────────────────
|
||||
# Same secrets-injection priority chain as the full-saas harness so the
|
||||
# FIRST boot reaches online. We never make a completion in this test (the
|
||||
# whole exercise is instance-state, not the LLM), so an absent key is
|
||||
# tolerable — but wiring the same keys keeps boot behaviour identical to the
|
||||
# sibling and avoids a config path that only this test would exercise.
|
||||
SECRETS_JSON='{}'
|
||||
# Platform-managed path (E2E_LLM_PATH=platform, the DEFAULT for this test):
|
||||
# the workspace boots on the CP LLM proxy with NO tenant key, model
|
||||
# moonshot/kimi-k2.6 — the exact create combo test_staging_full_saas.sh uses
|
||||
# successfully. This test only needs the workspace to reach status=online so
|
||||
# it can kill the EC2 and assert the reconciler heals it; it does NOT exercise
|
||||
# a real LLM completion, so the platform path is both sufficient and the one
|
||||
# proven to create cleanly. (The BYOK key paths below 400'd at create — see
|
||||
# the create-failure capture added below — which is why platform is default.)
|
||||
if [ "${E2E_LLM_PATH:-platform}" = "platform" ]; then
|
||||
log " LLM path: PLATFORM-MANAGED (no tenant key; moonshot/kimi-k2.6 via proxy)"
|
||||
SECRETS_JSON='{}'
|
||||
elif [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
||||
SECRETS_JSON=$(python3 -c "import json,os; print(json.dumps({'MINIMAX_API_KEY': os.environ['E2E_MINIMAX_API_KEY']}))")
|
||||
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
|
||||
SECRETS_JSON=$(python3 -c "import json,os; print(json.dumps({'ANTHROPIC_API_KEY': os.environ['E2E_ANTHROPIC_API_KEY']}))")
|
||||
elif [ -n "${E2E_OPENAI_API_KEY:-}" ]; then
|
||||
SECRETS_JSON=$(python3 -c "
|
||||
import json, os
|
||||
k = os.environ['E2E_OPENAI_API_KEY']
|
||||
print(json.dumps({
|
||||
'OPENAI_API_KEY': k,
|
||||
'OPENAI_BASE_URL': 'https://api.openai.com/v1',
|
||||
'MODEL_PROVIDER': 'openai:gpt-4o',
|
||||
'HERMES_INFERENCE_PROVIDER': 'custom',
|
||||
'HERMES_CUSTOM_BASE_URL': 'https://api.openai.com/v1',
|
||||
'HERMES_CUSTOM_API_KEY': k,
|
||||
'HERMES_CUSTOM_API_MODE': 'chat_completions',
|
||||
}))
|
||||
")
|
||||
fi
|
||||
|
||||
E2E_LLM_PATH="${E2E_LLM_PATH:-platform}" MODEL_SLUG=$(E2E_LLM_PATH="${E2E_LLM_PATH:-platform}" pick_model_slug "$RUNTIME")
|
||||
log " MODEL_SLUG=$MODEL_SLUG"
|
||||
|
||||
log "4/6 Provisioning workspace (runtime=$RUNTIME)..."
|
||||
# --fail-with-body makes curl exit non-zero on a 4xx/5xx but STILL writes the
|
||||
# response body to stdout; the `|| { ... }` catches that so the body is printed
|
||||
# instead of `set -e` aborting the command-substitution silently (the old bug
|
||||
# that hid the real HTTP-400 reason). $WS_RESP holds the body either way.
|
||||
# Create the workspace. On a curl failure the `|| { ... }` arm reports the
# curl status together with whatever body was captured in $WS_RESP.
WS_RESP=$(tenant_call POST /workspaces \
  -H "Content-Type: application/json" \
  -d "{\"name\":\"E2E Reconciler\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"$MODEL_SLUG\",\"secrets\":$SECRETS_JSON}") || {
  rc=$?
  fail "Workspace create failed (curl rc=$rc, model=$MODEL_SLUG). Response body: $WS_RESP"
}
# `|| echo ""` matters: under `set -euo pipefail` a body that is not JSON
# makes python exit non-zero, and without the fallback the assignment would
# abort the script silently before the diagnostic `fail` below can print the
# response.
WS_ID=$(echo "$WS_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")
[ -z "$WS_ID" ] && fail "Workspace create response missing 'id' (model=$MODEL_SLUG): $WS_RESP"
|
||||
log " WS_ID=$WS_ID"
|
||||
|
||||
# Wait for the workspace to reach status=online and capture its instance_id.
|
||||
# Poll until the workspace reports status=online AND an instance id is known,
# or fail once WORKSPACE_ONLINE_TIMEOUT_SECS has elapsed. The instance id
# comes from the tenant API when it is surfaced there; otherwise, after
# INSTANCE_ID_GRACE_SECS of being online, from the slug-tagged EC2 listing.
log " Waiting for workspace to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min)..."
ONLINE_DEADLINE=$(( $(date +%s) + WORKSPACE_ONLINE_TIMEOUT_SECS ))
ORIGINAL_INSTANCE_ID=""
ONLINE_SINCE=""
# Grace before falling back to the AWS workspace tag when the tenant API
# does not surface instance_id (observed on staging).
INSTANCE_ID_GRACE_SECS="${E2E_INSTANCE_ID_GRACE_SECS:-45}"
WS_LAST_STATUS=""
while true; do
  if [ "$(date +%s)" -gt "$ONLINE_DEADLINE" ]; then
    WS_LAST_ERR=$(ws_field "$WS_ID" "last_sample_error")
    fail "Workspace $WS_ID never reached status=online within ${WORKSPACE_ONLINE_TIMEOUT_SECS}s (last status=$WS_LAST_STATUS, err=$WS_LAST_ERR)"
  fi
  WS_STATUS=$(ws_field "$WS_ID" "status")
  # Log only on transitions.
  if [ "$WS_STATUS" != "$WS_LAST_STATUS" ]; then
    log " $WS_ID → $WS_STATUS"
    WS_LAST_STATUS="$WS_STATUS"
  fi
  if [ "$WS_STATUS" = "online" ]; then
    # Remember when 'online' was first seen; the grace period counts from here.
    [ -z "$ONLINE_SINCE" ] && ONLINE_SINCE=$(date +%s)
    ORIGINAL_INSTANCE_ID=$(ws_field "$WS_ID" "instance_id")
    if [ -n "$ORIGINAL_INSTANCE_ID" ]; then
      break
    fi
    # The workspace is online but the tenant API does not surface instance_id
    # (observed on staging — the DB has it, the API response omits it). After a
    # short grace, fall back to the AWS workspace-instance tag so the kill step
    # can proceed. The reconciler reads instance_id from the DB and acts on the
    # real EC2 regardless of what the API surfaces, so the AWS-tag instance is
    # the correct kill target. Without this fallback the loop spins to the online
    # deadline and fails with a misleading "never reached online".
    if [ $(( $(date +%s) - ONLINE_SINCE )) -ge "$INSTANCE_ID_GRACE_SECS" ]; then
      # ws-tenant-<slug>-<wsid...> is the workspace EC2 (vs tenant-<slug>).
      # `|| true`: this lookup is only a fallback. Under `set -e`/pipefail a
      # failing AWS call would otherwise abort the whole script right here,
      # silently; an empty result just means "keep polling".
      ORIGINAL_INSTANCE_ID=$(e2e_ec2_instances_for_slug "$SLUG" 2>/dev/null \
        | awk '$2 ~ /^ws-tenant-/ {print $1}' | sort -u | head -1 || true)
      if [ -n "$ORIGINAL_INSTANCE_ID" ]; then
        log " instance_id not surfaced by API after ${INSTANCE_ID_GRACE_SECS}s — using AWS workspace tag: $ORIGINAL_INSTANCE_ID"
        break
      fi
    fi
    log " $WS_ID online but instance_id not populated yet — waiting"
  fi
  # 'failed' is transient on cold boot (bootstrap-watcher deadline vs heartbeat
  # recovery, cp#245). Keep polling; only the deadline hard-fails.
  sleep 10
done
ok "Workspace online (instance_id=$ORIGINAL_INSTANCE_ID)"
|
||||
|
||||
# ─── 5. Kill the EC2 ────────────────────────────────────────────────────
|
||||
# Terminate the EXACT instance the workspace reported. Prefer the captured
|
||||
# instance_id (precise — kills only this workspace's box); fall back to the
|
||||
# slug-tag describe if the API didn't surface an id (shouldn't happen — we
|
||||
# only break out of the online-wait once instance_id is non-empty).
|
||||
log "5/6 KILLING the workspace EC2 to simulate an out-of-band termination..."
|
||||
if ! e2e_aws_creds_available; then
|
||||
fail "AWS CLI/creds unavailable — cannot terminate the EC2 to exercise the reconciler. Set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY (the CI workflow wires these)."
|
||||
fi
|
||||
AWS_REGION_RESOLVED=$(e2e_aws_region)
|
||||
if [ -n "$ORIGINAL_INSTANCE_ID" ]; then
|
||||
log " Terminating $ORIGINAL_INSTANCE_ID in $AWS_REGION_RESOLVED (aws ec2 terminate-instances)..."
|
||||
aws ec2 terminate-instances --region "$AWS_REGION_RESOLVED" --instance-ids "$ORIGINAL_INSTANCE_ID" >/dev/null \
|
||||
|| fail "aws ec2 terminate-instances failed for $ORIGINAL_INSTANCE_ID"
|
||||
KILLED_IDS="$ORIGINAL_INSTANCE_ID"
|
||||
else
|
||||
# Fallback path — find by slug tag and terminate.
|
||||
log " instance_id was empty — falling back to slug-tag describe ($SLUG)..."
|
||||
ROWS=$(e2e_ec2_instances_for_slug "$SLUG" 2>/dev/null || echo "")
|
||||
KILLED_IDS=$(echo "$ROWS" | awk 'NF {print $1}' | sort -u | tr '\n' ' ')
|
||||
[ -n "$KILLED_IDS" ] || fail "No slug-tagged EC2 found for $SLUG — nothing to terminate"
|
||||
log " Terminating $KILLED_IDS in $AWS_REGION_RESOLVED..."
|
||||
e2e_terminate_instances "$KILLED_IDS" || fail "terminate-instances failed for $KILLED_IDS"
|
||||
fi
|
||||
ok "Terminated EC2: $KILLED_IDS — reconciler should now detect the dead instance"
|
||||
|
||||
# ─── 6a. PRIMARY assertion — workspace leaves 'online' ─────────────────
|
||||
# This is THE regression gate for core#2261/#2247. The reconciler runs every
|
||||
# 60s in the tenant's workspace-server; when CPProvisioner.IsRunning returns a
|
||||
# clean "not running" for the terminated EC2, onOffline flips the row off
|
||||
# 'online'. A dead instance that keeps reading 'online' is exactly the bug.
|
||||
# PRIMARY gate: within RECONCILE_OFFLINE_TIMEOUT_SECS the workspace must
# report some non-empty status other than 'online'. Polls every 10s and
# hard-fails if that never happens.
log "6a/6 PRIMARY: asserting workspace leaves 'online' within ${RECONCILE_OFFLINE_TIMEOUT_SECS}s (reconciler heal-detection)..."
OFFLINE_DEADLINE=$(( $(date +%s) + RECONCILE_OFFLINE_TIMEOUT_SECS ))
LEFT_ONLINE=0
REC_LAST_STATUS=""
while [ "$(date +%s)" -le "$OFFLINE_DEADLINE" ]; do
  REC_STATUS=$(ws_field "$WS_ID" "status")
  # Log only on transitions.
  if [[ $REC_STATUS != "$REC_LAST_STATUS" ]]; then
    log " $WS_ID status → ${REC_STATUS:-<empty>}"
    REC_LAST_STATUS="$REC_STATUS"
  fi
  # Any non-online status (offline/provisioning/awaiting_agent/restarting/…)
  # counts: the exact target is not pinned because the row may move straight
  # on into a restart state. An empty read (failed poll) proves nothing.
  if [[ -n $REC_STATUS && $REC_STATUS != "online" ]]; then
    LEFT_ONLINE=1
    ok "PRIMARY held — workspace left 'online' (now '$REC_STATUS') after EC2 termination"
    break
  fi
  sleep 10
done

[ "$LEFT_ONLINE" = "1" ] || fail "PRIMARY FAILED (core#2261 regression): workspace $WS_ID still reads status=online ${RECONCILE_OFFLINE_TIMEOUT_SECS}s after its EC2 ($KILLED_IDS) was terminated. The reconciler did NOT detect the dead instance — a terminated EC2 is masquerading as a healthy workspace."
|
||||
|
||||
# ─── 6b. SECONDARY assertion — auto-reprovision (best-effort) ──────────
|
||||
# The onOffline → RestartByID existing-volume heal should bring the workspace
|
||||
# back to 'online' on a NEW instance_id. This is best-effort: a full EC2 cold
|
||||
# reprovision is a multi-minute path that shares the same boot-flake surface
|
||||
# as the initial provision. If it doesn't finish within the bound we LOG it
|
||||
# clearly but DO NOT fail — the PRIMARY assertion above is the gate.
|
||||
#
|
||||
# FUTURE TIGHTENING (deliberately one edit away): once this reprovision path
|
||||
# is proven reliable on staging, promote the `log "SECONDARY ..."` soft-miss
|
||||
# below to a `fail ...` so a stuck reprovision becomes a hard gate.
|
||||
# SECONDARY check (best-effort): within REPROVISION_TIMEOUT_SECS the
# workspace should be 'online' again with an instance_id that differs from
# ORIGINAL_INSTANCE_ID. A miss is logged, never failed.
log "6b/6 SECONDARY (best-effort): asserting auto-reprovision to online with a NEW instance_id within ${REPROVISION_TIMEOUT_SECS}s..."
REPROV_DEADLINE=$(( $(date +%s) + REPROVISION_TIMEOUT_SECS ))
REPROV_OK=0
REPROV_LAST_STATUS=""
NEW_INSTANCE_ID=""
while [ "$(date +%s)" -le "$REPROV_DEADLINE" ]; do
  RP_STATUS=$(ws_field "$WS_ID" "status")
  # Log only on transitions.
  if [[ $RP_STATUS != "$REPROV_LAST_STATUS" ]]; then
    log " $WS_ID status → ${RP_STATUS:-<empty>}"
    REPROV_LAST_STATUS="$RP_STATUS"
  fi
  if [[ $RP_STATUS == "online" ]]; then
    NEW_INSTANCE_ID=$(ws_field "$WS_ID" "instance_id")
    # 'online' alone is not enough: the id must be present and must have
    # changed. Otherwise keep polling until the reprovision swaps it.
    if [[ -n $NEW_INSTANCE_ID && $NEW_INSTANCE_ID != "$ORIGINAL_INSTANCE_ID" ]]; then
      REPROV_OK=1
      break
    fi
  fi
  sleep 15
done

if [ "$REPROV_OK" = "1" ]; then
  ok "SECONDARY held — auto-reprovisioned to online on NEW instance_id=$NEW_INSTANCE_ID (was $ORIGINAL_INSTANCE_ID)"
  SECONDARY_VERDICT="held"
else
  # Soft-miss: replacing this log with a `fail` is the one edit that would
  # promote the reprovision check to a hard gate.
  log "⚠️ SECONDARY not satisfied within ${REPROVISION_TIMEOUT_SECS}s (status=${REPROV_LAST_STATUS:-<empty>}, instance_id=${NEW_INSTANCE_ID:-<none>}, original=$ORIGINAL_INSTANCE_ID). NOT failing — the PRIMARY heal-detection assertion is the gate; reprovision is a slower, flakier cold path. Promote this to a hard fail once it's proven reliable."
  SECONDARY_VERDICT="soft-miss, logged"
fi

ok "Reconciler live E2E PASSED — PRIMARY heal-detection held (SECONDARY: $SECONDARY_VERDICT)"
|
||||
# Teardown runs via the EXIT trap.
|
||||
@@ -1,124 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# Fail-direction / load-bearing proof for the E2E_REQUIRE_LIVE
|
||||
# fail-closed-on-skip guard in test_staging_full_saas.sh.
|
||||
#
|
||||
# WHY (harden/e2e-staging-saas-failclosed): the staging SaaS E2E is being
|
||||
# hardened to become a HARD merge-gate. A gate that can reach its final `ok`
|
||||
# WITHOUT having actually exercised a provision→online→A2A cycle is a
|
||||
# false-green — it would let a refactor that short-circuits the lifecycle
|
||||
# (or a skip path that swallows it) report PASS. require_live_or_die() is the
|
||||
# guard; this test proves it FAILS (exit 5) when milestones are missing and
|
||||
# PASSES when all fired — the watch-it-fail counterpart the dev-SOP requires.
|
||||
#
|
||||
# Runs entirely offline (no LLM, no network, no provisioning) — pure shell
|
||||
# logic — so it can run on every PR in the fast lane and locally via `bash`.
|
||||
set -uo pipefail
|
||||
|
||||
# Scratch dir for the generated guard-runner stubs. EXIT trap guarantees
|
||||
# cleanup even when an assertion exits the test non-zero (lint_cleanup_traps).
|
||||
TMPDIR_E2E=$(mktemp -d -t require-live-guard-XXXXXX)
|
||||
trap 'rm -rf "$TMPDIR_E2E"' EXIT INT TERM
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
# Reproduce the EXACT guard logic from test_staging_full_saas.sh. Kept in
|
||||
# lockstep with the host script: if the host logic changes, this test must
|
||||
# change with it (and a divergence is itself a signal to re-prove the gate).
|
||||
make_guard_runner() {
  # Print, on stdout, the shell source of the guard under test: the
  # REQUIRE_LIVE flag, the LIVE_MILESTONES accumulator, live_milestone()
  # and require_live_or_die(). The heredoc delimiter is quoted, so the
  # payload is emitted literally — no expansion happens here; the text is
  # only evaluated later, inside the stub script that embeds it.
  cat <<'GUARD_SOURCE'
REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
LIVE_MILESTONES=""
live_milestone() {
case " $LIVE_MILESTONES " in
*" $1 "*) ;;
*) LIVE_MILESTONES="$LIVE_MILESTONES $1" ;;
esac
}
require_live_or_die() {
[ "$REQUIRE_LIVE" = "1" ] || return 0
local required="provisioned tenant_online workspace_online a2a_roundtrip"
local m missing=""
for m in $required; do
case " $LIVE_MILESTONES " in
*" $m "*) ;;
*) missing="$missing $m" ;;
esac
done
if [ -n "$missing" ]; then
echo "MISSING:${missing}" >&2
exit 5
fi
}
GUARD_SOURCE
}
|
||||
|
||||
# run_case <E2E_REQUIRE_LIVE value> <space-separated milestones to stamp>
|
||||
# echoes the observed exit code.
|
||||
run_case() {
  # $1 = value exported as E2E_REQUIRE_LIVE for the stub
  # $2 = space-separated milestone names to stamp before the guard runs
  # Builds a throwaway stub script (guard source + stamps + guard call),
  # runs it in a fresh bash with all output discarded, and prints the
  # stub's exit code on stdout.
  local live_flag="$1"; shift
  local stamps="$1"; shift || true
  local script_path rc name

  script_path=$(mktemp "$TMPDIR_E2E/stub.XXXXXX")
  {
    printf '%s\n' "#!/usr/bin/env bash" "set -uo pipefail"
    make_guard_runner
    # Deliberately unquoted: the milestone list is split on whitespace.
    for name in $stamps; do
      printf 'live_milestone %s\n' "$name"
    done
    printf '%s\n' "require_live_or_die" 'echo REACHED_END'
  } > "$script_path"

  E2E_REQUIRE_LIVE="$live_flag" bash "$script_path" >/dev/null 2>&1
  rc=$?

  rm -f "$script_path"
  printf '%s\n' "$rc"
}
|
||||
|
||||
assert_rc() {
  # $1 = case label, $2 = E2E_REQUIRE_LIVE value, $3 = milestones to stamp,
  # $4 = expected exit code. Runs the case through run_case and bumps the
  # global PASS or FAIL counter; a mismatch is reported on stderr.
  local label="$1" require_live="$2" milestones="$3" expected="$4"
  local got
  got=$(run_case "$require_live" "$milestones")

  if [ "$got" != "$expected" ]; then
    echo " ✗ $label: REQUIRE_LIVE=$require_live milestones='$milestones' expected=$expected OBSERVED=$got" >&2
    FAIL=$((FAIL+1))
    return 0
  fi

  echo " ✓ $label: REQUIRE_LIVE=$require_live milestones='$milestones' → rc=$got"
  PASS=$((PASS+1))
}
|
||||
|
||||
echo "=== E2E_REQUIRE_LIVE fail-closed-on-skip guard proof ==="
|
||||
echo
|
||||
|
||||
# DECISIVE (false-green trap): REQUIRE_LIVE=1 but NO lifecycle ran → exit 5.
|
||||
assert_rc "require-live, nothing ran → exit 5 (the false-green trap)" \
|
||||
1 "" 5
|
||||
|
||||
# REQUIRE_LIVE=1 with a partial lifecycle (provisioned but no A2A) → exit 5.
|
||||
assert_rc "require-live, partial lifecycle → exit 5" \
|
||||
1 "provisioned tenant_online workspace_online" 5
|
||||
|
||||
# REQUIRE_LIVE=1 with every required milestone → pass (rc=0).
|
||||
assert_rc "require-live, full lifecycle → pass" \
|
||||
1 "provisioned tenant_online workspace_online a2a_roundtrip" 0
|
||||
|
||||
# Idempotency: duplicate stamps don't break membership; full set still passes.
|
||||
assert_rc "require-live, duplicate stamps still pass" \
|
||||
1 "provisioned provisioned tenant_online workspace_online a2a_roundtrip a2a_roundtrip" 0
|
||||
|
||||
# Guard is a no-op when CI did not demand a live run: a non-live local run
|
||||
# with nothing stamped must NOT exit 5 (we don't break local/debug runs).
|
||||
assert_rc "no require-live, nothing ran → pass (guard is opt-in)" \
|
||||
0 "" 0
|
||||
assert_rc "require-live unset-equivalent (0), partial → pass" \
|
||||
0 "provisioned" 0
|
||||
|
||||
# Extra unknown milestone is harmless as long as required set is present.
|
||||
assert_rc "require-live, extra milestone tolerated" \
|
||||
1 "provisioned tenant_online workspace_online a2a_roundtrip extra_thing" 0
|
||||
|
||||
echo
|
||||
echo "=== Results: $PASS passed, $FAIL failed ==="
|
||||
[ "$FAIL" -eq 0 ]
|
||||
@@ -1,114 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# Fail-direction / load-bearing proof for the E2E_REQUIRE_LIVE zero-validated
|
||||
# gate in test_priority_runtimes_e2e.sh (the REQUIRED `E2E API Smoke Test`).
|
||||
#
|
||||
# WHY (harden/enforce-ci-gates-core-v2, PR #2286): the priority-runtimes E2E's
|
||||
# only historical exit gate was `[ "$FAIL" -eq 0 ]`. When every runtime SKIPs
|
||||
# because no live secret is present — exactly what the CI step did — PASS=0
|
||||
# FAIL=0 and the script exited 0 (GREEN) while validating ZERO runtimes. The
|
||||
# REQUIRED merge gate was therefore false-green: passing without exercising a
|
||||
# single runtime. The fix adds a VALIDATED counter and makes a zero-validated
|
||||
# run RED when E2E_REQUIRE_LIVE is set.
|
||||
#
|
||||
# That zero-validated→RED decision lives in evaluate_require_live_gate() in
|
||||
# test_priority_runtimes_e2e.sh. CI cannot prove it via a live arm — the CI
|
||||
# substrate can't provision ANY runtime end-to-end (MiniMax 422, mock org-
|
||||
# import create fails, claude-code needs a key CI lacks), so the live e2e-api
|
||||
# job does NOT force E2E_REQUIRE_LIVE (that would red the required gate for
|
||||
# everyone). This UNIT test is the regression coverage instead: it drives the
|
||||
# REAL evaluate_require_live_gate() function — not a copy — in isolation by
|
||||
# sourcing the script with E2E_PRIORITY_UNIT_SOURCE=1 (which stops before any
|
||||
# platform I/O), setting the counters, and asserting the gate's return code.
|
||||
#
|
||||
# Because it exercises the actual function, a future revert of the zero-
|
||||
# validated→RED logic in test_priority_runtimes_e2e.sh fails THIS test on
|
||||
# every PR — so the false-green can't silently come back.
|
||||
#
|
||||
# Runs entirely offline (no LLM, no network, no provisioning) — pure shell
|
||||
# logic — so it runs on every PR in the fast lane and locally via `bash`.
|
||||
set -uo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
GATE_SCRIPT="$SCRIPT_DIR/test_priority_runtimes_e2e.sh"
|
||||
|
||||
if [ ! -f "$GATE_SCRIPT" ]; then
|
||||
echo "FATAL: cannot find $GATE_SCRIPT" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
# run_case <E2E_REQUIRE_LIVE value> <VALIDATED count> <FAIL count>
|
||||
# Sources the REAL test_priority_runtimes_e2e.sh under the unit source-guard
|
||||
# (E2E_PRIORITY_UNIT_SOURCE=1 → it returns right after defining the counters
|
||||
# and evaluate_require_live_gate(), before _lib.sh / the live pre-sweep curl),
|
||||
# sets the counters to the scenario, calls the real gate, and echoes the
|
||||
# return code. Each case runs in a fresh `bash -c` so set -e/-u inside the
|
||||
# sourced script can't leak between cases or kill this harness.
|
||||
run_case() {
  # $1 = E2E_REQUIRE_LIVE value, $2 = VALIDATED count, $3 = FAIL count.
  # Sources $GATE_SCRIPT in a fresh bash with E2E_PRIORITY_UNIT_SOURCE=1,
  # seeds the two counters, calls evaluate_require_live_gate with its
  # output discarded, and prints that call's return code on stdout.
  local live_flag="$1" validated_count="$2" fail_count="$3"
  local child_script rc

  # Single-quoted on purpose: every $VAR below is expanded by the child
  # shell from its environment, not by this harness.
  child_script='
set -uo pipefail
# shellcheck disable=SC1090
source "$GATE_SCRIPT" # returns at the source-guard (no platform I/O)
VALIDATED="$VAL"
FAIL="$FL"
evaluate_require_live_gate >/dev/null 2>&1
exit $?
'

  E2E_PRIORITY_UNIT_SOURCE=1 E2E_REQUIRE_LIVE="$live_flag" \
    GATE_SCRIPT="$GATE_SCRIPT" VAL="$validated_count" FL="$fail_count" \
    bash -c "$child_script"
  rc=$?

  echo "$rc"
}
|
||||
|
||||
assert_rc() {
  # $1 = case label, $2 = E2E_REQUIRE_LIVE value, $3 = VALIDATED count,
  # $4 = FAIL count fed to the gate, $5 = expected gate return code.
  # Bumps the harness-level PASS or FAIL counter; a mismatch is reported
  # on stderr.
  local label="$1" require_live="$2" validated="$3" failcount="$4" expected="$5"
  local got
  got=$(run_case "$require_live" "$validated" "$failcount")

  if [ "$got" != "$expected" ]; then
    echo " ✗ $label: REQUIRE_LIVE=$require_live VALIDATED=$validated FAIL=$failcount expected=$expected OBSERVED=$got" >&2
    FAIL=$((FAIL + 1))
    return 0
  fi

  echo " ✓ $label: REQUIRE_LIVE=$require_live VALIDATED=$validated FAIL=$failcount → rc=$got"
  PASS=$((PASS + 1))
}
|
||||
|
||||
echo "=== E2E_REQUIRE_LIVE priority-runtimes zero-validated gate proof ==="
|
||||
echo " (drives the REAL evaluate_require_live_gate from $GATE_SCRIPT)"
|
||||
echo
|
||||
|
||||
# (a) DECISIVE false-green trap: REQUIRE_LIVE=1 + zero validated → RED (exit 1).
|
||||
assert_rc "require-live, zero validated → RED (the false-green trap)" \
|
||||
1 0 0 1
|
||||
|
||||
# (b) REQUIRE_LIVE=1 + at least one validated → GREEN (exit 0).
|
||||
assert_rc "require-live, one validated → GREEN" \
|
||||
1 1 0 0
|
||||
assert_rc "require-live, several validated → GREEN" \
|
||||
1 3 0 0
|
||||
|
||||
# (c) REQUIRE_LIVE unset-equivalent (0) + zero validated → GREEN (loud skip).
|
||||
assert_rc "no require-live, zero validated → GREEN (dev-convenience loud skip)" \
|
||||
0 0 0 0
|
||||
|
||||
# REQUIRE_LIVE=true (string form) is also honoured by the gate.
|
||||
assert_rc "require-live='true', zero validated → RED" \
|
||||
true 0 0 1
|
||||
|
||||
# A real FAIL is always red, regardless of REQUIRE_LIVE / VALIDATED — the
|
||||
# zero-validated guard must not mask (nor be masked by) a genuine failure.
|
||||
assert_rc "real FAIL with validations, no require-live → RED" \
|
||||
0 2 1 1
|
||||
assert_rc "real FAIL, zero validated, no require-live → RED" \
|
||||
0 0 1 1
|
||||
|
||||
echo
|
||||
echo "=== Results: $PASS passed, $FAIL failed ==="
|
||||
[ "$FAIL" -eq 0 ]
|
||||
@@ -40,25 +40,9 @@
|
||||
# E2E_INTENTIONAL_FAILURE 1 → break a step on purpose to verify
|
||||
# the EXIT trap still tears down (mirrors
|
||||
# the full-saas harness's safety net).
|
||||
# E2E_REQUIRE_LIVE 1 → fail-closed if the harness exits 0
|
||||
# WITHOUT having driven all four
|
||||
# awaiting_agent transitions. CI sets this
|
||||
# so a future skip / early-return can never
|
||||
# masquerade as a green run. Mirrors CP
|
||||
# serving-e2e SERVING_E2E_REQUIRE_LIVE.
|
||||
# E2E_STALE_POLL_DEADLINE_SECS default 240. Upper bound for the
|
||||
# heartbeat-staleness READINESS poll (step
|
||||
# 6). Replaces the old fixed sleep+one-shot
|
||||
# assert that raced the sweep cadence.
|
||||
# E2E_TRANSIENT_RETRIES default 8. Bounded retries for register /
|
||||
# re-register against transient edge errors
|
||||
# (502/503/504 from Caddy during cold TLS /
|
||||
# agent boot). Mirrors the full-saas
|
||||
# cold-start retry loop — NOT a bare sleep.
|
||||
#
|
||||
# Exit codes: 0 happy, 1 generic, 2 missing env, 3 provision timeout,
|
||||
# 4 teardown leak, 5 REQUIRE_LIVE violation (exited 0 having validated
|
||||
# nothing).
|
||||
# 4 teardown leak.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -67,13 +51,6 @@ ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway s
|
||||
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
|
||||
RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
|
||||
STALE_WAIT_SECS="${E2E_STALE_WAIT_SECS:-180}"
|
||||
# Readiness-poll deadline for the sweep transition (step 6). Must exceed
|
||||
# STALE_WAIT_SECS (the no-heartbeat window) by at least one sweep
|
||||
# interval so a slightly-late sweep tick is polled-for, not misread as a
|
||||
# stuck 'online'. 240 = 180s window + 60s sweep-cadence headroom.
|
||||
STALE_POLL_DEADLINE_SECS="${E2E_STALE_POLL_DEADLINE_SECS:-240}"
|
||||
TRANSIENT_RETRIES="${E2E_TRANSIENT_RETRIES:-8}"
|
||||
REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
|
||||
|
||||
SLUG="e2e-ext-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
|
||||
SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32)
|
||||
@@ -82,66 +59,6 @@ log() { echo "[$(date +%H:%M:%S)] $*"; }
|
||||
fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
|
||||
ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; }
|
||||
|
||||
# REQUIRE_LIVE bookkeeping: count the four awaiting_agent transitions the
|
||||
# test is contracted to prove. The EXIT trap fails-closed (exit 5) if the
|
||||
# script reaches a clean exit without all four — so a silent skip, an
|
||||
# early `return 0`, or a refactor that drops a step can never show green.
|
||||
TRANSITIONS_VERIFIED=0
|
||||
EXPECTED_TRANSITIONS=4
|
||||
require_transition() {
  # $1 = human-readable label of the transition that was just proven.
  # Increments the global TRANSITIONS_VERIFIED counter, then logs progress
  # against EXPECTED_TRANSITIONS.
  : $(( TRANSITIONS_VERIFIED += 1 ))
  local proven_label="$1"
  log " [require-live] transition ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} proven: ${proven_label}"
}
|
||||
|
||||
# Redact bearer tokens from any HTTP body before logging (mirrors the
|
||||
# full-saas sanitize_http_body so transient-error logs never leak creds).
|
||||
sanitize_http_body() {
  # Filter: reads an HTTP body on stdin and writes it to stdout with any
  # credential that follows the word "Bearer" or "token" (plus whitespace)
  # replaced by the literal REDACTED.
  #
  # The credential pattern is the RFC 6750 b64token grammar:
  #   1*( ALPHA / DIGIT / "-" / "." / "_" / "~" / "+" / "/" ) *"="
  # The previous class omitted "~", "+", "/" and "=", so a base64 secret
  # (e.g. one produced by `openssl rand -base64 32`) was cut at the first
  # such character and the remainder leaked into the logs.
  #
  # '#' is the s-command delimiter because the bracket expression now
  # contains '/'. '=' is only accepted as trailing padding, so text such
  # as "token = value" is left unmodified, exactly as before.
  sed -E 's#(Bearer|token)[[:space:]]+[A-Za-z0-9._~+/-]+=*#\1 REDACTED#g'
}
|
||||
|
||||
# Bounded retry-on-transient for POST /registry/register. The tenant edge
|
||||
# (Caddy) returns 502/503/504 with an identifiable body while TLS / the
|
||||
# workspace agent finishes cold-booting — a single shot here was the
|
||||
# un-named flake (a transient edge error misread as a register failure).
|
||||
# This mirrors the full-saas cold-start loop (test_staging_full_saas.sh
|
||||
# ~L780-816): retry ONLY on a transient TRANSPORT class (5xx + body
|
||||
# match), bounded by TRANSIENT_RETRIES, and FAIL CLOSED (non-zero) once
|
||||
# the budget is spent. It deliberately does NOT retry on a 4xx — that's a
|
||||
# real contract bug (e.g. wrong payload field) and must stay red.
|
||||
# Sets REGISTER_RESP (body + trailing "HTTP_CODE=NNN" line) on success;
|
||||
# returns non-zero (caller `fail`s) when the bounded budget is exhausted.
|
||||
register_with_retry() {
  # $1 = step label used in log lines, $2 = JSON request body.
  # POSTs the body to $TENANT_URL/registry/register up to
  # TRANSIENT_RETRIES times. Returns 0 on HTTP 200; returns 1 on the first
  # non-transient response or once the retry budget is spent. In both
  # cases where a response was received, REGISTER_RESP holds the raw body
  # followed by a trailing "HTTP_CODE=NNN" line.
  local step_name="$1" payload="$2"
  local try http_status reply redacted is_transient

  for try in $(seq 1 "$TRANSIENT_RETRIES"); do
    # errexit is lifted only around curl so a transport failure yields a
    # response to inspect instead of aborting the script.
    set +e
    reply=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST \
      "$TENANT_URL/registry/register" \
      -H "Authorization: Bearer $WS_AUTH_TOKEN" \
      -H "X-Molecule-Org-Id: $ORG_ID" \
      -H "Content-Type: application/json" \
      -d "$payload")
    set -e

    http_status=$(printf '%s' "$reply" | sed -n 's/^HTTP_CODE=//p' | tail -n1)
    if [ -z "$http_status" ]; then
      http_status=000
    fi

    if [ "$http_status" = "200" ]; then
      REGISTER_RESP="$reply"
      return 0
    fi

    redacted=$(printf '%s' "$reply" | sanitize_http_body | head -c 300)

    # Transient means BOTH a gateway-class status AND a recognisable
    # edge/agent-unreachable body; anything else is treated as a real bug.
    is_transient=0
    case "$http_status" in
      502|503|504)
        if echo "$redacted" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream'; then
          is_transient=1
        fi
        ;;
    esac

    if [ "$is_transient" = "1" ]; then
      log " ${step_name} transient $http_status attempt ${try}/${TRANSIENT_RETRIES}: $redacted"
      if [ "$try" -lt "$TRANSIENT_RETRIES" ]; then
        sleep 10
        continue
      fi
    fi

    # Non-transient (4xx, or unrecognized 5xx body), or the last transient
    # attempt: stop and fail closed.
    REGISTER_RESP="$reply"
    return 1
  done

  return 1
}
|
||||
|
||||
CURL_COMMON=(-sS --fail-with-body --max-time 30)
|
||||
|
||||
# ─── cleanup trap (mirrors full-saas) ────────────────────────────────────
|
||||
@@ -181,19 +98,8 @@ cleanup_org() {
|
||||
fi
|
||||
ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)"
|
||||
|
||||
# REQUIRE_LIVE fail-closed gate. Only meaningful on an OTHERWISE-CLEAN
|
||||
# exit (entry_rc==0): a script that completed all steps but somehow did
|
||||
# not register all four transitions (a skip, an early return, a dropped
|
||||
# assertion in a refactor) must NOT report success. A non-zero entry_rc
|
||||
# already carries its own failure semantics — don't mask it with 5.
|
||||
if [ "$entry_rc" = "0" ] && [ "${REQUIRE_LIVE}" = "1" ] \
|
||||
&& [ "$TRANSITIONS_VERIFIED" -lt "$EXPECTED_TRANSITIONS" ]; then
|
||||
echo "❌ REQUIRE_LIVE: exited 0 but only ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} awaiting_agent transitions were proven — refusing to report green." >&2
|
||||
exit 5
|
||||
fi
|
||||
|
||||
case "$entry_rc" in
|
||||
0|1|2|3|4|5) ;;
|
||||
0|1|2|3|4) ;;
|
||||
*) exit 1 ;;
|
||||
esac
|
||||
}
|
||||
@@ -342,7 +248,6 @@ GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
DB_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$DB_STATUS" != "awaiting_agent" ] && fail "DB row status=$DB_STATUS (expected awaiting_agent — migration 046 likely not applied)"
|
||||
ok "DB row stored as awaiting_agent (proof migration 046 applied)"
|
||||
require_transition "create: provisioning → awaiting_agent (DB-verified)"
|
||||
|
||||
# ─── 5. Register the workspace (transitions to online) ──────────────────
|
||||
# Pre-fix this path was actually fine because it writes 'online', a value
|
||||
@@ -372,20 +277,20 @@ log "5/8 Registering workspace via /registry/register..."
|
||||
# url — accepted but not dispatched-to in poll mode, so
|
||||
# example.invalid is a valid sentinel.
|
||||
REGISTER_BODY=$(printf '{"id":"%s","url":"https://example.invalid:443","delivery_mode":"poll","agent_card":{"name":"e2e-ext","skills":[{"id":"echo","name":"Echo"}]}}' "$WS_ID")
|
||||
# Bounded retry-on-transient (see register_with_retry). The previous
|
||||
# single-shot here would `fail` on a cold-boot 502 from the tenant edge —
|
||||
# an un-named transient misread as a register break. The helper retries
|
||||
# ONLY that class and fails closed on a real 4xx or an exhausted budget.
|
||||
REGISTER_RESP=""
|
||||
register_with_retry "register" "$REGISTER_BODY" \
|
||||
|| fail "register returned non-200 after bounded retries — body: $(printf '%s' "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
|
||||
log " register response: $(echo "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
|
||||
# Disable --fail-with-body for this one call so a 4xx surfaces the response
|
||||
# body (the bare CURL_COMMON would `set -e`-kill before we could log it).
|
||||
REGISTER_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
|
||||
-H "Authorization: Bearer $WS_AUTH_TOKEN" \
|
||||
-H "X-Molecule-Org-Id: $ORG_ID" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$REGISTER_BODY") || true
|
||||
log " register response: $(echo "$REGISTER_RESP" | head -c 300)"
|
||||
echo "$REGISTER_RESP" | grep -q "HTTP_CODE=200" || fail "register returned non-200 — see body above"
|
||||
|
||||
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
ONLINE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$ONLINE_STATUS" != "online" ] && fail "Expected online after register, got $ONLINE_STATUS"
|
||||
ok "Workspace transitioned to online"
|
||||
require_transition "register: awaiting_agent → online"
|
||||
|
||||
# Confirm the register handler echoed back delivery_mode=poll. We read
|
||||
# this from the register RESPONSE, not the workspace GET response, because
|
||||
@@ -405,63 +310,38 @@ fi
|
||||
# This is the SECOND silent-failure path (registry/healthsweep.go's
|
||||
# sweepStaleRemoteWorkspaces). Pre-migration-046 the heartbeat-staleness
|
||||
# UPDATE silently failed and the workspace stuck on 'online' forever
|
||||
# even though no agent was alive.
|
||||
#
|
||||
# FLAKE FIX (named: sweep-cadence race). The old code did a FIXED
|
||||
# `sleep $STALE_WAIT_SECS` then a SINGLE assert. The staleness sweep is a
|
||||
# periodic tick (REMOTE_LIVENESS_STALE_AFTER + a sweep interval); if the
|
||||
# tick that flips the row lands even one second after the fixed sleep, the
|
||||
# one-shot GET reads 'online' and the test fails — a real transition,
|
||||
# misread as a flake because the assert was racing the sweep cadence.
|
||||
# Replace with: sleep through the mandatory no-heartbeat window ONCE (the
|
||||
# sweep cannot fire before the window elapses, so polling earlier is
|
||||
# pointless), then READINESS-POLL for the awaiting_agent transition up to
|
||||
# STALE_POLL_DEADLINE_SECS, hard-failing with a clear message at the
|
||||
# deadline. Deterministic: a slow-but-working sweep passes; a genuinely
|
||||
# stuck 'online' still fails (now with how long we actually waited).
|
||||
log "6/8 Waiting ${STALE_WAIT_SECS}s no-heartbeat window, then polling for sweep (up to ${STALE_POLL_DEADLINE_SECS}s total)..."
|
||||
[ "$STALE_POLL_DEADLINE_SECS" -le "$STALE_WAIT_SECS" ] && \
|
||||
fail "Misconfigured: STALE_POLL_DEADLINE_SECS ($STALE_POLL_DEADLINE_SECS) must exceed STALE_WAIT_SECS ($STALE_WAIT_SECS) by at least one sweep interval"
|
||||
# even though no agent was alive. We wait the full window + a sweep
|
||||
# interval and assert the row transitions back to 'awaiting_agent'.
|
||||
log "6/8 Waiting ${STALE_WAIT_SECS}s for heartbeat-staleness sweep (no heartbeat sent)..."
|
||||
sleep "$STALE_WAIT_SECS"
|
||||
|
||||
STALE_DEADLINE=$(( $(date +%s) + (STALE_POLL_DEADLINE_SECS - STALE_WAIT_SECS) ))
|
||||
STALE_STATUS=""
|
||||
while true; do
|
||||
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$STALE_STATUS" = "awaiting_agent" ] && break
|
||||
if [ "$(date +%s)" -gt "$STALE_DEADLINE" ]; then
|
||||
fail "After ${STALE_POLL_DEADLINE_SECS}s with no heartbeat, status still '$STALE_STATUS' (expected awaiting_agent sweep transition) — migration 046 likely not applied OR sweep not running"
|
||||
fi
|
||||
sleep 10
|
||||
done
|
||||
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$STALE_STATUS" != "awaiting_agent" ] && \
|
||||
fail "After ${STALE_WAIT_SECS}s with no heartbeat, expected status=awaiting_agent (sweep transition), got $STALE_STATUS — migration 046 likely not applied OR sweep not running"
|
||||
ok "Heartbeat-staleness sweep transitioned online → awaiting_agent (proof healthsweep.go fix working)"
|
||||
require_transition "sweep: online → awaiting_agent (no heartbeat)"
|
||||
|
||||
# ─── 7. Re-register and confirm we can come back online ─────────────────
|
||||
# This proves the awaiting_agent state is recoverable (re-registrable),
|
||||
# which is the whole point of using it instead of 'offline'.
|
||||
log "7/8 Re-registering after stale → confirming recovery to online..."
|
||||
# Same payload contract as step 5 (id + agent_card both required). See note
|
||||
# there for why workspace_id would 400. Same bounded retry-on-transient.
|
||||
REGISTER_RESP=""
|
||||
register_with_retry "re-register" "$REGISTER_BODY" \
|
||||
|| fail "re-register returned non-200 after bounded retries — body: $(printf '%s' "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
|
||||
log " re-register response: $(echo "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
|
||||
# there for why workspace_id would 400.
|
||||
REREG_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
|
||||
-H "Authorization: Bearer $WS_AUTH_TOKEN" \
|
||||
-H "X-Molecule-Org-Id: $ORG_ID" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$REGISTER_BODY") || true
|
||||
log " re-register response: $(echo "$REREG_RESP" | head -c 300)"
|
||||
echo "$REREG_RESP" | grep -q "HTTP_CODE=200" || fail "re-register returned non-200 — see body above"
|
||||
|
||||
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
RECOVERED_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$RECOVERED_STATUS" != "online" ] && \
|
||||
fail "Expected re-register to return workspace to online, got $RECOVERED_STATUS"
|
||||
ok "Re-register succeeded — awaiting_agent → online (operator-recoverable)"
|
||||
require_transition "re-register: awaiting_agent → online (recovery)"
|
||||
|
||||
# ─── 8. Done — cleanup runs in the EXIT trap ───────────────────────────
|
||||
# REQUIRE_LIVE belt-and-braces: assert here too (in addition to the EXIT
|
||||
# trap) so the failure surfaces in step order, not only post-teardown.
|
||||
if [ "${REQUIRE_LIVE}" = "1" ] && [ "$TRANSITIONS_VERIFIED" -lt "$EXPECTED_TRANSITIONS" ]; then
|
||||
fail "REQUIRE_LIVE: only ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} transitions proven at end of run"
|
||||
fi
|
||||
log "8/8 All four awaiting_agent transitions verified."
|
||||
log "═══════════════════════════════════════════════════════════════════"
|
||||
ok "External-runtime E2E PASSED on $SLUG"
|
||||
|
||||
@@ -47,15 +47,6 @@
|
||||
# tear down cleanly (and exit 4 on leak).
|
||||
# Used by a dedicated sanity workflow
|
||||
# that verifies the safety net.
|
||||
# E2E_REQUIRE_LIVE 1 → fail-closed-on-skip guard (CI sets this).
|
||||
# When set, the run MUST actually complete
|
||||
# ≥1 full provision→online→A2A cycle. A run
|
||||
# that reaches the end without having proven
|
||||
# a real round-trip (e.g. a future refactor
|
||||
# short-circuits a stage, or a skip path
|
||||
# swallows the lifecycle) exits 5 rather than
|
||||
# reporting a false green. Mirrors CP
|
||||
# serving-e2e's SERVING_E2E_REQUIRE_LIVE.
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 happy path
|
||||
@@ -63,37 +54,6 @@
|
||||
# 2 missing required env
|
||||
# 3 provisioning timed out
|
||||
# 4 teardown left orphan resources
|
||||
# 5 E2E_REQUIRE_LIVE set but the run validated no real lifecycle (no
|
||||
# false-green-on-skip)
|
||||
#
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# PROMOTION-READINESS (harden/e2e-staging-saas-failclosed):
|
||||
# This harness is being hardened so `E2E Staging SaaS` + `E2E Staging
|
||||
# Platform Boot` can become HARD merge-gates. continue-on-error is NOT
|
||||
# flipped here — that promotion is the CTO's irreversible branch-protection
|
||||
# call. What this branch makes fail-closed (was false-green / un-named
|
||||
# flake before):
|
||||
# • Provision/online waits are bounded readiness-POLLS, not fixed sleeps;
|
||||
# each hard-fails with a named mechanism + last-seen signal on deadline,
|
||||
# never a silent timeout (cp#245 boot-timeout class).
|
||||
# • Peer-discovery (9b) asserts a real 2xx, not just "not 404" — a 5xx /
|
||||
# 000 / empty no longer reads as "reachable".
|
||||
# • Activity-log (9b) is ASSERTED reachable (2xx + parseable), not
|
||||
# logged-and-ignored behind `|| echo '[]'`.
|
||||
# • Child activity provenance (10) is asserted (was soft-logged).
|
||||
# • E2E_REQUIRE_LIVE=1 (CI) makes the run exit 5 if it reached the end
|
||||
# without proving a real provision→online→A2A round-trip — no
|
||||
# false-green-on-skip.
|
||||
# STILL BLOCKS making it REQUIRED (must clear before the CTO flips
|
||||
# continue-on-error→false in .gitea/workflows/e2e-staging-saas.yml):
|
||||
# • De-flake window: N consecutive green runs on main for BOTH jobs
|
||||
# (platform-boot shares the cp#245 boot surface — #2187 tracks its
|
||||
# flip). This harness removes the harness-side flake mechanisms; the
|
||||
# remaining surface is real-infra (EC2 cold boot, CF DNS) latency,
|
||||
# already bounded by the readiness polls above.
|
||||
# • Branch-protection required-context wiring is a repo-settings change,
|
||||
# not a code change in this PR.
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -130,41 +90,6 @@ log() { echo "[$(date +%H:%M:%S)] $*"; }
|
||||
fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
|
||||
ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; }
|
||||
|
||||
# ─── fail-closed-on-skip live-lifecycle guard ───────────────────────────
|
||||
# E2E_REQUIRE_LIVE=1 (set by CI) asserts this run ACTUALLY exercised a full
|
||||
# provision→online→A2A cycle. Each load-bearing lifecycle stage stamps a
|
||||
# milestone via live_milestone(); at the very end, require_live_or_die()
|
||||
# checks every required milestone fired. Mechanism: without this, a future
|
||||
# refactor that short-circuits a stage — or a skip/early-return path that
|
||||
# swallows the lifecycle — would let the script reach its final `ok` and
|
||||
# report GREEN having validated nothing. Mirrors CP serving-e2e's
|
||||
# SERVING_E2E_REQUIRE_LIVE (skip-if-absent must be LOUD, never silent green).
|
||||
REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
|
||||
LIVE_MILESTONES=""
|
||||
live_milestone() {
  # $1 = milestone name (a single whitespace-free token).
  # Appends the name to the space-delimited LIVE_MILESTONES set unless it
  # is already present, so repeated stamps are harmless.
  local tag="$1"
  if [[ " $LIVE_MILESTONES " != *" $tag "* ]]; then
    LIVE_MILESTONES="$LIVE_MILESTONES $tag"
  fi
}
|
||||
require_live_or_die() {
  # Fail-closed check of the live-lifecycle milestones.
  # Returns 0 immediately unless REQUIRE_LIVE is exactly "1". Otherwise
  # every required milestone must be present in LIVE_MILESTONES; if any is
  # absent, the missing and reached sets are printed on stderr and the
  # script exits with code 5.
  if [ "$REQUIRE_LIVE" != "1" ]; then
    return 0
  fi

  local stage absent=""
  for stage in provisioned tenant_online workspace_online a2a_roundtrip; do
    # Padded with spaces on both sides so only whole tokens match.
    if [[ " $LIVE_MILESTONES " != *" $stage "* ]]; then
      absent="$absent $stage"
    fi
  done

  if [ -z "$absent" ]; then
    return 0
  fi

  echo "[$(date +%H:%M:%S)] ❌ E2E_REQUIRE_LIVE=1 but the run did NOT prove a full live lifecycle — missing milestone(s):${absent}. Reached:${LIVE_MILESTONES:-<none>}. This is a false-green-on-skip guard: a run that validates no real provision→online→A2A cycle MUST NOT report green." >&2
  exit 5
}
|
||||
|
||||
# Per-runtime model slug dispatch — see lib/model_slug.sh for the rationale.
|
||||
# Extracted so unit tests (tests/e2e/test_model_slug.sh) can pin every branch
|
||||
# without booting the full 11-step lifecycle.
|
||||
@@ -272,7 +197,7 @@ cleanup_org() {
|
||||
# case statement, and opens a false-positive priority-high
|
||||
# "safety net broken" issue (#2159, 2026-04-27).
|
||||
case "$entry_rc" in
|
||||
0|1|2|3|4|5) ;; # contracted codes — let bash use entry_rc
|
||||
0|1|2|3|4) ;; # contracted codes — let bash use entry_rc
|
||||
*) exit 1 ;; # anything else is a generic failure
|
||||
esac
|
||||
}
|
||||
@@ -370,7 +295,6 @@ print('(no org row found for slug=$SLUG — DB drift?)')
|
||||
esac
|
||||
done
|
||||
ok "Tenant provisioning complete"
|
||||
live_milestone provisioned
|
||||
|
||||
# Derive tenant domain from CP hostname so the same harness works in
|
||||
# both prod (api.moleculesai.app → moleculesai.app) and staging
|
||||
@@ -427,7 +351,6 @@ while true; do
|
||||
sleep 5
|
||||
done
|
||||
ok "Tenant reachable at $TENANT_URL"
|
||||
live_milestone tenant_online
|
||||
|
||||
# Sanity-test path: once the tenant is provisioned, poisoning the
|
||||
# tenant token proves the EXIT trap + leak assertion still fire.
|
||||
@@ -553,19 +476,7 @@ wait_workspaces_online_routable() {
|
||||
# All empty → '{}' (workspace will fail at first turn with an
|
||||
# expected, actionable auth error rather than masking the test).
|
||||
SECRETS_JSON='{}'
|
||||
# Platform-managed path (E2E_LLM_PATH=platform) — the moonshot/kimi
|
||||
# NOT_CONFIGURED regression (RFC#340 Fix A #2187). Molecule owns billing via the
|
||||
# CP LLM proxy, so the workspace needs NO tenant key: provision with empty
|
||||
# secrets and let the workspace boot purely on (a) the proxy env the control
|
||||
# plane injects + (b) the manifest-derived `provider: platform` Fix A stamps into
|
||||
# the generated config.yaml. This is the path that booted NOT_CONFIGURED in prod
|
||||
# precisely because the BYOK branches below never exercise it. We deliberately
|
||||
# skip the key-injection branches so a stray E2E_*_API_KEY in the runner env
|
||||
# cannot silently convert this into a BYOK run and mask the regression.
|
||||
if [ "${E2E_LLM_PATH:-}" = "platform" ]; then
|
||||
log " LLM path: PLATFORM-MANAGED (no tenant key; proxy + Fix A provider stamp)"
|
||||
SECRETS_JSON='{}'
|
||||
elif [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
||||
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
||||
SECRETS_JSON=$(python3 -c "
|
||||
import json, os
|
||||
k = os.environ['E2E_MINIMAX_API_KEY']
|
||||
@@ -647,7 +558,6 @@ fi
|
||||
WS_TO_CHECK=("$PARENT_ID")
|
||||
[ -n "$CHILD_ID" ] && WS_TO_CHECK+=("$CHILD_ID")
|
||||
wait_workspaces_online_routable "7/11 Waiting for workspace(s) to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min — hermes cold boot)..." "${WS_TO_CHECK[@]}"
|
||||
live_milestone workspace_online
|
||||
|
||||
# ─── 7a. Real chat image upload/download round-trip ───────────────────
|
||||
# This deliberately uses the production workflow: tenant admin/session auth
|
||||
@@ -948,24 +858,6 @@ fi
|
||||
if echo "$AGENT_TEXT" | grep -qiE "exceeded your current quota|insufficient_quota"; then
|
||||
fail "A2A — PROVIDER QUOTA EXHAUSTED (NOT a platform regression). Operator action: top up MOLECULE_STAGING_OPENAI_API_KEY billing or rotate to a higher-quota org at Settings → Secrets and Variables → Actions. Tracked in #2578. Raw: $AGENT_TEXT"
|
||||
fi
|
||||
# Empty-completion class — the agent runtime reached the LLM and got a
|
||||
# 2xx back, but the assistant turn carried NO text part (empty content,
|
||||
# or tool_calls/reasoning-only with no surfaced text), so the runtime
|
||||
# returns the literal "Error: message contained no text content." as its
|
||||
# reply text. Steps 0-7 passing means the platform is healthy (CP up,
|
||||
# tenant provisioned, workspace online + routable, A2A delivery e2e); the
|
||||
# break is the configured completion BACKEND returning an empty turn — a
|
||||
# model/provider-side regression, NOT a workspace-server or harness bug,
|
||||
# and NOT NOT_CONFIGURED (that fails earlier, at boot). Name it explicitly
|
||||
# so the canary alert points at the model, not the platform: a generic
|
||||
# "error-shaped response" misdirects triage to workspace-server. Observed
|
||||
# 2026-06-03/04 across every staging canary on MODEL_SLUG=MiniMax-M2 (the
|
||||
# canary default since #2710) — 100% on the parent's first cold turn,
|
||||
# identical on main's scheduled synthetic E2E and on PRs (so it is an
|
||||
# environmental backend regression, never PR-introduced).
|
||||
if echo "$AGENT_TEXT" | grep -qiF "message contained no text content"; then
|
||||
fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (the claude-code default is minimax:MiniMax-M2.7 since #2263; was bare MiniMax-M2 #2710) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT"
|
||||
fi
|
||||
# Generic catch-all — falls through if none of the known regressions hit.
|
||||
if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then
|
||||
fail "A2A returned an error-shaped response: $AGENT_TEXT"
|
||||
@@ -1030,14 +922,7 @@ for KA_ATTEMPT in $(seq 1 6); do
|
||||
KA_SAFE_BODY=$(printf '%s' "$KA_RESP" | sanitize_http_body)
|
||||
# Retry ONLY on transient transport errors — never on an agent-level
|
||||
# error (those must surface and fail the gate).
|
||||
# #2263: include the Cloudflare-shaped literal `error code: 502/504` token so a
|
||||
# bare edge/gateway 502 (no "Bad Gateway" body) is retried here the same way the
|
||||
# cold-start PONG probe (line ~800) and the delegation loop (line ~1234) already
|
||||
# do. Without it, a single un-retried edge 502 right after a healthy round-trip
|
||||
# fell through to break and failed the gate on the first attempt (Platform Boot
|
||||
# job, task 268859). Bounded by the existing 6-attempt / sleep-10 loop — no new
|
||||
# sleep-as-fix; this only widens the transient-match to the sibling pattern.
|
||||
if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then
|
||||
if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then
|
||||
log " known-answer A2A transient $KA_CODE attempt $KA_ATTEMPT/6: $KA_SAFE_BODY"
|
||||
if [ "$KA_ATTEMPT" -lt 6 ]; then sleep 10; continue; fi
|
||||
fi
|
||||
@@ -1059,11 +944,6 @@ except Exception:
|
||||
" 2>/dev/null || echo "")
|
||||
# CORE GATE: contains PINEAPPLE (real round-trip) AND no error-as-text.
|
||||
a2a_assert_real_completion "$KA_TEXT" "PINEAPPLE" "A2A known-answer (parent, $RUNTIME/$MODEL_SLUG)"
|
||||
# Real, deterministic LLM round-trip proven — the load-bearing milestone for
|
||||
# the fail-closed-on-skip guard. Stamped AFTER a2a_assert_real_completion (not
|
||||
# after the looser PONG check) so the milestone means a verified completion,
|
||||
# not just a 2xx-with-text.
|
||||
live_milestone a2a_roundtrip
|
||||
|
||||
# ─── 8c. byok-routing regression guard (#1994) ─────────────────────────
|
||||
# The parent was provisioned with the customer's OWN vendor key
|
||||
@@ -1131,7 +1011,7 @@ print(json.dumps({
|
||||
'messageId': f'e2e-{uuid.uuid4().hex[:8]}',
|
||||
'parts': [{'kind': 'text', 'text': 'Reply with exactly: ok'}],
|
||||
},
|
||||
'configuration': {'max_tokens': 32}
|
||||
'configuration': {'max_tokens': 4}
|
||||
}
|
||||
}))
|
||||
")
|
||||
@@ -1189,50 +1069,18 @@ print(json.dumps({
|
||||
ok "HMA memory write+read roundtripped"
|
||||
|
||||
log "9b. Peer discovery + activity log smoke..."
|
||||
# FAIL-CLOSED: assert a real 2xx, not merely "not 404". The previous
|
||||
# `[ "$PEERS_CODE" = "404" ] && fail` only caught the route-missing case —
|
||||
# a 5xx, 000 (connection failure), or empty capture ALL fell through to
|
||||
# "reachable" (false-green: a broken-but-present route read as healthy).
|
||||
# Mechanism: route the http_code into its own tempfile (no stderr capture,
|
||||
# which the old `2>&1 | head -1` could pollute with a curl error line) and
|
||||
# require 2xx explicitly.
|
||||
PEERS_TMP=$(e2e_tmp /tmp/e2e_peers.XXXXXX)
|
||||
set +e
|
||||
PEERS_CODE=$(tenant_call GET "/registry/$PARENT_ID/peers" \
|
||||
-o "$PEERS_TMP" -w "%{http_code}" 2>/dev/null)
|
||||
PEERS_RC=$?
|
||||
tenant_call GET "/registry/$PARENT_ID/peers" -o /dev/null -w "%{http_code}\n" 2>&1 | head -1 > /tmp/peers_code.txt
|
||||
set -e
|
||||
PEERS_CODE=${PEERS_CODE:-000}
|
||||
if [ "$PEERS_CODE" = "404" ]; then
|
||||
fail "Peers endpoint missing (404) — route regression. /registry/$PARENT_ID/peers"
|
||||
fi
|
||||
if [ "$PEERS_RC" != "0" ] || [ "$PEERS_CODE" -lt 200 ] || [ "$PEERS_CODE" -ge 300 ]; then
|
||||
fail "Peers endpoint unhealthy (curl_rc=$PEERS_RC, http=$PEERS_CODE) — not a clean 2xx, so 'reachable' would be a false-green. Body: $(head -c 200 "$PEERS_TMP" 2>/dev/null | sanitize_http_body)"
|
||||
fi
|
||||
PEERS_CODE=$(cat /tmp/peers_code.txt)
|
||||
[ "$PEERS_CODE" = "404" ] && fail "Peers endpoint missing (404) — route regression"
|
||||
ok "Peers endpoint reachable (HTTP $PEERS_CODE)"
|
||||
|
||||
# FAIL-CLOSED: the activity-log read was `|| echo '[]'` then the count was
|
||||
# only LOGGED, never asserted — a 5xx / network failure silently became an
|
||||
# empty list and the step exited 0 having validated nothing (false-green:
|
||||
# "validated nothing" class). Assert the endpoint returns a 2xx and a
|
||||
# parseable activity shape. We do NOT assert count>0 (the parent may
|
||||
# legitimately have 0 events this early — that's a real, valid state), but
|
||||
# we DO require the call to have actually succeeded and returned valid JSON.
|
||||
ACTIVITY_TMP=$(e2e_tmp /tmp/e2e_activity.XXXXXX)
|
||||
set +e
|
||||
ACTIVITY_CODE=$(tenant_call GET "/activity?workspace_id=$PARENT_ID&limit=5" \
|
||||
-o "$ACTIVITY_TMP" -w "%{http_code}" 2>/dev/null)
|
||||
ACTIVITY_RC=$?
|
||||
set -e
|
||||
ACTIVITY_CODE=${ACTIVITY_CODE:-000}
|
||||
if [ "$ACTIVITY_RC" != "0" ] || [ "$ACTIVITY_CODE" -lt 200 ] || [ "$ACTIVITY_CODE" -ge 300 ]; then
|
||||
fail "Activity-log endpoint unhealthy (curl_rc=$ACTIVITY_RC, http=$ACTIVITY_CODE) — was previously swallowed by '|| echo []' and reported as 0 events (false-green). Body: $(head -c 200 "$ACTIVITY_TMP" 2>/dev/null | sanitize_http_body)"
|
||||
fi
|
||||
ACTIVITY_COUNT=$(python3 -c "import json,sys
|
||||
d=json.load(open(sys.argv[1]))
|
||||
print(len(d if isinstance(d, list) else d.get('events', [])))" "$ACTIVITY_TMP" 2>/dev/null) \
|
||||
|| fail "Activity-log returned HTTP $ACTIVITY_CODE but body was not parseable JSON (events array / {events:[...]}). Body: $(head -c 200 "$ACTIVITY_TMP" 2>/dev/null | sanitize_http_body)"
|
||||
log " Activity events observed: $ACTIVITY_COUNT (endpoint 2xx + parseable ✓)"
|
||||
ACTIVITY=$(tenant_call GET "/activity?workspace_id=$PARENT_ID&limit=5" 2>/dev/null || echo '[]')
|
||||
ACTIVITY_COUNT=$(echo "$ACTIVITY" | python3 -c "import json,sys
|
||||
d=json.load(sys.stdin)
|
||||
print(len(d if isinstance(d, list) else d.get('events', [])))" 2>/dev/null || echo 0)
|
||||
log " Activity events observed: $ACTIVITY_COUNT"
|
||||
|
||||
# ─── 9c. Workspace KV memory Edit round-trip ─────────────────────────
|
||||
# Pins the Edit affordance added to the canvas Memory tab. The UI calls
|
||||
@@ -1383,44 +1231,14 @@ except Exception:
|
||||
[ -z "$DELEG_TEXT" ] && fail "Delegation returned no text. Raw: ${DELEG_RESP:0:200}"
|
||||
ok "Delegation proxy works (child responded: \"${DELEG_TEXT:0:60}\")"
|
||||
|
||||
# FAIL-CLOSED via bounded readiness-POLL (was soft-logged false-green).
|
||||
# The activity pipeline is async, so an immediate single read can miss the
|
||||
# parent reference — but "did not reference parent" was previously just
|
||||
# LOGGED and the step passed regardless, so a genuinely broken provenance
|
||||
# pipeline (parent never recorded as source) read as success. Mechanism:
|
||||
# poll the child activity log for the parent id for a bounded window
|
||||
# (E2E_CHILD_ACTIVITY_TIMEOUT_SECS, default 60s) — this is the real
|
||||
# readiness signal (provenance row materialised), not a fixed sleep — and
|
||||
# hard-fail with a named mechanism if it never appears.
|
||||
CHILD_ACT_DEADLINE=$(( $(date +%s) + ${E2E_CHILD_ACTIVITY_TIMEOUT_SECS:-60} ))
|
||||
CHILD_ACT_SEEN=0
|
||||
CHILD_ACT_LASTCODE="000"
|
||||
while true; do
|
||||
CHILD_ACT_TMP=$(e2e_tmp /tmp/e2e_child_act.XXXXXX)
|
||||
set +e
|
||||
CHILD_ACT_CODE=$(tenant_call GET "/activity?workspace_id=$CHILD_ID&limit=20" \
|
||||
-o "$CHILD_ACT_TMP" -w "%{http_code}" 2>/dev/null)
|
||||
set -e
|
||||
CHILD_ACT_LASTCODE=${CHILD_ACT_CODE:-000}
|
||||
if grep -q "$PARENT_ID" "$CHILD_ACT_TMP" 2>/dev/null; then
|
||||
CHILD_ACT_SEEN=1
|
||||
break
|
||||
fi
|
||||
[ "$(date +%s)" -ge "$CHILD_ACT_DEADLINE" ] && break
|
||||
sleep 5
|
||||
done
|
||||
if [ "$CHILD_ACT_SEEN" = "1" ]; then
|
||||
CHILD_ACT=$(tenant_call GET "/activity?workspace_id=$CHILD_ID&limit=20" 2>/dev/null || echo '[]')
|
||||
if echo "$CHILD_ACT" | grep -q "$PARENT_ID"; then
|
||||
ok "Child activity log records parent as source"
|
||||
else
|
||||
fail "Child activity log never referenced parent $PARENT_ID within ${E2E_CHILD_ACTIVITY_TIMEOUT_SECS:-60}s (last http=$CHILD_ACT_LASTCODE) — delegation-provenance pipeline regression (parent not recorded as source). Previously soft-logged → false-green."
|
||||
log "Child activity log did not reference parent (pipeline may be async)"
|
||||
fi
|
||||
fi
|
||||
|
||||
# ─── 11. Teardown runs via trap ────────────────────────────────────────
|
||||
# Fail-closed-on-skip: before declaring PASS, assert (when CI demanded a live
|
||||
# run) that every load-bearing lifecycle milestone actually fired. A run that
|
||||
# reaches here without provision→online→A2A having truly happened exits 5
|
||||
# instead of reporting green. Teardown still runs (EXIT trap) on that exit.
|
||||
require_live_or_die
|
||||
log "11/11 All checks passed. Teardown runs via EXIT trap."
|
||||
ok "═══ STAGING $MODE-SAAS E2E PASSED ═══"
|
||||
|
||||
@@ -18,7 +18,6 @@ No network. No live Gitea calls.
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import json
|
||||
import os
|
||||
import textwrap
|
||||
from pathlib import Path
|
||||
@@ -118,31 +117,6 @@ def _write_audit_yaml(tmp_path: Path, required_checks: list[str]) -> Path:
|
||||
return p
|
||||
|
||||
|
||||
def _write_audit_yaml_json(tmp_path: Path, required_checks_json: dict) -> Path:
|
||||
"""Write a synthetic audit-force-merge.yml with REQUIRED_CHECKS_JSON env."""
|
||||
block = json.dumps(required_checks_json, indent=2)
|
||||
text = textwrap.dedent(
|
||||
f"""\
|
||||
name: audit-force-merge
|
||||
on:
|
||||
schedule:
|
||||
- cron: '*/30 * * * *'
|
||||
jobs:
|
||||
audit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Run audit
|
||||
env:
|
||||
REQUIRED_CHECKS_JSON: |
|
||||
{block.replace(chr(10), chr(10) + ' ')}
|
||||
run: bash .gitea/scripts/audit-force-merge.sh
|
||||
"""
|
||||
)
|
||||
p = tmp_path / "audit-force-merge.yml"
|
||||
p.write_text(text, encoding="utf-8")
|
||||
return p
|
||||
|
||||
|
||||
def _make_stub_api(responses: dict):
|
||||
"""Build a fake `api()` callable.
|
||||
|
||||
@@ -389,107 +363,6 @@ def test_happy_path_no_drift(drift_module, tmp_path, monkeypatch):
|
||||
assert findings == [], findings
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# REQUIRED_CHECKS_JSON variant drift tests
|
||||
# --------------------------------------------------------------------------
|
||||
def test_f3a_env_wider_than_protection_json_variant(drift_module, tmp_path, monkeypatch):
    """F3a (JSON variant): the env lists a context that protection lacks."""
    build_ctx = "ci / build (pull_request)"
    ghost_ctx = "ci / ghost (pull_request)"

    ci = _write_ci_yaml(
        tmp_path,
        jobs={"build": {"runs-on": "ubuntu-latest"}},
        sentinel_needs=["build"],
    )
    # The audit env is deliberately wider than branch protection: it names
    # the extra "ghost" context.
    audit = _write_audit_yaml_json(tmp_path, {"main": [build_ctx, ghost_ctx]})
    _patch_paths(drift_module, monkeypatch, ci, audit)

    protection_route = ("GET", "/repos/owner/repo/branch_protections/main")
    protection_reply = (200, {"status_check_contexts": [build_ctx]})
    monkeypatch.setattr(
        drift_module, "api", _make_stub_api({protection_route: protection_reply})
    )

    findings, _ = drift_module.detect_drift("main")
    ghost_findings = [f for f in findings if "F3a" in f and "ghost" in f]
    assert ghost_findings, findings
|
||||
|
||||
|
||||
def test_f3b_protection_wider_than_env_json_variant(drift_module, tmp_path, monkeypatch):
    """F3b (JSON variant): protection lists a context that the env lacks."""
    build_ctx = "ci / build (pull_request)"
    test_ctx = "ci / test (pull_request)"

    jobs = {name: {"runs-on": "ubuntu-latest"} for name in ("build", "test")}
    ci = _write_ci_yaml(tmp_path, jobs=jobs, sentinel_needs=["build", "test"])
    # The audit env only knows about the build context ...
    audit = _write_audit_yaml_json(tmp_path, {"main": [build_ctx]})
    _patch_paths(drift_module, monkeypatch, ci, audit)

    # ... while branch protection additionally requires the test context.
    protection = {"status_check_contexts": [build_ctx, test_ctx]}
    stub = _make_stub_api(
        {("GET", "/repos/owner/repo/branch_protections/main"): (200, protection)}
    )
    monkeypatch.setattr(drift_module, "api", stub)

    findings, _ = drift_module.detect_drift("main")
    assert any("F3b" in f and test_ctx in f for f in findings), findings
|
||||
|
||||
|
||||
def test_happy_path_no_drift_json_variant(drift_module, tmp_path, monkeypatch):
    """Happy path (JSON variant): env and protection agree, so no findings."""
    contexts = [
        "ci / build (pull_request)",
        "ci / test (pull_request)",
        "ci / all-required (pull_request)",
    ]

    jobs = {name: {"runs-on": "ubuntu-latest"} for name in ("build", "test")}
    ci = _write_ci_yaml(tmp_path, jobs=jobs, sentinel_needs=["build", "test"])
    # Separate list copies, so the env side and the protection side never
    # share one list object.
    audit = _write_audit_yaml_json(tmp_path, {"main": list(contexts)})
    _patch_paths(drift_module, monkeypatch, ci, audit)

    protection_reply = (200, {"status_check_contexts": list(contexts)})
    stub = _make_stub_api(
        {("GET", "/repos/owner/repo/branch_protections/main"): protection_reply}
    )
    monkeypatch.setattr(drift_module, "api", stub)

    findings, _ = drift_module.detect_drift("main")
    assert findings == [], findings
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# MUST-FIX 1: find_open_issue must raise on transient HTTP errors
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
@@ -2,15 +2,10 @@ package main
|
||||
|
||||
import "testing"
|
||||
|
||||
// TestResolveBindHost pins the precedence: BIND_ADDR explicit > local-dev
|
||||
// loopback default of 127.0.0.1 > production-shape empty (all interfaces).
|
||||
// TestResolveBindHost pins the precedence: BIND_ADDR explicit > dev-mode
|
||||
// fail-open default of 127.0.0.1 > production-shape empty (all interfaces).
|
||||
//
|
||||
// (harden/no-fail-open-auth) The loopback default is now keyed on
|
||||
// MOLECULE_ENV alone (IsLocalDevEnv), decoupled from ADMIN_TOKEN — a dev box
|
||||
// defaults to loopback even when it provisions an ADMIN_TOKEN. This is
|
||||
// defense-in-depth, not an auth lever; auth is fail-closed in every env.
|
||||
//
|
||||
// Mutation-test invariant: removing the IsLocalDevEnv() branch makes
|
||||
// Mutation-test invariant: removing the IsDevModeFailOpen() branch makes
|
||||
// "no_bindaddr_devmode_unset_admin" fail (returns "" instead of "127.0.0.1").
|
||||
// Removing the BIND_ADDR branch makes "explicit_bindaddr_*" cases fail.
|
||||
func TestResolveBindHost(t *testing.T) {
|
||||
@@ -40,10 +35,7 @@ func TestResolveBindHost(t *testing.T) {
|
||||
bindAddr: "",
|
||||
adminToken: "secret",
|
||||
molEnv: "dev",
|
||||
// harden/no-fail-open-auth: loopback default is keyed on
|
||||
// MOLECULE_ENV alone now — a dev box defaults to loopback even
|
||||
// with ADMIN_TOKEN provisioned (which dev-start.sh now does).
|
||||
want: "127.0.0.1",
|
||||
want: "", // ADMIN_TOKEN flips IsDevModeFailOpen to false → all interfaces
|
||||
},
|
||||
{
|
||||
name: "no_bindaddr_production_env",
|
||||
|
||||
@@ -26,12 +26,11 @@ import (
|
||||
// the update cycle — no ssh, no re-provision, no ops toil.
|
||||
//
|
||||
// Contract (paired with cp-side GET /cp/tenants/config):
|
||||
//
|
||||
// Request: GET {MOLECULE_CP_URL or https://api.moleculesai.app}/cp/tenants/config
|
||||
// Authorization: Bearer <ADMIN_TOKEN>
|
||||
// X-Molecule-Org-Id: <MOLECULE_ORG_ID>
|
||||
// Response: 200 {"MOLECULE_CP_SHARED_SECRET":"…","MOLECULE_CP_URL":"…", …}
|
||||
// 401 on bearer mismatch or unknown org
|
||||
// Request: GET {MOLECULE_CP_URL or https://api.moleculesai.app}/cp/tenants/config
|
||||
// Authorization: Bearer <ADMIN_TOKEN>
|
||||
// X-Molecule-Org-Id: <MOLECULE_ORG_ID>
|
||||
// Response: 200 {"MOLECULE_CP_SHARED_SECRET":"…","MOLECULE_CP_URL":"…", …}
|
||||
// 401 on bearer mismatch or unknown org
|
||||
//
|
||||
// Best-effort: any failure logs and returns — main() keeps booting.
|
||||
// Self-hosted deploys without MOLECULE_ORG_ID or ADMIN_TOKEN set
|
||||
@@ -106,53 +105,3 @@ func refreshEnvFromCP() error {
|
||||
log.Printf("CP env refresh: applied %d values from %s/cp/tenants/config", applied, base)
|
||||
return nil
|
||||
}
|
||||
|
||||
// requiredLLMEnvVars lists the LLM proxy env vars that a managed SaaS
// tenant must have populated once refreshEnvFromCP has run (cp#469,
// tenant proxy-env delivery; Researcher Task #37 / Spec 2 and Task #46,
// the watch-fail-first test).
//
// The names are byte-matched against the control plane's emission in
// tenant_config.go:140-144 (Researcher REQUEST_CHANGES iterate body,
// 3987f59c). They are the LLM-proxy subset of the 8 CP-emitted keys.
// OPENAI_BASE_URL, OPENAI_API_KEY, ANTHROPIC_BASE_URL and
// ANTHROPIC_API_KEY are intentionally absent: they serve direct-to-provider
// fallbacks, a different feature surface that is out of scope for cp#469.
//
// Naming history, kept because each rename fixed a real mismatch:
//   - v2: MOLECULE_LLM_USAGE_URL replaced v1's MOLECULE_LLM_URL.
//   - v3: MOLECULE_LLM_ANTHROPIC_BASE_URL replaced v2's bare
//     ANTHROPIC_BASE_URL, which is a separate CP-emitted key for
//     direct-provider use, not the LLM proxy.
var requiredLLMEnvVars = []string{
	"MOLECULE_LLM_USAGE_TOKEN",
	"MOLECULE_LLM_USAGE_URL", // v2 fix: v1 used MOLECULE_LLM_URL
	"MOLECULE_LLM_BASE_URL",
	"MOLECULE_LLM_ANTHROPIC_BASE_URL", // v3 fix: v2 used ANTHROPIC_BASE_URL (a different key)
}
|
||||
|
||||
// assertManagedTenantHasLLMEnv verifies that, when running as a
|
||||
// managed SaaS tenant (MOLECULE_ORG_ID + ADMIN_TOKEN both set), all
|
||||
// required LLM proxy env vars are populated after refreshEnvFromCP.
|
||||
//
|
||||
// Self-hosted (no orgID/adminToken) is exempt — dev must not be
|
||||
// blocked here. Managed tenants with missing LLM keys fail with
|
||||
// MISSING_CP_LLM_ENV so they do not silently boot with broken proxy
|
||||
// creds. Caller in main.go decides whether to log and continue or
|
||||
// log.Fatalf depending on deployment context.
|
||||
func assertManagedTenantHasLLMEnv() error {
|
||||
if os.Getenv("MOLECULE_ORG_ID") == "" || os.Getenv("ADMIN_TOKEN") == "" {
|
||||
// Self-hosted dev / not yet provisioned — not a managed tenant.
|
||||
return nil
|
||||
}
|
||||
var missing []string
|
||||
for _, k := range requiredLLMEnvVars {
|
||||
if os.Getenv(k) == "" {
|
||||
missing = append(missing, k)
|
||||
}
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
return fmt.Errorf("MISSING_CP_LLM_ENV: required LLM proxy keys not set after refreshEnvFromCP: %v", missing)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@ import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -60,138 +59,6 @@ func TestRefreshEnvFromCP_AppliesCPResponse(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_ManagedTenantRequiresLLMKeys: watch-fail-first
|
||||
// per Researcher Task #46. When running as a managed tenant
|
||||
// (MOLECULE_ORG_ID + ADMIN_TOKEN set), missing LLM proxy env vars
|
||||
// after refreshEnvFromCP MUST surface as MISSING_CP_LLM_ENV, not be
|
||||
// silently accepted. Without this guard, a CP that loses its LLM
|
||||
// creds (e.g. during an incident) would let a tenant boot and then
|
||||
// fail later at first LLM call — worse than a loud refusal here.
|
||||
func TestRefreshEnvFromCP_ManagedTenantRequiresLLMKeys(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Stub CP returns a CP response WITHOUT any of the required
|
||||
// LLM keys — simulates the failure mode where the CP side
|
||||
// dropped or never had the LLM creds for this org.
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
fmt.Fprint(w, `{"MOLECULE_CP_SHARED_SECRET":"x","MOLECULE_CP_URL":"https://api.moleculesai.app"}`)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-managed-1")
|
||||
t.Setenv("ADMIN_TOKEN", "admin-tok")
|
||||
t.Setenv("MOLECULE_CP_URL", srv.URL)
|
||||
// Clear all LLM keys to simulate the boot-without-LLM-env failure mode.
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "")
|
||||
|
||||
// refreshEnvFromCP itself should succeed — CP is reachable, returned 200.
|
||||
if err := refreshEnvFromCP(); err != nil {
|
||||
t.Fatalf("refreshEnvFromCP: %v", err)
|
||||
}
|
||||
// The boot assertion must catch the missing LLM keys.
|
||||
err := assertManagedTenantHasLLMEnv()
|
||||
if err == nil {
|
||||
t.Fatal("expected MISSING_CP_LLM_ENV error for managed tenant without LLM keys, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "MISSING_CP_LLM_ENV") {
|
||||
t.Errorf("expected error to contain MISSING_CP_LLM_ENV, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_ManagedTenantHappyPath: when the CP returns
|
||||
// all 4 LLM-proxy keys, the gate must PASS — no MISSING_CP_LLM_ENV
|
||||
// for a properly-configured managed tenant. Watch-fail counterpart
|
||||
// to TestRefreshEnvFromCP_ManagedTenantRequiresLLMKeys: if THIS test
|
||||
// ever fires MISSING_CP_LLM_ENV on the byte-correct key set, the
|
||||
// requiredLLMEnvVars list has drifted from the CP emission again.
|
||||
// Per Researcher REQUEST_CHANGES TEST ADEQUACY note.
|
||||
func TestRefreshEnvFromCP_ManagedTenantHappyPath(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
// Return ALL 4 LLM-proxy keys — names byte-matched to
|
||||
// tenant_config.go:140-144 CP emission.
|
||||
fmt.Fprint(w, `{"MOLECULE_LLM_USAGE_TOKEN":"tok-1","MOLECULE_LLM_USAGE_URL":"https://llm.example.com/usage","MOLECULE_LLM_BASE_URL":"https://llm.example.com","MOLECULE_LLM_ANTHROPIC_BASE_URL":"https://llm.example.com/anthropic"}`)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-managed-happy")
|
||||
t.Setenv("ADMIN_TOKEN", "admin-tok")
|
||||
t.Setenv("MOLECULE_CP_URL", srv.URL)
|
||||
// Pre-clear so we can verify the refresh actually populated them.
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "")
|
||||
|
||||
if err := refreshEnvFromCP(); err != nil {
|
||||
t.Fatalf("refreshEnvFromCP: %v", err)
|
||||
}
|
||||
// Sanity: refresh actually applied the keys.
|
||||
if got := os.Getenv("MOLECULE_LLM_USAGE_TOKEN"); got != "tok-1" {
|
||||
t.Errorf("refresh did not apply USAGE_TOKEN: got %q", got)
|
||||
}
|
||||
// The boot assertion must pass — no MISSING_CP_LLM_ENV.
|
||||
if err := assertManagedTenantHasLLMEnv(); err != nil {
|
||||
t.Errorf("managed happy path must not MISSING_CP_LLM_ENV, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_ManagedTenantPartialEnv: when the CP returns
|
||||
// 3 of 4 LLM-proxy keys (one missing), the gate must STILL catch it
|
||||
// and the error must name the missing key. Per Researcher
|
||||
// REQUEST_CHANGES TEST ADEQUACY note — partial-env coverage is
|
||||
// critical because the production failure mode is usually "one
|
||||
// key dropped" not "all keys dropped".
|
||||
func TestRefreshEnvFromCP_ManagedTenantPartialEnv(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
// 3 of 4 — MOLECULE_LLM_ANTHROPIC_BASE_URL is missing.
|
||||
fmt.Fprint(w, `{"MOLECULE_LLM_USAGE_TOKEN":"tok-1","MOLECULE_LLM_USAGE_URL":"https://llm.example.com/usage","MOLECULE_LLM_BASE_URL":"https://llm.example.com"}`)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-managed-partial")
|
||||
t.Setenv("ADMIN_TOKEN", "admin-tok")
|
||||
t.Setenv("MOLECULE_CP_URL", srv.URL)
|
||||
// Pre-clear all 4 so the 3 that come back from CP are the only
|
||||
// ones set; the 4th (MOLECULE_LLM_ANTHROPIC_BASE_URL) stays empty.
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "")
|
||||
|
||||
if err := refreshEnvFromCP(); err != nil {
|
||||
t.Fatalf("refreshEnvFromCP: %v", err)
|
||||
}
|
||||
err := assertManagedTenantHasLLMEnv()
|
||||
if err == nil {
|
||||
t.Fatal("expected MISSING_CP_LLM_ENV for partial env (3 of 4 keys), got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "MISSING_CP_LLM_ENV") {
|
||||
t.Errorf("expected error to contain MISSING_CP_LLM_ENV, got: %v", err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "MOLECULE_LLM_ANTHROPIC_BASE_URL") {
|
||||
t.Errorf("expected error to name the missing key MOLECULE_LLM_ANTHROPIC_BASE_URL, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAssertManagedTenantHasLLMEnv_NotManagedIsNoop: self-hosted
|
||||
// (no orgID/adminToken) must NOT block on missing LLM keys — dev
|
||||
// ergonomics matter and the assertion's contract is "managed only".
|
||||
func TestAssertManagedTenantHasLLMEnv_NotManagedIsNoop(t *testing.T) {
|
||||
t.Setenv("MOLECULE_ORG_ID", "")
|
||||
t.Setenv("ADMIN_TOKEN", "")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "")
|
||||
if err := assertManagedTenantHasLLMEnv(); err != nil {
|
||||
t.Errorf("self-hosted (not managed) must not block, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_CPUnreachableDoesNotFailBoot: network errors must
|
||||
// return non-nil BUT main.go treats that as warn-and-continue. We assert
|
||||
// the function returns an error (not a panic) so the caller can log.
|
||||
|
||||
@@ -82,16 +82,6 @@ func main() {
|
||||
log.Printf("CP env refresh: %v (continuing with baked-in env)", err)
|
||||
}
|
||||
|
||||
// Managed-tenant boot assertion (cp#469 — tenant proxy-env delivery).
|
||||
// If we're a managed SaaS tenant (orgID + adminToken set), all required
|
||||
// LLM proxy env vars must be present after refresh. Missing keys block
|
||||
// the tenant from booting with broken LLM creds — silent-fail is worse
|
||||
// than a loud refusal. Self-hosted (no orgID/adminToken) short-circuits
|
||||
// inside the assertion, so this never fires for dev.
|
||||
if err := assertManagedTenantHasLLMEnv(); err != nil {
|
||||
log.Fatalf("Managed tenant boot assertion: %v", err)
|
||||
}
|
||||
|
||||
// Secrets encryption. In MOLECULE_ENV=prod, boot refuses to start
|
||||
// without a valid SECRETS_ENCRYPTION_KEY (fail-secure — Top-5 #5).
|
||||
// In any other environment, missing keys just log a warning and
|
||||
@@ -337,25 +327,6 @@ func main() {
|
||||
})
|
||||
}
|
||||
|
||||
// CP-mode instance-state reconciler — authoritative EC2-liveness pass
|
||||
// for SaaS workspaces (core#2261). Every other liveness sweep keys off
|
||||
// a PROXY (Redis TTL, agent heartbeat, local Docker, or
|
||||
// runtime='external'); a SaaS claude-code workspace whose EC2 was
|
||||
// terminated/stopped falls through ALL of them and stays status='online'
|
||||
// pointing at a dead instance_id forever (root cause: core#2247). This
|
||||
// loop asks the ONE authoritative question the others lack —
|
||||
// cpProv.IsRunning (CP DescribeInstances-equivalent) — for each online
|
||||
// SaaS row, and on a CLEAN "not running" feeds it into the SAME
|
||||
// onWorkspaceOffline closure the other sweeps use (status flip +
|
||||
// RestartByID reprovision, existing volume). Fail-safe: IsRunning is
|
||||
// (true, err) on any transient error, so a CP blip never flips a healthy
|
||||
// workspace.
|
||||
if cpProv != nil {
|
||||
go supervised.RunWithRecover(ctx, "cp-instance-reconciler", func(c context.Context) {
|
||||
registry.StartCPInstanceReconciler(c, cpProv, onWorkspaceOffline, 60*time.Second)
|
||||
})
|
||||
}
|
||||
|
||||
// Pending-uploads GC sweep — deletes acked rows past their retention
|
||||
// window plus unacked rows past expires_at. Without this the
|
||||
// pending_uploads table grows unbounded; even with the 24h hard TTL,
|
||||
@@ -388,6 +359,7 @@ func main() {
|
||||
// (WorkspaceHandler.BootstrapFailed) wires its own capture inline.
|
||||
registry.BootFailureRescueHook = handlers.BootFailureRescueHook
|
||||
|
||||
|
||||
// Provision-timeout sweep — flips workspaces that have been stuck in
|
||||
// status='provisioning' past the timeout window to 'failed' and emits
|
||||
// WORKSPACE_PROVISION_TIMEOUT. Without this the UI banner is cosmetic
|
||||
@@ -474,12 +446,12 @@ func main() {
|
||||
|
||||
// HTTP server with graceful shutdown.
|
||||
//
|
||||
// Bind host: in local dev (MOLECULE_ENV=dev|development) default the
|
||||
// listener to loopback as defense-in-depth — a dev box shouldn't be
|
||||
// reachable from the LAN. This is NOT an auth lever (auth is fail-closed
|
||||
// in every env now); it's strictly the safer default. Operators who need
|
||||
// LAN exposure set BIND_ADDR=0.0.0.0 explicitly. Production binds all
|
||||
// interfaces (existing shape). See molecule-core#7.
|
||||
// Bind host: in dev-mode (no ADMIN_TOKEN, MOLECULE_ENV=dev|development)
|
||||
// the AdminAuth chain fails open by design; pairing that with a wildcard
|
||||
// bind would expose unauth /workspaces to any same-LAN peer. Default to
|
||||
// loopback when fail-open is active. Operators who need LAN exposure set
|
||||
// BIND_ADDR=0.0.0.0 explicitly. Production (ADMIN_TOKEN set) is unchanged.
|
||||
// See molecule-core#7.
|
||||
bindHost := resolveBindHost()
|
||||
srv := &http.Server{
|
||||
Addr: fmt.Sprintf("%s:%s", bindHost, port),
|
||||
@@ -489,7 +461,7 @@ func main() {
|
||||
|
||||
// Start server in goroutine
|
||||
go func() {
|
||||
log.Printf("Platform starting on %s:%s (local-dev-env=%v)", bindHost, port, middleware.IsLocalDevEnv())
|
||||
log.Printf("Platform starting on %s:%s (dev-mode-fail-open=%v)", bindHost, port, middleware.IsDevModeFailOpen())
|
||||
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
||||
log.Fatalf("Server failed: %v", err)
|
||||
}
|
||||
@@ -528,20 +500,20 @@ func envOr(key, fallback string) string {
|
||||
//
|
||||
// Precedence:
|
||||
// 1. BIND_ADDR — explicit operator override (any value, including "0.0.0.0").
|
||||
// 2. local dev (MOLECULE_ENV=dev|development) → "127.0.0.1" (loopback only).
|
||||
// 2. dev-mode fail-open active → "127.0.0.1" (loopback only).
|
||||
// 3. otherwise → "" (Go binds every interface; existing prod/self-host shape).
|
||||
//
|
||||
// NOTE (harden/no-fail-open-auth): this is a defense-in-depth default, NOT an
|
||||
// auth lever. Auth is fail-closed in every environment now, so the loopback
|
||||
// default no longer compensates for a weak auth chain — it simply keeps a dev
|
||||
// box off the LAN by default. It is keyed on MOLECULE_ENV alone (decoupled
|
||||
// from ADMIN_TOKEN), because dev now provisions an ADMIN_TOKEN yet should
|
||||
// still default to loopback. See molecule-core#7 for the original LAN finding.
|
||||
// Coupling the loopback default to middleware.IsDevModeFailOpen() means the
|
||||
// two safety levers — bind narrowness and auth strength — move together. A
|
||||
// production deploy (ADMIN_TOKEN set) keeps binding to all interfaces because
|
||||
// the auth chain is doing its job; a dev Mac (no ADMIN_TOKEN, MOLECULE_ENV=dev)
|
||||
// is reachable only via loopback because the auth chain is fail-open. See
|
||||
// molecule-core#7 for the original LAN exposure finding.
|
||||
func resolveBindHost() string {
|
||||
if v := os.Getenv("BIND_ADDR"); v != "" {
|
||||
return v
|
||||
}
|
||||
if middleware.IsLocalDevEnv() {
|
||||
if middleware.IsDevModeFailOpen() {
|
||||
return "127.0.0.1"
|
||||
}
|
||||
return ""
|
||||
|
||||
@@ -149,11 +149,9 @@ func markFailed(ctx context.Context, wsID string, broadcaster *events.Broadcaste
|
||||
models.StatusFailed, msg, wsID); dbErr != nil {
|
||||
log.Printf("bundle import: failed to mark workspace %s as failed: %v", wsID, dbErr)
|
||||
}
|
||||
if bcErr := broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisionFailed), wsID, map[string]interface{}{
|
||||
broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisionFailed), wsID, map[string]interface{}{
|
||||
"error": msg,
|
||||
}); bcErr != nil {
|
||||
log.Printf("bundle import: failed to broadcast provision failed for %s: %v", wsID, bcErr)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func nilIfEmpty(s string) interface{} {
|
||||
|
||||
@@ -407,14 +407,12 @@ func (m *Manager) HandleInbound(ctx context.Context, ch ChannelRow, msg *Inbound
|
||||
|
||||
// Broadcast event
|
||||
if m.broadcaster != nil {
|
||||
if err := m.broadcaster.RecordAndBroadcast(ctx, string(events.EventChannelMessage), ch.WorkspaceID, map[string]interface{}{
|
||||
m.broadcaster.RecordAndBroadcast(ctx, string(events.EventChannelMessage), ch.WorkspaceID, map[string]interface{}{
|
||||
"channel_id": ch.ID,
|
||||
"channel_type": ch.ChannelType,
|
||||
"username": msg.Username,
|
||||
"direction": "inbound",
|
||||
}); err != nil {
|
||||
log.Printf("Channels: failed to broadcast inbound event: %v", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -455,13 +453,11 @@ func (m *Manager) SendOutbound(ctx context.Context, channelID string, text strin
|
||||
}
|
||||
|
||||
if m.broadcaster != nil {
|
||||
if err := m.broadcaster.RecordAndBroadcast(ctx, string(events.EventChannelMessage), ch.WorkspaceID, map[string]interface{}{
|
||||
m.broadcaster.RecordAndBroadcast(ctx, string(events.EventChannelMessage), ch.WorkspaceID, map[string]interface{}{
|
||||
"channel_id": ch.ID,
|
||||
"channel_type": ch.ChannelType,
|
||||
"direction": "outbound",
|
||||
}); err != nil {
|
||||
log.Printf("Channels: failed to broadcast outbound event: %v", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
@@ -517,9 +517,7 @@ func (t *TelegramAdapter) StartPolling(ctx context.Context, config map[string]in
|
||||
|
||||
// Acknowledge the button press (removes loading spinner)
|
||||
ackCfg := tgbotapi.NewCallback(cb.ID, "Received")
|
||||
if _, err := bot.Send(ackCfg); err != nil {
|
||||
log.Printf("telegram: failed to send callback ack: %v", err)
|
||||
}
|
||||
bot.Send(ackCfg)
|
||||
|
||||
// Update the message to show what was clicked
|
||||
decision := "approved"
|
||||
@@ -531,9 +529,7 @@ func (t *TelegramAdapter) StartPolling(ctx context.Context, config map[string]in
|
||||
cb.Message.MessageID,
|
||||
cb.Message.Text+"\n\n✅ CEO "+decision,
|
||||
)
|
||||
if _, err := bot.Send(editMsg); err != nil {
|
||||
log.Printf("telegram: failed to send edit message: %v", err)
|
||||
}
|
||||
bot.Send(editMsg)
|
||||
|
||||
// Route the decision as an inbound message to the agent
|
||||
inbound := &InboundMessage{
|
||||
|
||||
@@ -1,141 +0,0 @@
|
||||
package handlers
|
||||
|
||||
// a2a_outbound_envelope_test.go — outbound A2A `message/send` envelope
|
||||
// CONTRACT gate (issue #2251).
|
||||
//
|
||||
// #2251: an outbound A2A envelope shipped without `role` and with text
|
||||
// parts keyed `type` instead of the v0.3-canonical `kind`. The receiver's
|
||||
// a-2-a-sdk v0.3 Pydantic validator silently rejected the message
|
||||
// post-dispatch — the sender saw a happy 200/202 while the brief was
|
||||
// dropped (the same invisible-rejection failure class as the v0.2→v0.3
|
||||
// content bug pinned by a2a_corpus_test.go, but on the SEND side).
|
||||
//
|
||||
// The inbound corpus replay (a2a_corpus_test.go) proves normalizeA2APayload
|
||||
// produces `parts[].kind` + a non-empty messageId, but it does NOT assert
|
||||
// `role`, and it only covers what we RECEIVE. Nothing pins what core
|
||||
// EMITS. This file pins the emit contract at the helper that builds the
|
||||
// parts (buildA2AMessageParts, used by both delegate_task and
|
||||
// delegate_task_async) and asserts the canonical Part key is `kind`.
|
||||
//
|
||||
// Part-object schema (A2A v0.3): every Part MUST carry a `kind`
|
||||
// discriminator ("text" | "file" | "data"); there is NO `type` key. A
|
||||
// text Part is {"kind":"text","text":"..."}. Emitting `type` makes the
|
||||
// v0.3 validator drop the Part.
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestBuildA2AMessageParts_TextPartUsesKindNotType pins the v0.3 Part
|
||||
// discriminator for the text part emitted on every outbound A2A
|
||||
// delegation. RED before #2251's fix (the helper emitted
|
||||
// {"type":"text",...}); the receiver's v0.3 Pydantic validator drops a
|
||||
// Part keyed `type`, silently losing the task text.
|
||||
func TestBuildA2AMessageParts_TextPartUsesKindNotType(t *testing.T) {
|
||||
parts := buildA2AMessageParts("do the work", nil)
|
||||
if len(parts) == 0 {
|
||||
t.Fatal("buildA2AMessageParts returned no parts for a non-empty task")
|
||||
}
|
||||
text := parts[0]
|
||||
|
||||
if _, hasType := text["type"]; hasType {
|
||||
t.Errorf("text part uses forbidden v0.2 key `type` %v — A2A v0.3 Parts discriminate on `kind`; `type` is dropped by the receiver's validator (#2251)", text)
|
||||
}
|
||||
kind, ok := text["kind"].(string)
|
||||
if !ok {
|
||||
t.Fatalf("text part missing string `kind` discriminator; got %v", text)
|
||||
}
|
||||
if kind != "text" {
|
||||
t.Errorf("text part kind = %q, want \"text\"", kind)
|
||||
}
|
||||
if text["text"] != "do the work" {
|
||||
t.Errorf("text part text = %v, want \"do the work\"", text["text"])
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildA2AMessageParts_FilePartUsesKind guards the file-attachment
|
||||
// Part the same way. The file path was already correct (it used `kind`),
|
||||
// so this is a non-regression pin — it must STAY `kind` when the text
|
||||
// path is fixed (a careless "make them consistent" edit could flip both
|
||||
// to the wrong key).
|
||||
func TestBuildA2AMessageParts_FilePartUsesKind(t *testing.T) {
|
||||
atts := []AgentMessageAttachment{
|
||||
{URI: "https://example.com/a.png", MimeType: "image/png", Name: "a.png"},
|
||||
}
|
||||
parts := buildA2AMessageParts("caption", atts)
|
||||
if len(parts) < 2 {
|
||||
t.Fatalf("expected text + file parts, got %d", len(parts))
|
||||
}
|
||||
file := parts[1]
|
||||
if _, hasType := file["type"]; hasType {
|
||||
t.Errorf("file part uses forbidden `type` key: %v", file)
|
||||
}
|
||||
if _, hasKind := file["kind"]; !hasKind {
|
||||
t.Errorf("file part missing `kind` discriminator: %v", file)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDelegationOutboundEnvelope_RoleAndKind pins the FULL outbound
|
||||
// envelope contract — role + parts[].kind — on the canonical helper.
|
||||
// A v0.3 `message` MUST carry `role` ("user" for a delegation request)
|
||||
// and `parts` whose every entry discriminates on `kind`. This is the
|
||||
// shape the receiver's MessageSendParams validator accepts; an envelope
|
||||
// missing `role` or keyed `type` is silently rejected (#2251).
|
||||
//
|
||||
// Built from the same primitives delegation.go / mcp_tools.go assemble
|
||||
// (role:"user" + buildA2AMessageParts) so the round-trip through
|
||||
// json.Marshal proves the wire bytes are v0.3-valid.
|
||||
func TestDelegationOutboundEnvelope_RoleAndKind(t *testing.T) {
|
||||
envelope := map[string]interface{}{
|
||||
"method": "message/send",
|
||||
"params": map[string]interface{}{
|
||||
"message": map[string]interface{}{
|
||||
"role": "user",
|
||||
"messageId": "deleg-1",
|
||||
"parts": buildA2AMessageParts("do the work", nil),
|
||||
},
|
||||
},
|
||||
}
|
||||
raw, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
var parsed map[string]interface{}
|
||||
if err := json.Unmarshal(raw, &parsed); err != nil {
|
||||
t.Fatalf("unmarshal envelope: %v", err)
|
||||
}
|
||||
|
||||
params, _ := parsed["params"].(map[string]interface{})
|
||||
if params == nil {
|
||||
t.Fatal("envelope missing params")
|
||||
}
|
||||
msg, _ := params["message"].(map[string]interface{})
|
||||
if msg == nil {
|
||||
t.Fatal("envelope missing params.message")
|
||||
}
|
||||
|
||||
// role is mandatory on a v0.3 message — the receiver rejects without it.
|
||||
role, hasRole := msg["role"].(string)
|
||||
if !hasRole || role == "" {
|
||||
t.Errorf("params.message missing non-empty `role` — v0.3 requires it; omitting it is the other half of #2251")
|
||||
}
|
||||
|
||||
parts, _ := msg["parts"].([]interface{})
|
||||
if len(parts) == 0 {
|
||||
t.Fatal("params.message.parts is empty")
|
||||
}
|
||||
for i, p := range parts {
|
||||
pm, _ := p.(map[string]interface{})
|
||||
if pm == nil {
|
||||
t.Errorf("part %d is not an object: %v", i, p)
|
||||
continue
|
||||
}
|
||||
if _, hasType := pm["type"]; hasType {
|
||||
t.Errorf("part %d uses forbidden `type` key (must be `kind`): %v", i, pm)
|
||||
}
|
||||
if _, hasKind := pm["kind"]; !hasKind {
|
||||
t.Errorf("part %d missing `kind` discriminator: %v", i, pm)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -801,18 +801,6 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
|
||||
if _, hasID := msg["messageId"]; !hasID {
|
||||
msg["messageId"] = uuid.New().String()
|
||||
}
|
||||
// #2251: default params.message.role to "user" when absent.
|
||||
// The downstream a2a-sdk v0.3 Pydantic validator marks role a
|
||||
// REQUIRED field; a role-less envelope fails parse with
|
||||
// "params.message.role Field required". The Go builders
|
||||
// (mcp_tools/delegation/scheduler/channels) already set it, but
|
||||
// raw external/canvas POSTs to ProxyA2A may omit it — making this
|
||||
// the single canonical choke that guarantees a schema-valid role.
|
||||
// Mirror the messageId default exactly: inject only when missing,
|
||||
// never overwrite a caller-supplied role (e.g. "agent").
|
||||
if _, hasRole := msg["role"]; !hasRole {
|
||||
msg["role"] = "user"
|
||||
}
|
||||
_, hasParts := msg["parts"]
|
||||
rawContent, hasContent := msg["content"]
|
||||
if !hasParts {
|
||||
@@ -844,27 +832,6 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
|
||||
}
|
||||
}
|
||||
}
|
||||
// #2251: wire hygiene — the A2A v0.3 Part discriminator is
|
||||
// "kind", but some builders/clients emit the legacy "type" key
|
||||
// (e.g. delegation.go). The v0.3 Pydantic validator keys on
|
||||
// "kind"; a stray "type" leaves the Part untagged. Rename
|
||||
// "type" → "kind" on every Part that lacks an explicit "kind"
|
||||
// so the discriminator is always present on the wire.
|
||||
if parts, ok := msg["parts"].([]interface{}); ok {
|
||||
for _, p := range parts {
|
||||
part, ok := p.(map[string]interface{})
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if _, hasKind := part["kind"]; hasKind {
|
||||
continue
|
||||
}
|
||||
if t, hasType := part["type"]; hasType {
|
||||
part["kind"] = t
|
||||
delete(part, "type")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -383,48 +383,23 @@ func (h *WorkspaceHandler) logA2ASuccess(ctx context.Context, workspaceID, calle
|
||||
}
|
||||
summary := a2aMethod + " → " + wsNameForLog
|
||||
toolTrace := extractToolTrace(respBody)
|
||||
|
||||
// DATA-LOSS FIX (internal#470 / #1347 push-mode sibling): this
|
||||
// a2a_receive row is the ONLY durable record of a push-mode chat
|
||||
// round-trip — request_body carries the user's message, response_body
|
||||
// carries the agent's reply, and chat-history hydration
|
||||
// (messagestore.PostgresMessageStore) reads BOTH back to rebuild the
|
||||
// transcript on canvas reopen / reload. It MUST be written
|
||||
// SYNCHRONOUSLY, before proxyA2ARequest returns and ProxyA2A flushes
|
||||
// the 200 to the canvas — otherwise the canvas sees the reply
|
||||
// acknowledged (and rendered optimistically) while the row is still
|
||||
// racing in a detached goroutine, and a reload (or a workspace-server
|
||||
// restart / deploy / OOM) between the 200 and the goroutine's commit
|
||||
// loses the message permanently on reopen.
|
||||
//
|
||||
// This mirrors the discipline already applied to the poll-mode ingest
|
||||
// path (logA2AReceiveQueued / persistUserMessageAtIngest); the
|
||||
// push-mode counterpart was left async, which the E2E Chat
|
||||
// "history persists across reload" test surfaced as an intermittent
|
||||
// red (the reload out-raced the INSERT).
|
||||
//
|
||||
// - context.WithoutCancel: a client disconnect on chat-exit (which
|
||||
// cancels the inbound request ctx) MUST NOT abort this write.
|
||||
// - SYNCHRONOUS (no goAsync): the row must be durable before the 200.
|
||||
// - Best-effort: LogActivity logs+swallows INSERT errors internally,
|
||||
// so a DB hiccup never blocks or fails the user's send — behaviour
|
||||
// for that one request is never worse than the pre-fix async path.
|
||||
// - The post-commit ACTIVITY_LOGGED broadcast still fires inside
|
||||
// LogActivity; the durable row is the truth the canvas re-reads.
|
||||
logCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
|
||||
defer cancel()
|
||||
LogActivity(logCtx, h.broadcaster, ActivityParams{
|
||||
WorkspaceID: workspaceID,
|
||||
ActivityType: "a2a_receive",
|
||||
SourceID: nilIfEmpty(callerID),
|
||||
TargetID: &workspaceID,
|
||||
Method: &a2aMethod,
|
||||
Summary: &summary,
|
||||
RequestBody: json.RawMessage(body),
|
||||
ResponseBody: json.RawMessage(respBody),
|
||||
ToolTrace: toolTrace,
|
||||
DurationMs: &durationMs,
|
||||
Status: logStatus,
|
||||
parent := ctx
|
||||
h.goAsync(func() {
|
||||
logCtx, cancel := context.WithTimeout(context.WithoutCancel(parent), 30*time.Second)
|
||||
defer cancel()
|
||||
LogActivity(logCtx, h.broadcaster, ActivityParams{
|
||||
WorkspaceID: workspaceID,
|
||||
ActivityType: "a2a_receive",
|
||||
SourceID: nilIfEmpty(callerID),
|
||||
TargetID: &workspaceID,
|
||||
Method: &a2aMethod,
|
||||
Summary: &summary,
|
||||
RequestBody: json.RawMessage(body),
|
||||
ResponseBody: json.RawMessage(respBody),
|
||||
ToolTrace: toolTrace,
|
||||
DurationMs: &durationMs,
|
||||
Status: logStatus,
|
||||
})
|
||||
})
|
||||
|
||||
if callerID == "" && statusCode < 400 {
|
||||
|
||||
@@ -1514,142 +1514,6 @@ func TestNormalizeA2APayload_NoMessageNoCheck(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// --- #2251: role default + part-kind hygiene contract tests ---
|
||||
//
|
||||
// These assert normalizeA2APayload is the single canonical Go choke that
|
||||
// guarantees a schema-valid outbound message/send envelope: it injects a
|
||||
// default params.message.role="user" when the sender omitted role (the bug
|
||||
// that made delegate_task fail the peer's a2a Pydantic validator with
|
||||
// "params.message.role Field required" while reply_to_workspace worked), and
|
||||
// it renames the legacy Part discriminator "type"→"kind" for wire hygiene.
|
||||
|
||||
// normMsg is a small helper that runs normalizeA2APayload and returns the
|
||||
// resolved params.message map, failing the test on any normalization error.
|
||||
func normMsg(t *testing.T, raw string) map[string]interface{} {
|
||||
t.Helper()
|
||||
out, _, perr := normalizeA2APayload([]byte(raw))
|
||||
if perr != nil {
|
||||
t.Fatalf("normalizeA2APayload returned error: %+v", perr)
|
||||
}
|
||||
var parsed map[string]interface{}
|
||||
if err := json.Unmarshal(out, &parsed); err != nil {
|
||||
t.Fatalf("output not valid JSON: %v", err)
|
||||
}
|
||||
params, ok := parsed["params"].(map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("output missing params object: %s", string(out))
|
||||
}
|
||||
msg, ok := params["message"].(map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("output missing params.message object: %s", string(out))
|
||||
}
|
||||
return msg
|
||||
}
|
||||
|
||||
func TestNormalizeA2APayload_DefaultsRoleWhenMissing(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
raw string
|
||||
}{
|
||||
{
|
||||
name: "v0.3 parts, no role",
|
||||
raw: `{"method":"message/send","params":{"message":{"parts":[{"kind":"text","text":"hi"}]}}}`,
|
||||
},
|
||||
{
|
||||
name: "v0.2 string content, no role",
|
||||
raw: `{"method":"message/send","params":{"message":{"content":"hi"}}}`,
|
||||
},
|
||||
{
|
||||
name: "legacy type part, no role",
|
||||
raw: `{"method":"message/send","params":{"message":{"parts":[{"type":"text","text":"hi"}]}}}`,
|
||||
},
|
||||
{
|
||||
name: "already wrapped jsonrpc, no role",
|
||||
raw: `{"jsonrpc":"2.0","id":"x","method":"message/send","params":{"message":{"parts":[{"kind":"text","text":"hi"}]}}}`,
|
||||
},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
msg := normMsg(t, tc.raw)
|
||||
if msg["role"] != "user" {
|
||||
t.Errorf("expected role defaulted to \"user\", got %v", msg["role"])
|
||||
}
|
||||
// Parts must remain valid (non-empty) after normalization.
|
||||
parts, ok := msg["parts"].([]interface{})
|
||||
if !ok || len(parts) == 0 {
|
||||
t.Fatalf("expected non-empty parts after normalization, got %v", msg["parts"])
|
||||
}
|
||||
// Every part must carry the v0.3 "kind" discriminator.
|
||||
for i, p := range parts {
|
||||
part, ok := p.(map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("part %d is not an object: %v", i, p)
|
||||
}
|
||||
if _, hasKind := part["kind"]; !hasKind {
|
||||
t.Errorf("part %d missing \"kind\" discriminator: %v", i, part)
|
||||
}
|
||||
if _, hasType := part["type"]; hasType {
|
||||
t.Errorf("part %d still has legacy \"type\" key: %v", i, part)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeA2APayload_PreservesExplicitRole(t *testing.T) {
|
||||
// A caller-supplied role (e.g. "agent") must NOT be overwritten with "user".
|
||||
msg := normMsg(t, `{"method":"message/send","params":{"message":{"role":"agent","parts":[{"kind":"text","text":"hi"}]}}}`)
|
||||
if msg["role"] != "agent" {
|
||||
t.Errorf("explicit role overwritten: expected \"agent\", got %v", msg["role"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeA2APayload_RenamesPartTypeToKind(t *testing.T) {
|
||||
// Mirrors delegation.go's builder which emits {"type":"text",...}. After
|
||||
// normalization the wire Part must be discriminated by "kind".
|
||||
msg := normMsg(t, `{"method":"message/send","params":{"message":{"role":"user","parts":[{"type":"text","text":"a"},{"type":"file","uri":"workspace:/x"}]}}}`)
|
||||
parts := msg["parts"].([]interface{})
|
||||
if len(parts) != 2 {
|
||||
t.Fatalf("expected 2 parts, got %d", len(parts))
|
||||
}
|
||||
wantKind := []string{"text", "file"}
|
||||
for i, p := range parts {
|
||||
part := p.(map[string]interface{})
|
||||
if part["kind"] != wantKind[i] {
|
||||
t.Errorf("part %d: expected kind=%q, got %v", i, wantKind[i], part["kind"])
|
||||
}
|
||||
if _, hasType := part["type"]; hasType {
|
||||
t.Errorf("part %d still carries legacy \"type\": %v", i, part)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeA2APayload_DoesNotClobberKindWithType(t *testing.T) {
|
||||
// If a part has BOTH kind and type, kind wins and is left untouched.
|
||||
msg := normMsg(t, `{"method":"message/send","params":{"message":{"role":"user","parts":[{"kind":"text","type":"ignored","text":"a"}]}}}`)
|
||||
part := msg["parts"].([]interface{})[0].(map[string]interface{})
|
||||
if part["kind"] != "text" {
|
||||
t.Errorf("expected kind preserved as \"text\", got %v", part["kind"])
|
||||
}
|
||||
}
|
||||
|
||||
// TestNormalizeA2APayload_RoleDefault_ContractRegression documents the
|
||||
// pre-fix failure: without the role default, a role-less message/send body
|
||||
// emerged from normalization still missing params.message.role, which the
|
||||
// peer's a2a Pydantic validator rejects. This asserts the POST-fix invariant
|
||||
// (role present) directly; before the a2a_proxy.go change this assertion
|
||||
// fails (role is absent → msg["role"] == nil).
|
||||
func TestNormalizeA2APayload_RoleDefault_ContractRegression(t *testing.T) {
|
||||
msg := normMsg(t, `{"method":"message/send","params":{"message":{"parts":[{"kind":"text","text":"delegate this"}]}}}`)
|
||||
role, hasRole := msg["role"]
|
||||
if !hasRole {
|
||||
t.Fatal("REGRESSION (#2251): params.message.role absent after normalization — peer a2a validator will reject with 'role Field required'")
|
||||
}
|
||||
if role != "user" {
|
||||
t.Errorf("expected default role \"user\", got %v", role)
|
||||
}
|
||||
}
|
||||
|
||||
// --- resolveAgentURL direct unit tests ---
|
||||
|
||||
func TestResolveAgentURL_CacheHit(t *testing.T) {
|
||||
|
||||
@@ -246,20 +246,6 @@ func MarkQueueItemFailed(ctx context.Context, id, errMsg string) {
|
||||
}
|
||||
}
|
||||
|
||||
// QueueDepth returns the number of currently-queued (not dispatched/completed)
|
||||
// items for a workspace. Used by the busy-return response body so callers
|
||||
// can see how many ahead of them.
|
||||
func QueueDepth(ctx context.Context, workspaceID string) int {
|
||||
var n int
|
||||
if err := db.DB.QueryRowContext(ctx,
|
||||
`SELECT COUNT(*) FROM a2a_queue WHERE workspace_id = $1 AND status = 'queued'`,
|
||||
workspaceID,
|
||||
).Scan(&n); err != nil {
|
||||
log.Printf("A2AQueue: QueueDepth query failed for workspace %s: %v", workspaceID, err)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// DropStaleQueueItems marks queued items older than maxAge as 'dropped' with a
|
||||
// system-generated reason so PM agents stop processing stale post-incident noise.
|
||||
// Called with a workspaceID to scope cleanup to one workspace, or empty to sweep
|
||||
|
||||
@@ -60,10 +60,10 @@ func sanitizeErrorDetailForBroadcast(s string) string {
|
||||
}
|
||||
|
||||
type ActivityHandler struct {
|
||||
broadcaster events.EventEmitter
|
||||
broadcaster *events.Broadcaster
|
||||
}
|
||||
|
||||
func NewActivityHandler(b events.EventEmitter) *ActivityHandler {
|
||||
func NewActivityHandler(b *events.Broadcaster) *ActivityHandler {
|
||||
return &ActivityHandler{broadcaster: b}
|
||||
}
|
||||
|
||||
@@ -152,7 +152,7 @@ func extractAttachmentsFromMessageParts(body map[string]interface{}) []map[strin
|
||||
if kind == "" {
|
||||
kind, _ = part["type"].(string)
|
||||
}
|
||||
if kind != "file" && kind != "image" && kind != "audio" && kind != "video" {
|
||||
if kind != "file" && kind != "image" && kind != "audio" {
|
||||
continue
|
||||
}
|
||||
// The file sub-object holds uri/mime_type/name. The a2a-sdk v1
|
||||
@@ -380,18 +380,12 @@ func (h *ActivityHandler) List(c *gin.Context) {
|
||||
// "row not found" — both indicate the cursor is no longer usable for
|
||||
// this caller, no information leak.
|
||||
var cursorTime time.Time
|
||||
var cursorSeq int64
|
||||
usingCursor := false
|
||||
if sinceID != "" {
|
||||
// Resolve BOTH ordering-key components of the cursor row. The feed is
|
||||
// ordered by (created_at, seq), so the strictly-after filter below must
|
||||
// compare the full tuple — comparing created_at alone silently drops a
|
||||
// row written in the SAME microsecond as the cursor row (the boundary
|
||||
// skip the since_id E2E intermittently tripped over).
|
||||
err := db.DB.QueryRowContext(c.Request.Context(),
|
||||
`SELECT created_at, seq FROM activity_logs WHERE id = $1 AND workspace_id = $2`,
|
||||
`SELECT created_at FROM activity_logs WHERE id = $1 AND workspace_id = $2`,
|
||||
sinceID, workspaceID,
|
||||
).Scan(&cursorTime, &cursorSeq)
|
||||
).Scan(&cursorTime)
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
c.JSON(http.StatusGone, gin.H{
|
||||
"error": "since_id cursor not found (row may have been pruned or belongs to a different workspace); omit since_id to reset",
|
||||
@@ -498,20 +492,10 @@ func (h *ActivityHandler) List(c *gin.Context) {
|
||||
argIdx++
|
||||
}
|
||||
if usingCursor {
|
||||
// Strictly after the cursor on the FULL ordering key (created_at, seq).
|
||||
// Tuple comparison: a row is "after" the cursor if its created_at is
|
||||
// later, OR it shares the cursor's created_at but has a higher seq.
|
||||
// This (a) never replays the cursor row itself and (b) — unlike a bare
|
||||
// `created_at > cursor` — never drops a row written in the same
|
||||
// microsecond as the cursor row. Expressed as the expanded boolean
|
||||
// rather than a row-value `(created_at, seq) > ($t, $s)` so it composes
|
||||
// with the actCol qualifier prefix and the existing placeholder/arg
|
||||
// builder cleanly.
|
||||
query += fmt.Sprintf(
|
||||
" AND ("+actCol+"created_at > $%d OR ("+actCol+"created_at = $%d AND "+actCol+"seq > $%d))",
|
||||
argIdx, argIdx, argIdx+1)
|
||||
args = append(args, cursorTime, cursorSeq)
|
||||
argIdx += 2
|
||||
// Strictly after — never replay the cursor row itself.
|
||||
query += fmt.Sprintf(" AND "+actCol+"created_at > $%d", argIdx)
|
||||
args = append(args, cursorTime)
|
||||
argIdx++
|
||||
}
|
||||
|
||||
// Polling clients (since_id) need oldest-first within the new window so
|
||||
@@ -519,13 +503,9 @@ func (h *ActivityHandler) List(c *gin.Context) {
|
||||
// since_id) keeps DESC — that's the canvas/UI shape and changing it
|
||||
// would surprise existing callers.
|
||||
if usingCursor {
|
||||
// (created_at, seq) ASC — seq is the deterministic tiebreaker for rows
|
||||
// sharing a microsecond-collided created_at. Replays in recorded order.
|
||||
query += fmt.Sprintf(" ORDER BY "+actCol+"created_at ASC, "+actCol+"seq ASC LIMIT $%d", argIdx)
|
||||
query += fmt.Sprintf(" ORDER BY "+actCol+"created_at ASC LIMIT $%d", argIdx)
|
||||
} else {
|
||||
// (created_at, seq) DESC — same tiebreaker, newest-first for the
|
||||
// canvas/recent-feed shape.
|
||||
query += fmt.Sprintf(" ORDER BY "+actCol+"created_at DESC, "+actCol+"seq DESC LIMIT $%d", argIdx)
|
||||
query += fmt.Sprintf(" ORDER BY "+actCol+"created_at DESC LIMIT $%d", argIdx)
|
||||
}
|
||||
args = append(args, limit)
|
||||
|
||||
@@ -700,8 +680,7 @@ func buildSessionSearchQuery(workspaceID, query string, limit int) (string, []in
|
||||
COALESCE(status, '') AS status,
|
||||
request_body,
|
||||
response_body,
|
||||
created_at,
|
||||
seq
|
||||
created_at
|
||||
FROM activity_logs
|
||||
WHERE workspace_id = $1
|
||||
)
|
||||
@@ -723,13 +702,7 @@ func buildSessionSearchQuery(workspaceID, query string, limit int) (string, []in
|
||||
args = append(args, "%"+query+"%")
|
||||
}
|
||||
|
||||
// Deterministic order: created_at alone is not unique (same-microsecond
|
||||
// rows), so tie-break on the monotonic seq — same fix as the since_id feed
|
||||
// (§ No flakes: no unstable sorts, even on an unused surface). `seq` is
|
||||
// projected through the session_items CTE above so this outer ORDER BY can
|
||||
// reference it — the outer SELECT can only sort on the CTE's output columns,
|
||||
// not on activity_logs directly.
|
||||
sqlQuery += ` ORDER BY created_at DESC, seq DESC LIMIT $` + strconv.Itoa(len(args)+1)
|
||||
sqlQuery += ` ORDER BY created_at DESC LIMIT $` + strconv.Itoa(len(args)+1)
|
||||
args = append(args, limit)
|
||||
return sqlQuery, args
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -118,23 +118,6 @@ func TestExtractAttachmentsFromRequestBody_ImageAndAudio(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractAttachmentsFromRequestBody_VideoPart(t *testing.T) {
|
||||
// Video parts are accepted in message-parts envelope (issue #2222).
|
||||
body := []byte(`{"jsonrpc":"2.0","method":"message/send","params":{"message":{"parts":[
|
||||
{"kind":"video","file":{"uri":"workspace:clip.mp4","mime_type":"video/mp4","name":"clip.mp4"}}
|
||||
]}}}`)
|
||||
atts := extractAttachmentsFromRequestBody(body)
|
||||
if len(atts) != 1 {
|
||||
t.Fatalf("want 1 attachment, got %d", len(atts))
|
||||
}
|
||||
if atts[0]["kind"] != "video" {
|
||||
t.Errorf("kind: want video, got %v", atts[0]["kind"])
|
||||
}
|
||||
if atts[0]["uri"] != "workspace:clip.mp4" {
|
||||
t.Errorf("uri mismatch: %v", atts[0]["uri"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractAttachmentsFromRequestBody_LegacyV0TypeDiscriminator(t *testing.T) {
|
||||
// Legacy v0 shape: type=file (not kind), inlined fields (no nested .file)
|
||||
body := []byte(`{"jsonrpc":"2.0","method":"message/send","params":{"message":{"parts":[
|
||||
|
||||
@@ -1,211 +0,0 @@
|
||||
//go:build integration
|
||||
// +build integration
|
||||
|
||||
// activity_seq_backfill_integration_test.go — REAL Postgres proof of the
|
||||
// invariant the 20260604000000_activity_logs_seq.up.sql migration guarantees:
|
||||
// every activity_logs row carries a NON-NULL `seq`, both for rows that existed
|
||||
// before the migration ran (assigned during the ALTER TABLE rewrite) and for
|
||||
// rows created afterward via the normal INSERT path (assigned by the IDENTITY
|
||||
// default). This is the coverage CR2 (#2339 review) correctly flagged as
|
||||
// missing on PR #2258.
|
||||
//
|
||||
// WHY THIS IS A SEPARATE TEST from activity_since_id_ordering_integration_test.go:
|
||||
// that test pins the *ordering* contract (same-microsecond rows come back in a
|
||||
// deterministic (created_at, seq) order). THIS test pins the *backfill* contract
|
||||
// — that `seq` is never NULL — and the consequence the reviewer doubted: a
|
||||
// pre-existing/backfilled row is usable as a since_id cursor because its seq is
|
||||
// non-null, so the tuple cursor `(created_at, seq)` the handler builds is well
|
||||
// defined for it.
|
||||
//
|
||||
// EMPIRICAL BASIS (PostgreSQL 16.13, the prod PG version):
|
||||
// - `ALTER TABLE activity_logs ADD COLUMN seq BIGINT GENERATED BY DEFAULT AS
|
||||
// IDENTITY` rewrites the table and assigns seq to EXISTING rows in physical
|
||||
// table-scan order — they are NON-NULL, not left NULL as the review claimed.
|
||||
// - The identity sequence then advances ABOVE max(seq), so the next INSERT
|
||||
// that omits seq gets max+1 with no collision.
|
||||
// Run against any Postgres 15/16 the integration harness boots — the property
|
||||
// holds on both.
|
||||
//
|
||||
// Run with (same harness as activity_delegation_a2a_integration_test.go):
|
||||
//
|
||||
// docker run --rm -d --name pg-integration \
|
||||
// -e POSTGRES_PASSWORD=test -e POSTGRES_DB=molecule \
|
||||
// -p 55432:5432 postgres:15-alpine
|
||||
// sleep 4
|
||||
// # apply migrations (incl. 20260604000000_activity_logs_seq.up.sql) then:
|
||||
// INTEGRATION_DB_URL="postgres://postgres:test@localhost:55432/molecule?sslmode=disable" \
|
||||
// go test -tags=integration ./internal/handlers/ -run Integration_ActivityLogs_Seq
|
||||
//
|
||||
// WATCH-IT-FAIL: if `seq` were left nullable / un-backfilled (the failure mode
|
||||
// the reviewer hypothesized), the NULL-count assertion in _NoNull trips, and
|
||||
// the since_id-on-a-backfilled-row case in _SinceIDOnBackfilledRow trips because
|
||||
// the handler cannot read a non-null seq for the cursor row. With the migration
|
||||
// as written both are green every run.
|
||||
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// TestIntegration_ActivityLogs_SeqBackfill_NoNull pins the core migration
|
||||
// invariant: AFTER migrations have run, NO activity_logs row may have a NULL
|
||||
// seq — neither rows that the seedActivityRowAt path inserts (IDENTITY default)
|
||||
// nor any row the schema carries. It also proves the IDENTITY sequence keeps
|
||||
// producing distinct, non-null seq for fresh inserts (no collision, no NULL).
|
||||
//
|
||||
// This is the assertion that would FAIL if the ALTER had left existing rows
|
||||
// with NULL seq (the reviewer's claim) — table-scan backfill makes it pass.
|
||||
func TestIntegration_ActivityLogs_SeqBackfill_NoNull(t *testing.T) {
|
||||
conn := integrationDB_ActivityDelegationA2A(t)
|
||||
_ = conn
|
||||
wsID := seedWorkspace(t, conn, "test-2151-seq-backfill-nonull")
|
||||
|
||||
// Insert several rows via the normal path. seq is left to the IDENTITY
|
||||
// default — exactly how production writes activity_logs.
|
||||
t0 := time.Date(2026, 6, 4, 9, 0, 0, 0, time.UTC)
|
||||
const n = 5
|
||||
ids := make([]string, 0, n)
|
||||
for i := 0; i < n; i++ {
|
||||
ids = append(ids, seedActivityRowAt(t, wsID, "backfill-row", t0.Add(time.Duration(i)*time.Second)))
|
||||
}
|
||||
|
||||
// (a) No row in this workspace may have a NULL seq. If the column were
|
||||
// un-backfilled / nullable this is > 0 and the test fails.
|
||||
var nullCount int
|
||||
if err := db.DB.QueryRowContext(context.Background(),
|
||||
`SELECT COUNT(*) FROM activity_logs WHERE workspace_id = $1 AND seq IS NULL`,
|
||||
wsID,
|
||||
).Scan(&nullCount); err != nil {
|
||||
t.Fatalf("null-seq count query: %v", err)
|
||||
}
|
||||
if nullCount != 0 {
|
||||
t.Fatalf("found %d activity_logs rows with NULL seq — migration did NOT backfill/assign seq", nullCount)
|
||||
}
|
||||
|
||||
// Belt-and-suspenders: the GLOBAL invariant (no NULL seq anywhere in the
|
||||
// table) is what the migration actually guarantees. Assert it too, so a
|
||||
// regression that nulls seq for rows written by some other path is caught.
|
||||
var globalNull int
|
||||
if err := db.DB.QueryRowContext(context.Background(),
|
||||
`SELECT COUNT(*) FROM activity_logs WHERE seq IS NULL`,
|
||||
).Scan(&globalNull); err != nil {
|
||||
t.Fatalf("global null-seq count query: %v", err)
|
||||
}
|
||||
if globalNull != 0 {
|
||||
t.Fatalf("found %d activity_logs rows table-wide with NULL seq — seq must be non-null for every row", globalNull)
|
||||
}
|
||||
|
||||
// (b) The IDENTITY sequence yields DISTINCT, monotonic, non-null seq for
|
||||
// the rows we just inserted (proves the normal insert path gets a real seq,
|
||||
// and that the sequence advanced past any backfilled max instead of
|
||||
// colliding). We read them back in insert order and require strictly
|
||||
// increasing, all-non-null seq.
|
||||
rows, err := db.DB.QueryContext(context.Background(),
|
||||
`SELECT seq FROM activity_logs WHERE workspace_id = $1 ORDER BY created_at ASC, seq ASC`,
|
||||
wsID,
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("read-back seq query: %v", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
var seqs []int64
|
||||
for rows.Next() {
|
||||
var s *int64 // pointer so a NULL would scan as nil rather than 0
|
||||
if err := rows.Scan(&s); err != nil {
|
||||
t.Fatalf("scan seq: %v", err)
|
||||
}
|
||||
if s == nil {
|
||||
t.Fatal("a freshly-inserted activity_logs row has NULL seq — IDENTITY default did not fire")
|
||||
}
|
||||
seqs = append(seqs, *s)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
t.Fatalf("rows err: %v", err)
|
||||
}
|
||||
if len(seqs) != n {
|
||||
t.Fatalf("expected %d rows, read back %d", n, len(seqs))
|
||||
}
|
||||
for i := 1; i < len(seqs); i++ {
|
||||
if seqs[i] <= seqs[i-1] {
|
||||
t.Fatalf("seq not strictly increasing in insert order: %v (IDENTITY collision / reuse)", seqs)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_ActivityLogs_SeqBackfill_SinceIDOnBackfilledRow pins the
|
||||
// consequence the reviewer doubted: a row whose seq came from the migration /
|
||||
// IDENTITY (i.e. NOT explicitly set by the caller) is usable as a since_id
|
||||
// cursor, and a SECOND row sharing its exact created_at microsecond is returned
|
||||
// (not dropped). This proves the handler's (created_at, seq) tuple cursor
|
||||
// resolves a same-timestamp boundary that a created_at-only cursor would drop,
|
||||
// AND that the cursor row's seq is non-null (else the handler could not build
|
||||
// the tuple at all).
|
||||
//
|
||||
// Distinct from _BoundaryRowSameMicrosecondNotSkipped in the ordering test:
|
||||
// here the explicit angle under test is "the cursor row's seq is a
|
||||
// migration/IDENTITY-assigned (backfilled-style) value, non-null, and the
|
||||
// handler uses it" — i.e. the backfill behavior is what makes the boundary
|
||||
// resolution work, pinned head-on.
|
||||
func TestIntegration_ActivityLogs_SeqBackfill_SinceIDOnBackfilledRow(t *testing.T) {
|
||||
conn := integrationDB_ActivityDelegationA2A(t)
|
||||
_ = conn
|
||||
wsID := seedWorkspace(t, conn, "test-2151-seq-backfill-sinceid")
|
||||
|
||||
tSame := time.Date(2026, 6, 4, 10, 0, 0, 0, time.UTC)
|
||||
// Cursor row: seq comes purely from the IDENTITY default (never set by
|
||||
// the caller) — the same assignment mechanism the migration uses to
|
||||
// backfill pre-existing rows. The "next" row shares the exact created_at
|
||||
// microsecond and is inserted afterward, so it gets a strictly higher seq.
|
||||
cursorID := seedActivityRowAt(t, wsID, "sinceid-cursor", tSame)
|
||||
nextID := seedActivityRowAt(t, wsID, "sinceid-next-same-us", tSame)
|
||||
|
||||
// Prove the precondition the reviewer doubted: the cursor row's seq is
|
||||
// NON-NULL, so the handler can read it to build the (created_at, seq)
|
||||
// tuple. If it were NULL the handler's cursor lookup would yield a NULL
|
||||
// seq and the strictly-after tuple comparison would mis-behave.
|
||||
var cursorSeq *int64
|
||||
if err := db.DB.QueryRowContext(context.Background(),
|
||||
`SELECT seq FROM activity_logs WHERE id = $1`, cursorID,
|
||||
).Scan(&cursorSeq); err != nil {
|
||||
t.Fatalf("read cursor seq: %v", err)
|
||||
}
|
||||
if cursorSeq == nil {
|
||||
t.Fatal("cursor row has NULL seq — a since_id cursor on a backfilled-style row would be unusable")
|
||||
}
|
||||
|
||||
h := NewActivityHandler(nil)
|
||||
c, w := newTestGinContext()
|
||||
c.Params = gin.Params{{Key: "id", Value: wsID}}
|
||||
q := c.Request.URL.Query()
|
||||
q.Set("since_id", cursorID)
|
||||
q.Set("type", "a2a_receive")
|
||||
q.Set("limit", "10")
|
||||
c.Request.URL.RawQuery = q.Encode()
|
||||
|
||||
h.List(c)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("List returned %d, want 200: %s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp []map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("unmarshal: %v", err)
|
||||
}
|
||||
// Exactly the one same-microsecond row after the cursor — present (not
|
||||
// dropped by a strict created_at-only filter) and the cursor itself
|
||||
// excluded (strictly-after on the full tuple).
|
||||
if len(resp) != 1 {
|
||||
t.Fatalf("same-microsecond row after backfilled-style cursor dropped: expected 1 row, got %d: %+v",
|
||||
len(resp), resp)
|
||||
}
|
||||
if got, _ := resp[0]["id"].(string); got != nextID {
|
||||
t.Fatalf("expected boundary row id %s, got %s", nextID, got)
|
||||
}
|
||||
}
|
||||
@@ -1,162 +0,0 @@
|
||||
//go:build integration
|
||||
// +build integration
|
||||
|
||||
// activity_since_id_ordering_integration_test.go — REAL Postgres proof that
|
||||
// the poll-mode since_id activity feed (#2339) is DETERMINISTICALLY ordered
|
||||
// even when multiple rows collide on the same created_at microsecond.
|
||||
//
|
||||
// This is the test that the original bug report mis-labeled a "flake".
|
||||
// sqlmock cannot catch it: sqlmock returns rows in the order the test stuffs
|
||||
// them, so it can never reveal a non-deterministic ORDER BY. Only a real
|
||||
// planner over real same-created_at rows exposes it.
|
||||
//
|
||||
// Run with (same harness as activity_delegation_a2a_integration_test.go):
|
||||
//
|
||||
// docker run --rm -d --name pg-integration \
|
||||
// -e POSTGRES_PASSWORD=test -e POSTGRES_DB=molecule \
|
||||
// -p 55432:5432 postgres:15-alpine
|
||||
// sleep 4
|
||||
// # apply migrations (incl. 20260604000000_activity_logs_seq.up.sql) then:
|
||||
// INTEGRATION_DB_URL="postgres://postgres:test@localhost:55432/molecule?sslmode=disable" \
|
||||
// go test -tags=integration ./internal/handlers/ -run Integration_SinceID
|
||||
//
|
||||
// WATCH-IT-FAIL: against the pre-fix handler (ORDER BY created_at only, no
|
||||
// seq tiebreaker, and `created_at > cursor` strict) this test is unstable —
|
||||
// the equal-created_at rows come back in arbitrary planner order so the
|
||||
// ordered-id assertion fails intermittently, and the same-microsecond
|
||||
// boundary row is dropped so the count assertion fails. With the fix
|
||||
// (ORDER BY created_at, seq + tuple cursor) it is green every run.
|
||||
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// seedActivityRowAt inserts one activity_logs row with an explicit created_at
|
||||
// (so the test can force microsecond-equal collisions) and a unique summary;
|
||||
// returns the generated id. seq is left to the IDENTITY default — Postgres
|
||||
// assigns it in INSERT order, which is the deterministic tiebreaker under test.
|
||||
// db.DB has been hot-swapped to the integration connection by
|
||||
// integrationDB_ActivityDelegationA2A(t) in the calling test.
|
||||
func seedActivityRowAt(t *testing.T, wsID, summary string, createdAt time.Time) string {
|
||||
t.Helper()
|
||||
var id string
|
||||
err := db.DB.QueryRowContext(context.Background(), `
|
||||
INSERT INTO activity_logs (workspace_id, activity_type, summary, status, created_at)
|
||||
VALUES ($1, 'a2a_receive', $2, 'ok', $3)
|
||||
RETURNING id
|
||||
`, wsID, summary, createdAt).Scan(&id)
|
||||
if err != nil {
|
||||
t.Fatalf("seedActivityRowAt(%q): %v", summary, err)
|
||||
}
|
||||
return id
|
||||
}
|
||||
|
||||
// TestIntegration_SinceID_StableOrderingSameMicrosecond proves the feed is
|
||||
// deterministic when rows share a created_at, AND that the same-microsecond
|
||||
// boundary row immediately after the cursor is NOT dropped.
|
||||
func TestIntegration_SinceID_StableOrderingSameMicrosecond(t *testing.T) {
|
||||
conn := integrationDB_ActivityDelegationA2A(t)
|
||||
_ = conn
|
||||
wsID := seedWorkspace(t, conn, "test-2151-sinceid-ordering")
|
||||
|
||||
// One earlier row to serve as the cursor (the "last processed" row).
|
||||
tCursor := time.Date(2026, 6, 4, 12, 0, 0, 0, time.UTC)
|
||||
cursorID := seedActivityRowAt(t, wsID, "cursor-row", tCursor)
|
||||
|
||||
// Three rows that ALL collide on the exact same created_at microsecond,
|
||||
// inserted in a known order. Pre-fix, ORDER BY created_at alone returns
|
||||
// these in arbitrary planner order.
|
||||
tEqual := time.Date(2026, 6, 4, 12, 0, 1, 0, time.UTC)
|
||||
idA := seedActivityRowAt(t, wsID, "equal-A", tEqual)
|
||||
idB := seedActivityRowAt(t, wsID, "equal-B", tEqual)
|
||||
idCc := seedActivityRowAt(t, wsID, "equal-C", tEqual)
|
||||
wantOrder := []string{idA, idB, idCc}
|
||||
|
||||
// Drive the handler exactly as a polling client would.
|
||||
h := NewActivityHandler(nil)
|
||||
c, w := newTestGinContext()
|
||||
c.Params = gin.Params{{Key: "id", Value: wsID}}
|
||||
q := c.Request.URL.Query()
|
||||
q.Set("since_id", cursorID)
|
||||
q.Set("type", "a2a_receive")
|
||||
q.Set("limit", "10")
|
||||
c.Request.URL.RawQuery = q.Encode()
|
||||
|
||||
h.List(c)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("List returned %d, want 200: %s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp []map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("unmarshal: %v", err)
|
||||
}
|
||||
|
||||
// All three equal-created_at rows must be present (boundary not dropped)
|
||||
// and the cursor row itself must be excluded (strictly-after).
|
||||
if len(resp) != len(wantOrder) {
|
||||
t.Fatalf("expected %d rows after cursor (the 3 equal-created_at rows), got %d: %+v",
|
||||
len(wantOrder), len(resp), resp)
|
||||
}
|
||||
|
||||
gotOrder := make([]string, len(resp))
|
||||
for i, row := range resp {
|
||||
idVal, _ := row["id"].(string)
|
||||
gotOrder[i] = idVal
|
||||
}
|
||||
for i := range wantOrder {
|
||||
if gotOrder[i] != wantOrder[i] {
|
||||
t.Fatalf("non-deterministic ordering: got id order %v, want %v (seq tiebreaker not applied)",
|
||||
gotOrder, wantOrder)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_SinceID_BoundaryRowSameMicrosecondNotSkipped isolates the
|
||||
// cursor-boundary bug: a row written in the SAME microsecond as the cursor
|
||||
// row (but with a higher seq) must still be returned. Pre-fix the strict
|
||||
// `created_at > cursor` filter silently dropped it.
|
||||
func TestIntegration_SinceID_BoundaryRowSameMicrosecondNotSkipped(t *testing.T) {
|
||||
conn := integrationDB_ActivityDelegationA2A(t)
|
||||
_ = conn
|
||||
wsID := seedWorkspace(t, conn, "test-2151-sinceid-boundary")
|
||||
|
||||
tSame := time.Date(2026, 6, 4, 13, 0, 0, 0, time.UTC)
|
||||
// Cursor row and the next row share the exact same created_at; the next
|
||||
// row is inserted afterwards so it gets a higher seq.
|
||||
cursorID := seedActivityRowAt(t, wsID, "boundary-cursor", tSame)
|
||||
nextID := seedActivityRowAt(t, wsID, "boundary-next-same-us", tSame)
|
||||
|
||||
h := NewActivityHandler(nil)
|
||||
c, w := newTestGinContext()
|
||||
c.Params = gin.Params{{Key: "id", Value: wsID}}
|
||||
q := c.Request.URL.Query()
|
||||
q.Set("since_id", cursorID)
|
||||
q.Set("type", "a2a_receive")
|
||||
q.Set("limit", "10")
|
||||
c.Request.URL.RawQuery = q.Encode()
|
||||
|
||||
h.List(c)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("List returned %d, want 200: %s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp []map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("unmarshal: %v", err)
|
||||
}
|
||||
if len(resp) != 1 {
|
||||
t.Fatalf("same-microsecond boundary row dropped: expected exactly the 1 next row, got %d rows: %+v",
|
||||
len(resp), resp)
|
||||
}
|
||||
if got, _ := resp[0]["id"].(string); got != nextID {
|
||||
t.Fatalf("expected boundary row id %s, got %s", nextID, got)
|
||||
}
|
||||
}
|
||||
@@ -26,21 +26,17 @@ func TestActivityHandler_SinceID_ReturnsNewerASC(t *testing.T) {
|
||||
|
||||
cursorID := "act-cursor-42"
|
||||
cursorTime := time.Date(2026, 4, 30, 5, 0, 0, 0, time.UTC)
|
||||
cursorSeq := int64(42)
|
||||
|
||||
// Step 1: cursor lookup — must include workspace_id scope so a UUID
|
||||
// from another workspace can't be used. Now resolves BOTH ordering-key
|
||||
// components (created_at, seq) so the strictly-after filter can compare
|
||||
// the full tuple.
|
||||
mock.ExpectQuery(`SELECT created_at, seq FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
|
||||
// from another workspace can't be used.
|
||||
mock.ExpectQuery(`SELECT created_at FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
|
||||
WithArgs(cursorID, "ws-1").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"created_at", "seq"}).AddRow(cursorTime, cursorSeq))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"created_at"}).AddRow(cursorTime))
|
||||
|
||||
// Step 2: main query with the cursor's (created_at, seq) as a tuple
|
||||
// strictly-after filter, (created_at, seq) ASC ordering.
|
||||
// Args: workspace_id, cursorTime, cursorSeq, limit.
|
||||
// Step 2: main query with the cursor's created_at as a > filter,
|
||||
// ASC ordering. Args: workspace_id, cursorTime, limit.
|
||||
mock.ExpectQuery("SELECT id, workspace_id, activity_type").
|
||||
WithArgs("ws-1", cursorTime, cursorSeq, 100).
|
||||
WithArgs("ws-1", cursorTime, 100).
|
||||
WillReturnRows(newActivityRows())
|
||||
|
||||
broadcaster := newTestBroadcaster()
|
||||
@@ -68,7 +64,7 @@ func TestActivityHandler_SinceID_ReturnsNewerASC(t *testing.T) {
|
||||
func TestActivityHandler_SinceID_CursorNotFound_410(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT created_at, seq FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
|
||||
mock.ExpectQuery(`SELECT created_at FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
|
||||
WithArgs("act-gone", "ws-1").
|
||||
WillReturnError(sql.ErrNoRows)
|
||||
|
||||
@@ -100,7 +96,7 @@ func TestActivityHandler_SinceID_CrossWorkspaceCursor_410(t *testing.T) {
|
||||
|
||||
// Cursor exists in DB but the WHERE workspace_id = $2 filter excludes
|
||||
// it — sqlmock returns no rows, which is what Postgres would do.
|
||||
mock.ExpectQuery(`SELECT created_at, seq FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
|
||||
mock.ExpectQuery(`SELECT created_at FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
|
||||
WithArgs("act-other-ws", "ws-1").
|
||||
WillReturnError(sql.ErrNoRows)
|
||||
|
||||
@@ -124,23 +120,20 @@ func TestActivityHandler_SinceID_CrossWorkspaceCursor_410(t *testing.T) {
|
||||
|
||||
// TestActivityHandler_SinceID_CombinedWithSinceSecs: both filters apply
|
||||
// together (AND). Argument order in the main query: workspace_id,
|
||||
// since_secs, cursorTime, cursorSeq, limit. Sanity-checks the placeholder
|
||||
// index arithmetic in the query builder (the cursor now binds TWO args —
|
||||
// the (created_at, seq) tuple — so since_secs no longer shifts the tail by
|
||||
// one but by two).
|
||||
// since_secs, cursorTime, limit. Sanity-checks the placeholder index
|
||||
// arithmetic in the query builder.
|
||||
func TestActivityHandler_SinceID_CombinedWithSinceSecs(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
cursorID := "act-c"
|
||||
cursorTime := time.Date(2026, 4, 30, 4, 0, 0, 0, time.UTC)
|
||||
cursorSeq := int64(7)
|
||||
|
||||
mock.ExpectQuery(`SELECT created_at, seq FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
|
||||
mock.ExpectQuery(`SELECT created_at FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
|
||||
WithArgs(cursorID, "ws-1").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"created_at", "seq"}).AddRow(cursorTime, cursorSeq))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"created_at"}).AddRow(cursorTime))
|
||||
|
||||
mock.ExpectQuery("SELECT id, workspace_id, activity_type").
|
||||
WithArgs("ws-1", 600, cursorTime, cursorSeq, 100).
|
||||
WithArgs("ws-1", 600, cursorTime, 100).
|
||||
WillReturnRows(newActivityRows())
|
||||
|
||||
broadcaster := newTestBroadcaster()
|
||||
|
||||
@@ -54,29 +54,23 @@ func (h *ApprovalsHandler) Create(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
if err := h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalRequested), workspaceID, map[string]interface{}{
|
||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalRequested), workspaceID, map[string]interface{}{
|
||||
"approval_id": approvalID,
|
||||
"action": body.Action,
|
||||
"reason": body.Reason,
|
||||
"task_id": body.TaskID,
|
||||
}); err != nil {
|
||||
log.Printf("approvals: failed to broadcast approval requested: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
// Auto-escalate to parent
|
||||
var parentID *string
|
||||
if err := db.DB.QueryRowContext(ctx, `SELECT parent_id FROM workspaces WHERE id = $1`, workspaceID).Scan(&parentID); err != nil {
|
||||
log.Printf("approvals: failed to lookup parent for escalation: %v", err)
|
||||
}
|
||||
db.DB.QueryRowContext(ctx, `SELECT parent_id FROM workspaces WHERE id = $1`, workspaceID).Scan(&parentID)
|
||||
if parentID != nil {
|
||||
if err := h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalEscalated), *parentID, map[string]interface{}{
|
||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalEscalated), *parentID, map[string]interface{}{
|
||||
"approval_id": approvalID,
|
||||
"from_workspace_id": workspaceID,
|
||||
"action": body.Action,
|
||||
"reason": body.Reason,
|
||||
}); err != nil {
|
||||
log.Printf("approvals: failed to broadcast approval escalated: %v", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
c.JSON(http.StatusCreated, gin.H{"approval_id": approvalID, "status": "pending"})
|
||||
@@ -227,13 +221,11 @@ func (h *ApprovalsHandler) Decide(c *gin.Context) {
|
||||
eventType = "APPROVAL_DENIED"
|
||||
}
|
||||
|
||||
if err := h.broadcaster.RecordAndBroadcast(ctx, eventType, workspaceID, map[string]interface{}{
|
||||
h.broadcaster.RecordAndBroadcast(ctx, eventType, workspaceID, map[string]interface{}{
|
||||
"approval_id": approvalID,
|
||||
"decision": body.Decision,
|
||||
"decided_by": decidedBy,
|
||||
}); err != nil {
|
||||
log.Printf("approvals: failed to broadcast approval decision: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"status": body.Decision, "approval_id": approvalID})
|
||||
}
|
||||
|
||||
@@ -68,10 +68,6 @@ func TestPeers_CrossTenant_OrgRootNotLeaked(t *testing.T) {
|
||||
|
||||
caller := "org-a-root" // parent_id IS NULL — an org root for tenant A
|
||||
|
||||
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
|
||||
// (Unordered match is set above, so this can be consumed at any point.)
|
||||
seedDiscoveryGrandfather(mock, caller)
|
||||
|
||||
// parent_id lookup → NULL (caller is an org root)
|
||||
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
|
||||
WithArgs(caller).
|
||||
@@ -132,9 +128,6 @@ func TestPeers_SameOrg_SiblingsStillWork(t *testing.T) {
|
||||
caller := "org-a-child-1"
|
||||
parent := "org-a-root"
|
||||
|
||||
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
|
||||
seedDiscoveryGrandfather(mock, caller)
|
||||
|
||||
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
|
||||
WithArgs(caller).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(parent))
|
||||
|
||||
@@ -102,10 +102,10 @@ func pushDelegationResultToInbox(ctx context.Context, sourceID, delegationID, st
|
||||
// and the A2A request runs in the background.
|
||||
type DelegationHandler struct {
|
||||
workspace *WorkspaceHandler
|
||||
broadcaster events.EventEmitter
|
||||
broadcaster *events.Broadcaster
|
||||
}
|
||||
|
||||
func NewDelegationHandler(wh *WorkspaceHandler, b events.EventEmitter) *DelegationHandler {
|
||||
func NewDelegationHandler(wh *WorkspaceHandler, b *events.Broadcaster) *DelegationHandler {
|
||||
return &DelegationHandler{workspace: wh, broadcaster: b}
|
||||
}
|
||||
|
||||
@@ -179,11 +179,8 @@ func (h *DelegationHandler) Delegate(c *gin.Context) {
|
||||
"message": map[string]interface{}{
|
||||
"role": "user",
|
||||
"messageId": delegationID,
|
||||
// A2A v0.3 Part discriminator is `kind`, NOT `type` (#2251) —
|
||||
// a `type`-keyed Part is dropped by the receiver's v0.3
|
||||
// validator, silently losing the delegated task.
|
||||
"parts": []map[string]interface{}{{"kind": "text", "text": body.Task}},
|
||||
"metadata": map[string]interface{}{"delegation_id": delegationID},
|
||||
"parts": []map[string]interface{}{{"type": "text", "text": body.Task}},
|
||||
"metadata": map[string]interface{}{"delegation_id": delegationID},
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
@@ -36,6 +36,7 @@ package handlers
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -56,7 +57,10 @@ import (
|
||||
// directly rather than going through the package global.
|
||||
func integrationDB(t *testing.T) *sql.DB {
|
||||
t.Helper()
|
||||
url := requireIntegrationDBURL(t)
|
||||
url := os.Getenv("INTEGRATION_DB_URL")
|
||||
if url == "" {
|
||||
t.Skip("INTEGRATION_DB_URL not set; skipping (local devs: see file header)")
|
||||
}
|
||||
conn, err := sql.Open("postgres", url)
|
||||
if err != nil {
|
||||
t.Fatalf("open: %v", err)
|
||||
|
||||
@@ -422,33 +422,28 @@ func (h *DiscoveryHandler) CheckAccess(c *gin.Context) {
|
||||
// workspaces with tokens must present a matching Bearer, token binding
|
||||
// is strict (A's token cannot authenticate caller B).
|
||||
//
|
||||
// (harden/no-fail-open-auth) Fails CLOSED on DB error. This used to return nil
|
||||
// (allow) on a HasAnyLiveToken hiccup "because discovery only exposes peer URLs
|
||||
// already behind CanCommunicate" — but the CTO "nothing fail-open" directive is
|
||||
// absolute, and a request must never gain access because the auth datastore is
|
||||
// unreachable. A datastore error now writes 503 (availability tradeoff that
|
||||
// grants NO access) and returns a non-nil error; the caller already does
|
||||
// `if err != nil { return }` so the 503 body is what the client sees.
|
||||
// Fail-open on DB hiccups. Unlike secrets.Values (which returns plaintext
|
||||
// secrets and must fail closed), discovery only exposes peer URLs that
|
||||
// are already behind the existing `CanCommunicate` hierarchy check — a
|
||||
// momentary DB outage shouldn't take agent-to-agent discovery offline.
|
||||
func validateDiscoveryCaller(ctx context.Context, c *gin.Context, workspaceID string) error {
|
||||
hasLive, err := wsauth.HasAnyLiveToken(ctx, db.DB, workspaceID)
|
||||
if err != nil {
|
||||
log.Printf("wsauth: discovery HasAnyLiveToken(%s): datastore lookup failed (returning 503): %v", workspaceID, err)
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{
|
||||
"error": "platform datastore unavailable — retry shortly",
|
||||
"code": "platform_unavailable",
|
||||
})
|
||||
return errors.New("auth datastore unavailable")
|
||||
log.Printf("wsauth: discovery HasAnyLiveToken(%s) failed: %v — allowing request", workspaceID, err)
|
||||
return nil
|
||||
}
|
||||
if !hasLive {
|
||||
return nil // legacy / pre-upgrade
|
||||
}
|
||||
// (harden/no-fail-open-auth) The former dev-mode escape hatch that
|
||||
// returned nil (allow) here when MOLECULE_ENV=dev + ADMIN_TOKEN unset
|
||||
// has been REMOVED. Discovery callers must present a verified CP
|
||||
// session or a valid bearer in every environment. Local dev now
|
||||
// authenticates the Canvas with a provisioned ADMIN_TOKEN /
|
||||
// NEXT_PUBLIC_ADMIN_TOKEN (see scripts/dev-start.sh), so the Details
|
||||
// tab loads peers with a real credential rather than via fail-open.
|
||||
// Tier-1b dev-mode hatch — same escape hatch AdminAuth and
|
||||
// WorkspaceAuth apply on a local Docker setup. Without this, the
|
||||
// canvas Details tab can never load peers for a workspace that has
|
||||
// registered its live token, producing the 401 the user sees.
|
||||
// Gated by MOLECULE_ENV=development + empty ADMIN_TOKEN, so SaaS
|
||||
// production stays strict.
|
||||
if middleware.IsDevModeFailOpen() {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Try session cookie auth first (SaaS canvas path).
|
||||
// verifiedCPSession returns (valid, presented):
|
||||
|
||||
@@ -49,10 +49,6 @@ func TestDiscover_WorkspaceNotFound_WithCaller(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// validateDiscoveryCaller probes HasAnyLiveToken(callerID) first;
|
||||
// grandfather (count=0) so the bearer-less request is allowed through.
|
||||
seedDiscoveryGrandfather(mock, "ws-caller")
|
||||
|
||||
// CanCommunicate will need DB lookups — both workspace name lookups
|
||||
// For the access check: caller lookup succeeds, target lookup fails
|
||||
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
|
||||
@@ -117,9 +113,6 @@ func TestPeers_WithParent(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
|
||||
seedDiscoveryGrandfather(mock, "ws-sibling-1")
|
||||
|
||||
// Expect parent_id lookup for the requesting workspace
|
||||
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-sibling-1").
|
||||
@@ -172,9 +165,6 @@ func TestPeers_NotFound(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
|
||||
seedDiscoveryGrandfather(mock, "ws-ghost")
|
||||
|
||||
// Workspace not found
|
||||
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-ghost").
|
||||
@@ -201,11 +191,6 @@ func TestPeers_DBError(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// Auth probe grandfathers; this test targets a DB error on the
|
||||
// *handler-body* parent_id query → 500 (distinct from the auth-probe
|
||||
// DB error which now fails closed with 503).
|
||||
seedDiscoveryGrandfather(mock, "ws-dberr")
|
||||
|
||||
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-dberr").
|
||||
WillReturnError(sql.ErrConnDone)
|
||||
@@ -231,9 +216,6 @@ func TestPeers_RootWorkspace_NoPeers(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
|
||||
seedDiscoveryGrandfather(mock, "ws-root-alone")
|
||||
|
||||
// Root workspace (parent_id is NULL)
|
||||
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-root-alone").
|
||||
@@ -288,9 +270,6 @@ func peersFilterFixture(t *testing.T) (*DiscoveryHandler, sqlmock.Sqlmock) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
|
||||
seedDiscoveryGrandfather(mock, "ws-self")
|
||||
|
||||
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-self").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow("ws-pm"))
|
||||
@@ -948,14 +927,13 @@ func TestDiscoverHostPeer_Smoke_Success(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== Peers auth — fail-CLOSED gate ====================
|
||||
// ==================== Peers auth — dev-mode fail-open gate ====================
|
||||
//
|
||||
// (harden/no-fail-open-auth) validateDiscoveryCaller USED to apply a
|
||||
// Tier-1b dev-mode hatch that let the bearer-less canvas session load the
|
||||
// Details → PEERS list when MOLECULE_ENV=development AND ADMIN_TOKEN empty.
|
||||
// That hatch has been REMOVED — discovery callers must present a verified
|
||||
// CP session or a valid bearer in every environment. These tests pin the
|
||||
// fail-closed contract against accidental re-introduction.
|
||||
// validateDiscoveryCaller applies a Tier-1b dev-mode hatch so the canvas
|
||||
// user session (which holds no workspace-scoped bearer) can still load
|
||||
// the Details → PEERS list on a local Docker setup. The gate must pass
|
||||
// ONLY when MOLECULE_ENV is development AND ADMIN_TOKEN is empty.
|
||||
// These tests pin that contract against accidental polarity flips.
|
||||
|
||||
// peersAuthFixtureHasLiveToken seeds the mock rows required for the
|
||||
// Peers handler to reach the auth branch: HasAnyLiveToken → true (a
|
||||
@@ -968,30 +946,10 @@ func peersAuthFixtureHasLiveToken(mock sqlmock.Sqlmock, workspaceID string) {
|
||||
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
|
||||
}
|
||||
|
||||
// seedDiscoveryGrandfather seeds the FIRST query validateDiscoveryCaller
|
||||
// issues (HasAnyLiveToken → 0 = legacy / pre-upgrade) so a bearer-less
|
||||
// discovery request grandfathers through and the test can exercise the
|
||||
// handler body.
|
||||
//
|
||||
// (harden/no-fail-open-auth) Before this branch, validateDiscoveryCaller
|
||||
// returned nil (allow) when the HasAnyLiveToken probe ERRORED — so these
|
||||
// handler-body tests never had to seed the probe at all; the unmatched
|
||||
// COUNT query erred and the fail-open swallowed it. Now that the DB-error
|
||||
// path fails CLOSED (503), the probe must be seeded explicitly. count=0 is
|
||||
// the legitimate grandfather path (no live tokens for this workspace yet),
|
||||
// which is what these pre-existing tests intend.
|
||||
func seedDiscoveryGrandfather(mock sqlmock.Sqlmock, workspaceID string) {
|
||||
mock.ExpectQuery("SELECT COUNT.+workspace_auth_tokens").
|
||||
WithArgs(workspaceID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
|
||||
}
|
||||
|
||||
func TestPeers_DevMode_BearerlessRequest_FailsClosed(t *testing.T) {
|
||||
// (harden/no-fail-open-auth) Exact old-hatch conditions:
|
||||
// MOLECULE_ENV=development AND ADMIN_TOKEN empty, with a live token in
|
||||
// the DB. The bearer-less canvas-style request must now 401 — the
|
||||
// dev-mode hatch that returned nil (allow) here is gone. Local dev
|
||||
// authenticates via a provisioned ADMIN_TOKEN (scripts/dev-start.sh).
|
||||
func TestPeers_DevModeFailOpen_AllowsBearerlessRequest(t *testing.T) {
|
||||
// Dev mode: MOLECULE_ENV=development AND ADMIN_TOKEN empty. Canvas
|
||||
// sends no bearer token; validateDiscoveryCaller must return nil
|
||||
// (allow) and the handler must proceed to return the peer list.
|
||||
t.Setenv("MOLECULE_ENV", "development")
|
||||
t.Setenv("ADMIN_TOKEN", "")
|
||||
|
||||
@@ -999,10 +957,22 @@ func TestPeers_DevMode_BearerlessRequest_FailsClosed(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// Only the HasAnyLiveToken probe runs; auth 401s before the peer
|
||||
// queries, so no further expectations are seeded.
|
||||
peersAuthFixtureHasLiveToken(mock, "ws-dev")
|
||||
|
||||
// Root workspace → children+parent queries still fire but the
|
||||
// parent_id lookup comes first.
|
||||
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-dev").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil))
|
||||
peerCols := []string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"}
|
||||
mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id IS NULL AND w.id").
|
||||
WithArgs("ws-dev").
|
||||
WillReturnRows(sqlmock.NewRows(peerCols))
|
||||
// #383 — children query gained explicit `w.id != $2` self-filter.
|
||||
mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status").
|
||||
WithArgs("ws-dev", "ws-dev").
|
||||
WillReturnRows(sqlmock.NewRows(peerCols))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-dev"}}
|
||||
@@ -1010,8 +980,8 @@ func TestPeers_DevMode_BearerlessRequest_FailsClosed(t *testing.T) {
|
||||
|
||||
handler.Peers(c)
|
||||
|
||||
if w.Code != http.StatusUnauthorized {
|
||||
t.Fatalf("expected 401 (fail-closed) under old dev-mode hatch conditions, got %d: %s", w.Code, w.Body.String())
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200 under dev-mode hatch, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1064,70 +1034,6 @@ func TestPeers_DevModeFailOpen_ClosedInProduction(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestPeers_AuthProbeDBError_FailsClosed pins the removal of
|
||||
// validateDiscoveryCaller's fail-open-on-DB-error branch
|
||||
// (harden/no-fail-open-auth). When the HasAnyLiveToken auth probe ERRORS, the
|
||||
// request must NOT be allowed through — it now returns 503 (availability
|
||||
// tradeoff that grants NO access). Before this branch the function returned nil
|
||||
// (allow) on a DB hiccup, so the request reached the peer queries.
|
||||
//
|
||||
// Watch-it-fail: restore `if err != nil { log; return nil }` in
|
||||
// validateDiscoveryCaller → this flips 503→(200/handler path) and fails.
|
||||
func TestPeers_AuthProbeDBError_FailsClosed(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// The FIRST query validateDiscoveryCaller issues (HasAnyLiveToken) errors.
|
||||
// No further expectations: a fail-closed 503 must be written before the
|
||||
// peer-list queries run.
|
||||
mock.ExpectQuery("SELECT COUNT.+workspace_auth_tokens").
|
||||
WithArgs("ws-dberr-auth").
|
||||
WillReturnError(sql.ErrConnDone)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-dberr-auth"}}
|
||||
c.Request = httptest.NewRequest("GET", "/registry/ws-dberr-auth/peers", nil)
|
||||
|
||||
handler.Peers(c)
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Fatalf("auth-probe DB error must fail CLOSED: expected 503, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDiscover_AuthProbeDBError_FailsClosed is the Discover-endpoint companion
|
||||
// to TestPeers_AuthProbeDBError_FailsClosed: a HasAnyLiveToken error on the
|
||||
// caller's discovery request fails CLOSED with 503 (was: fail-open allow).
|
||||
func TestDiscover_AuthProbeDBError_FailsClosed(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
mock.ExpectQuery("SELECT COUNT.+workspace_auth_tokens").
|
||||
WithArgs("ws-caller").
|
||||
WillReturnError(sql.ErrConnDone)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-target"}}
|
||||
c.Request = httptest.NewRequest("GET", "/registry/discover/ws-target", nil)
|
||||
c.Request.Header.Set("X-Workspace-ID", "ws-caller")
|
||||
|
||||
handler.Discover(c)
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Fatalf("Discover auth-probe DB error must fail CLOSED: expected 503, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== Peers — #383 self never appears in result ====================
|
||||
|
||||
// TestPeers_ExcludeSelf_DefenseInDepth verifies the final-line filter in
|
||||
@@ -1150,9 +1056,6 @@ func TestPeers_ExcludeSelf_DefenseInDepth(t *testing.T) {
|
||||
|
||||
const selfID = "ws-xiaodong"
|
||||
|
||||
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
|
||||
seedDiscoveryGrandfather(mock, selfID)
|
||||
|
||||
// parent_id lookup — workspace has a parent.
|
||||
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
|
||||
WithArgs(selfID).
|
||||
|
||||
@@ -551,9 +551,6 @@ func TestDiscover_AccessDenied(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// validateDiscoveryCaller probes HasAnyLiveToken(callerID) first; grandfather.
|
||||
seedDiscoveryGrandfather(mock, "ws-child-a")
|
||||
|
||||
// CanCommunicate: different parents → denied
|
||||
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-child-a").
|
||||
@@ -585,9 +582,6 @@ func TestDiscover_TargetOffline(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// validateDiscoveryCaller probes HasAnyLiveToken(callerID) first; grandfather.
|
||||
seedDiscoveryGrandfather(mock, "ws-caller")
|
||||
|
||||
// Share a parent so communication is allowed under post-#1955 rules
|
||||
sharedParent := "ws-parent"
|
||||
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
|
||||
|
||||
@@ -373,9 +373,6 @@ func TestExtended_DiscoverWithCallerID(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// validateDiscoveryCaller probes HasAnyLiveToken(callerID) first; grandfather.
|
||||
seedDiscoveryGrandfather(mock, "ws-caller")
|
||||
|
||||
// CanCommunicate needs to look up both workspaces
|
||||
// Share a parent so communication is allowed under post-#1955 rules
|
||||
sharedParent := "ws-parent"
|
||||
@@ -467,9 +464,6 @@ func TestExtended_Peers(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
|
||||
seedDiscoveryGrandfather(mock, "ws-peer")
|
||||
|
||||
// Expect parent_id lookup for requesting workspace (root-level, no parent)
|
||||
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-peer").
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
//go:build integration
|
||||
// +build integration
|
||||
|
||||
// integration_helper_test.go — shared preflight for handler Postgres
|
||||
// integration tests. Extracted so the fail-open/skip logic is in ONE place
|
||||
// and can be tightened without editing every integration test file.
|
||||
//
|
||||
// See delegation_ledger_integration_test.go for the docker-postgres setup
|
||||
// incantation used by local devs.
|
||||
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// requireIntegrationDBURL returns $INTEGRATION_DB_URL.
|
||||
//
|
||||
// In CI (CI, GITHUB_ACTIONS, or GITEA_ACTIONS env var is non-empty), an
|
||||
// empty URL is a fatal error — it means the workflow failed to export the
|
||||
// variable (postgres container did not start, bridge IP resolution failed,
|
||||
// or a regression in the workflow YAML). t.Fatalf keeps the test red so the
|
||||
// failure is visible; t.Skip would silently pass and mask the defect.
|
||||
//
|
||||
// Locally (none of the three CI markers set), an empty URL skips the test
|
||||
// so devs can run `go test ./...` without booting a Postgres container.
|
||||
func requireIntegrationDBURL(t *testing.T) string {
|
||||
t.Helper()
|
||||
url := os.Getenv("INTEGRATION_DB_URL")
|
||||
if url == "" {
|
||||
if os.Getenv("CI") != "" ||
|
||||
os.Getenv("GITHUB_ACTIONS") != "" ||
|
||||
os.Getenv("GITEA_ACTIONS") != "" {
|
||||
t.Fatalf("INTEGRATION_DB_URL required in CI handler integration tests — check workflow env export")
|
||||
}
|
||||
t.Skip("INTEGRATION_DB_URL not set; skipping (local devs: see file header)")
|
||||
}
|
||||
return url
|
||||
}
|
||||
@@ -126,32 +126,6 @@ var mcpAllTools = []mcpTool{
|
||||
"type": "string",
|
||||
"description": "The task description to send to the target workspace",
|
||||
},
|
||||
"attachments": map[string]interface{}{
|
||||
"type": "array",
|
||||
"description": "Optional files to send with the task. Each item must include uri and name; mimeType and size are optional.",
|
||||
"items": map[string]interface{}{
|
||||
"type": "object",
|
||||
"properties": map[string]interface{}{
|
||||
"uri": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "Workspace attachment URI, usually workspace:/absolute/path",
|
||||
},
|
||||
"name": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "Display filename",
|
||||
},
|
||||
"mimeType": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "Optional MIME type",
|
||||
},
|
||||
"size": map[string]interface{}{
|
||||
"type": "number",
|
||||
"description": "Optional file size in bytes",
|
||||
},
|
||||
},
|
||||
"required": []string{"uri", "name"},
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": []string{"workspace_id", "task"},
|
||||
},
|
||||
@@ -170,32 +144,6 @@ var mcpAllTools = []mcpTool{
|
||||
"type": "string",
|
||||
"description": "The task description to send to the target workspace",
|
||||
},
|
||||
"attachments": map[string]interface{}{
|
||||
"type": "array",
|
||||
"description": "Optional files to send with the task. Each item must include uri and name; mimeType and size are optional.",
|
||||
"items": map[string]interface{}{
|
||||
"type": "object",
|
||||
"properties": map[string]interface{}{
|
||||
"uri": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "Workspace attachment URI, usually workspace:/absolute/path",
|
||||
},
|
||||
"name": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "Display filename",
|
||||
},
|
||||
"mimeType": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "Optional MIME type",
|
||||
},
|
||||
"size": map[string]interface{}{
|
||||
"type": "number",
|
||||
"description": "Optional file size in bytes",
|
||||
},
|
||||
},
|
||||
"required": []string{"uri", "name"},
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": []string{"workspace_id", "task"},
|
||||
},
|
||||
|
||||
@@ -285,121 +285,6 @@ func TestMCPHandler_DelegateTaskAsync_RoutesThroughPlatformA2AProxy(t *testing.T
|
||||
// goroutine returns early and never calls proxyA2ARequest with a nil/empty
|
||||
// body. Before the fix the goroutine logged the error and fell through,
|
||||
// dispatching a malformed A2A request.
|
||||
|
||||
func TestMCPHandler_DelegateTask_WithAttachments(t *testing.T) {
|
||||
h, mock := newMCPHandler(t)
|
||||
callerID := "11111111-1111-1111-1111-111111111111"
|
||||
targetID := "22222222-2222-2222-2222-222222222222"
|
||||
parentID := "33333333-3333-3333-3333-333333333333"
|
||||
|
||||
expectCanCommunicateSiblings(mock, callerID, targetID, parentID)
|
||||
mock.ExpectExec(`(?s)INSERT INTO activity_logs.*'delegation'.*'delegate'`).
|
||||
WithArgs(callerID, callerID, targetID, "Delegating to "+targetID, sqlmock.AnyArg(), "pending").
|
||||
WillReturnResult(sqlmock.NewResult(1, 1))
|
||||
mock.ExpectExec(`UPDATE activity_logs`).
|
||||
WithArgs("dispatched", "", callerID, sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
h.a2aProxy = func(ctx context.Context, workspaceID string, body []byte, proxyCallerID string, logActivity bool) (int, []byte, error) {
|
||||
if workspaceID != targetID || proxyCallerID != callerID {
|
||||
t.Fatalf("unexpected proxy route target=%q caller=%q", workspaceID, proxyCallerID)
|
||||
}
|
||||
bodyStr := string(body)
|
||||
if !strings.Contains(bodyStr, `"text":"review this video"`) {
|
||||
t.Fatalf("A2A body missing task text: %s", bodyStr)
|
||||
}
|
||||
if !strings.Contains(bodyStr, `"kind":"video"`) {
|
||||
t.Fatalf("A2A body missing video attachment kind: %s", bodyStr)
|
||||
}
|
||||
if !strings.Contains(bodyStr, `"uri":"workspace:/tmp/clip.mp4"`) {
|
||||
t.Fatalf("A2A body missing attachment uri: %s", bodyStr)
|
||||
}
|
||||
if !strings.Contains(bodyStr, `"mime_type":"video/mp4"`) {
|
||||
t.Fatalf("A2A body missing attachment mime_type: %s", bodyStr)
|
||||
}
|
||||
return 200, []byte(`{"result":{"message":{"parts":[{"text":"done"}]}}}`), nil
|
||||
}
|
||||
|
||||
out, err := h.toolDelegateTask(context.Background(), callerID, map[string]interface{}{
|
||||
"workspace_id": targetID,
|
||||
"task": "review this video",
|
||||
"attachments": []interface{}{
|
||||
map[string]interface{}{
|
||||
"uri": "workspace:/tmp/clip.mp4",
|
||||
"name": "clip.mp4",
|
||||
"mimeType": "video/mp4",
|
||||
"size": 12345,
|
||||
},
|
||||
},
|
||||
}, mcpCallTimeout)
|
||||
if err != nil {
|
||||
t.Fatalf("delegate_task returned error: %v", err)
|
||||
}
|
||||
if out != "done" {
|
||||
t.Fatalf("delegate_task response = %q, want done", out)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMCPHandler_DelegateTaskAsync_WithAttachments(t *testing.T) {
|
||||
h, mock := newMCPHandler(t)
|
||||
callerID := "11111111-1111-1111-1111-111111111111"
|
||||
targetID := "22222222-2222-2222-2222-222222222222"
|
||||
parentID := "33333333-3333-3333-3333-333333333333"
|
||||
|
||||
expectCanCommunicateSiblings(mock, callerID, targetID, parentID)
|
||||
mock.ExpectExec(`(?s)INSERT INTO activity_logs.*'delegation'.*'delegate'`).
|
||||
WithArgs(callerID, callerID, targetID, "Delegating to "+targetID, sqlmock.AnyArg(), "pending").
|
||||
WillReturnResult(sqlmock.NewResult(1, 1))
|
||||
mock.ExpectExec(`UPDATE activity_logs`).
|
||||
WithArgs("dispatched", "", callerID, sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
called := make(chan []byte, 1)
|
||||
h.a2aProxy = func(ctx context.Context, workspaceID string, body []byte, proxyCallerID string, logActivity bool) (int, []byte, error) {
|
||||
if workspaceID != targetID || proxyCallerID != callerID {
|
||||
t.Fatalf("unexpected proxy route target=%q caller=%q", workspaceID, proxyCallerID)
|
||||
}
|
||||
called <- body
|
||||
return 200, []byte(`{"result":{"message":{"parts":[{"text":"accepted"}]}}}`), nil
|
||||
}
|
||||
|
||||
out, err := h.toolDelegateTaskAsync(context.Background(), callerID, map[string]interface{}{
|
||||
"workspace_id": targetID,
|
||||
"task": "async work with image",
|
||||
"attachments": []interface{}{
|
||||
map[string]interface{}{
|
||||
"uri": "workspace:/tmp/screenshot.png",
|
||||
"name": "screenshot.png",
|
||||
"mimeType": "image/png",
|
||||
},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("delegate_task_async returned error: %v", err)
|
||||
}
|
||||
if !strings.Contains(out, `"status":"dispatched"`) {
|
||||
t.Fatalf("delegate_task_async response = %s", out)
|
||||
}
|
||||
waitGlobalAsyncForTest()
|
||||
select {
|
||||
case body := <-called:
|
||||
bodyStr := string(body)
|
||||
if !strings.Contains(bodyStr, `"kind":"image"`) {
|
||||
t.Fatalf("A2A body missing image attachment kind: %s", bodyStr)
|
||||
}
|
||||
if !strings.Contains(bodyStr, `"uri":"workspace:/tmp/screenshot.png"`) {
|
||||
t.Fatalf("A2A body missing attachment uri: %s", bodyStr)
|
||||
}
|
||||
default:
|
||||
t.Fatal("async delegate did not call platform A2A proxy")
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
func TestMCPHandler_DelegateTaskAsync_MarshalFailureDoesNotCallProxy(t *testing.T) {
|
||||
h, mock := newMCPHandler(t)
|
||||
callerID := "11111111-1111-1111-1111-111111111111"
|
||||
|
||||
@@ -187,32 +187,6 @@ func (h *MCPHandler) toolGetWorkspaceInfo(ctx context.Context, workspaceID strin
|
||||
return string(b), nil
|
||||
}
|
||||
|
||||
// buildA2AMessageParts constructs the A2A message parts array from a task string
|
||||
// and optional attachments. The text part always comes first; attachment parts
|
||||
// follow in the order provided, with kind derived from MIME type.
|
||||
func buildA2AMessageParts(task string, attachments []AgentMessageAttachment) []map[string]interface{} {
|
||||
parts := []map[string]interface{}{
|
||||
// A2A v0.3 Part discriminator is `kind`, NOT `type` (#2251).
|
||||
// The receiver's v0.3 Pydantic validator drops a Part keyed
|
||||
// `type`, silently losing the task text — the file part below
|
||||
// already uses `kind`, this is the matching fix for text.
|
||||
{"kind": "text", "text": task},
|
||||
}
|
||||
for _, att := range attachments {
|
||||
kind := kindFromMimeType(att.MimeType)
|
||||
filePart := map[string]interface{}{
|
||||
"kind": kind,
|
||||
"file": map[string]interface{}{
|
||||
"uri": att.URI,
|
||||
"mime_type": att.MimeType,
|
||||
"name": att.Name,
|
||||
},
|
||||
}
|
||||
parts = append(parts, filePart)
|
||||
}
|
||||
return parts
|
||||
}
|
||||
|
||||
func (h *MCPHandler) toolDelegateTask(ctx context.Context, callerID string, args map[string]interface{}, timeout time.Duration) (string, error) {
|
||||
targetID, _ := args["workspace_id"].(string)
|
||||
task, _ := args["task"].(string)
|
||||
@@ -234,8 +208,6 @@ func (h *MCPHandler) toolDelegateTask(ctx context.Context, callerID string, args
|
||||
// Non-fatal: still make the A2A call even if activity log write fails.
|
||||
}
|
||||
|
||||
attachments, _ := parseAgentMessageAttachments(args["attachments"])
|
||||
|
||||
a2aBody, err := json.Marshal(map[string]interface{}{
|
||||
"jsonrpc": "2.0",
|
||||
"id": uuid.New().String(),
|
||||
@@ -243,7 +215,7 @@ func (h *MCPHandler) toolDelegateTask(ctx context.Context, callerID string, args
|
||||
"params": map[string]interface{}{
|
||||
"message": map[string]interface{}{
|
||||
"role": "user",
|
||||
"parts": buildA2AMessageParts(task, attachments),
|
||||
"parts": []map[string]interface{}{{"type": "text", "text": task}},
|
||||
"messageId": uuid.New().String(),
|
||||
},
|
||||
},
|
||||
@@ -303,8 +275,6 @@ func (h *MCPHandler) toolDelegateTaskAsync(ctx context.Context, callerID string,
|
||||
bgCtx, cancel := context.WithTimeout(context.Background(), mcpAsyncCallTimeout)
|
||||
defer cancel()
|
||||
|
||||
attachments, _ := parseAgentMessageAttachments(args["attachments"])
|
||||
|
||||
a2aBody, marshalErr := marshalA2ABody(map[string]interface{}{
|
||||
"jsonrpc": "2.0",
|
||||
"id": delegationID,
|
||||
@@ -312,7 +282,7 @@ func (h *MCPHandler) toolDelegateTaskAsync(ctx context.Context, callerID string,
|
||||
"params": map[string]interface{}{
|
||||
"message": map[string]interface{}{
|
||||
"role": "user",
|
||||
"parts": buildA2AMessageParts(task, attachments),
|
||||
"parts": []map[string]interface{}{{"type": "text", "text": task}},
|
||||
"messageId": uuid.New().String(),
|
||||
},
|
||||
},
|
||||
|
||||
@@ -17,30 +17,19 @@ package handlers
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// validateRegisteredModelForRuntime reports whether (runtime, model) is
|
||||
// selectable per the provider registry. Returns:
|
||||
//
|
||||
// (true, "") — allowed: model is on the runtime's platform menu
|
||||
// (ModelsForRuntime) OR DeriveProvider(runtime, model)
|
||||
// RESOLVES a native provider (the cp#529 routability-aware
|
||||
// BYOK path), OR the runtime is not in the registry
|
||||
// (fail-open), OR model=="".
|
||||
// (false, reason) — rejected: the runtime IS registered, the model is not on
|
||||
// its platform menu, AND no native provider prefix-owns it
|
||||
// (genuinely unroutable).
|
||||
// (true, "") — allowed: model is registered for this runtime, OR the
|
||||
// runtime is not in the registry (fail-open), OR model=="".
|
||||
// (false, reason) — rejected: the runtime IS registered but the model is not
|
||||
// in its native ModelsForRuntime set.
|
||||
//
|
||||
// model=="" is allowed here: the MODEL_REQUIRED gate owns the empty-model case,
|
||||
// so this validator must not double-reject it.
|
||||
//
|
||||
// ROUTABILITY-AWARE (cp#529, CTO Option C): the final predicate is an OR —
|
||||
// `model ∈ ModelsForRuntime(runtime)` OR `DeriveProvider(runtime, model, nil)`
|
||||
// resolves. The platform menu carries platform-billed ids; the DeriveProvider
|
||||
// path covers BYOK ids that prefix-match a name-only native arm (no platform
|
||||
// billing). The drift checker in molecule-controlplane mirrors this exact OR.
|
||||
func validateRegisteredModelForRuntime(runtime, model string) (bool, string) {
|
||||
model = strings.TrimSpace(model)
|
||||
if model == "" {
|
||||
@@ -62,117 +51,7 @@ func validateRegisteredModelForRuntime(runtime, model string) (bool, string) {
|
||||
return true, ""
|
||||
}
|
||||
}
|
||||
// ROUTABILITY-AWARE allow path (cp#529, CTO-approved Option C). The model is
|
||||
// NOT on the runtime's platform menu (ModelsForRuntime) — but a model can be
|
||||
// legitimately SELECTABLE without being a platform-menu id: a BYOK id whose
|
||||
// prefix matches one of the runtime's NATIVE provider arms (a name-only arm
|
||||
// added in providers.yaml) resolves to a concrete provider via DeriveProvider
|
||||
// even though it carries no platform billing. Allow it iff DeriveProvider
|
||||
// resolves a provider for (runtime, model). A genuinely-unroutable id (no
|
||||
// native provider prefix-owns it) still falls through to the 422 below.
|
||||
//
|
||||
// BILLING GUARDRAIL: only CONFIRMED-NON-PLATFORM (BYOK) providers are wired as
|
||||
// name-only arms in providers.yaml (never platform/anthropic-*/openai-*/
|
||||
// moonshot/minimax/google/vertex), so a DeriveProvider-resolved id reached by
|
||||
// THIS path can never bill the platform's key for a customer's model. The
|
||||
// platform-menu ids that DO carry platform billing are already allowed by the
|
||||
// exact-membership loop above; this path only ever resolves to a BYOK arm.
|
||||
if _, derr := m.DeriveProvider(runtime, model, nil); derr == nil {
|
||||
return true, ""
|
||||
}
|
||||
return false, fmt.Sprintf(
|
||||
"model %q is not a registered model for runtime %q; pick one of the runtime's registered models (provider-registry SSOT, internal#718)",
|
||||
model, runtime)
|
||||
}
|
||||
|
||||
// validateDerivedProviderInRegistry (issue #2172) is the provider-side companion
|
||||
// to validateRegisteredModelForRuntime. The model-side check asks "is this
|
||||
// (runtime, model) in the registry?"; the provider-side check asks "is the
|
||||
// provider this model DERIVES to — the same one the adapter will resolve at
|
||||
// boot — a known provider in providers.yaml?"
|
||||
//
|
||||
// Live trigger (adk-demo Assistant, 2026-06-03): workspace config
|
||||
// `model=moonshot/kimi-k2.6` (claude-code) → adapter derives `provider=moonshot`
|
||||
// → `ValueError: provider=moonshot not in providers registry` at BOOT. The
|
||||
// save was accepted (no validation at the API boundary), and the failure only
|
||||
// surfaced when the agent tried to register. CI never saw it. The drift gate
|
||||
// (RFC#580) validates TEMPLATES against the registry, NOT per-workspace
|
||||
// configs; the existing model-side check rejects a model the runtime doesn't
|
||||
// own but says nothing about the DERIVED provider's registry membership.
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// (true, "") — pass: model is empty (MODEL_REQUIRED owns it), the
|
||||
// runtime is not in the registry (fail-open for
|
||||
// federated / non-first-party runtimes — mirror of the
|
||||
// model-side check's federation contract), the registry
|
||||
// failed to load (build-time gate owns it), OR the
|
||||
// derived provider name is a known provider in the
|
||||
// registry's `providers:` list.
|
||||
// (false, reason) — reject: a known (runtime, model) pair derives to a
|
||||
// provider name absent from the providers list. This is
|
||||
// the structural class the adk-demo boot failure belongs
|
||||
// to — the registry's `runtimes:` block references a
|
||||
// provider not declared in `providers:`, which by
|
||||
// construction is a registry-data bug. Catching it at
|
||||
// config-SAVE keeps it out of the agent-boot path.
|
||||
//
|
||||
// Defense-in-depth: by construction, a model in a runtime's native provider set
|
||||
// has a provider that IS in the catalog (the runtime ref names a provider from
|
||||
// the providers list). So the rejection path is primarily a registry-consistency
|
||||
// guard. The real value is the FAIL-LOUD semantics — any future drift between
|
||||
// `providers:` and `runtimes:` fails the create call with a clear pointer to
|
||||
// the missing provider, instead of silently wedging the agent at boot.
|
||||
func validateDerivedProviderInRegistry(runtime, model string) (bool, string) {
|
||||
model = strings.TrimSpace(model)
|
||||
if model == "" {
|
||||
return true, "" // MODEL_REQUIRED owns this.
|
||||
}
|
||||
m, err := providerRegistry()
|
||||
if err != nil || m == nil {
|
||||
// Registry unavailable (build-time defect the gates catch). Fail open —
|
||||
// do not block create on a registry-load failure.
|
||||
return true, ""
|
||||
}
|
||||
// DeriveProvider is fail-closed for unknown runtimes. Mirror the
|
||||
// model-side check's federation contract: a runtime the registry does
|
||||
// NOT know (langgraph / external / kimi / mock / federated) is allowed
|
||||
// to pass through. DeriveProvider's `unknown runtime` error IS that
|
||||
// signal — treat it as fail-open, identical to ModelsForRuntime's
|
||||
// not-found behavior above.
|
||||
p, err := m.DeriveProvider(runtime, model, nil)
|
||||
if err != nil {
|
||||
// Either the runtime is unknown (fail-open by contract) OR the model
|
||||
// is not native to the runtime (the model-side validator already
|
||||
// rejected this — DeriveProvider's error here means
|
||||
// validateRegisteredModelForRuntime should have caught it. Don't
|
||||
// double-reject: pass through and let the model-side response own
|
||||
// the message).
|
||||
return true, ""
|
||||
}
|
||||
// Defense-in-depth: confirm the DERIVED provider is a known entry in the
|
||||
// providers list. By construction it should be (DeriveProvider only
|
||||
// returns a Provider that was looked up by name from `providers:`), but
|
||||
// a future federation merge could introduce a runtime ref pointing at a
|
||||
// contributed provider absent from the core catalog. Reject loudly here
|
||||
// rather than letting the save reach the agent-boot path and wedge with
|
||||
// "provider=X not in providers registry" (the original adk-demo class).
|
||||
for _, candidate := range m.Providers {
|
||||
if candidate.Name == p.Name {
|
||||
return true, ""
|
||||
}
|
||||
}
|
||||
// Build a sorted, comma-separated list of valid provider names so the
|
||||
// operator/caller sees the actionable list (the boot-time error message
|
||||
// the adk-demo class produced does NOT include this — the fix is to
|
||||
// surface it at the API boundary, where the caller can fix the request
|
||||
// without a stuck workspace + operator page).
|
||||
valid := make([]string, 0, len(m.Providers))
|
||||
for _, c := range m.Providers {
|
||||
valid = append(valid, c.Name)
|
||||
}
|
||||
sort.Strings(valid)
|
||||
return false, fmt.Sprintf(
|
||||
"derived provider %q (for model %q on runtime %q) is not in the providers registry; pick a model whose derived provider is one of: %s",
|
||||
p.Name, model, runtime, strings.Join(valid, ", "))
|
||||
}
|
||||
|
||||
@@ -6,17 +6,8 @@ package handlers
|
||||
// fail OPEN (allow) for a runtime the registry doesn't know yet (federation /
|
||||
// langgraph/etc. not in the first-party registry) so the existing knownRuntimes
|
||||
// gate stays authoritative there.
|
||||
//
|
||||
// TestValidateDerivedProviderInRegistry (issue #2172) is the provider-side
|
||||
// companion: once the model-side check passes, confirm the DERIVED provider
|
||||
// (the one the adapter will resolve at boot) is a known provider in
|
||||
// providers.yaml. Catches the adk-demo "provider=X not in providers registry"
|
||||
// class at config-SAVE time instead of letting it wedge the agent at boot.
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
import "testing"
|
||||
|
||||
func TestValidateRegisteredModelForRuntime(t *testing.T) {
|
||||
type tc struct {
|
||||
@@ -79,50 +70,6 @@ func TestValidateRegisteredModelForRuntime(t *testing.T) {
|
||||
model: "",
|
||||
wantOK: true,
|
||||
},
|
||||
// ---- cp#529 routability-aware allow path -------------------------------
|
||||
{
|
||||
// BYOK passthrough id: NOT on hermes's platform menu, but the
|
||||
// openrouter name-only native arm prefix-owns it → DeriveProvider
|
||||
// resolves → ALLOWED (no platform billing — openrouter is BYOK).
|
||||
name: "byok_passthrough_routable_now_allowed",
|
||||
runtime: "hermes",
|
||||
model: "openrouter/anthropic/claude-3.5-sonnet",
|
||||
wantOK: true,
|
||||
},
|
||||
{
|
||||
// BYOK namespaced vendor id: deepseek's widened ^deepseek[-:/]
|
||||
// matches the vendor/ form on a name-only hermes arm → allowed.
|
||||
name: "byok_namespaced_vendor_routable_now_allowed",
|
||||
runtime: "hermes",
|
||||
model: "deepseek/deepseek-chat",
|
||||
wantOK: true,
|
||||
},
|
||||
{
|
||||
// claude-code bare GLM- BYOK id: zai name-only arm + (?i)^(glm-|…)
|
||||
// matches → DeriveProvider resolves → allowed.
|
||||
name: "claude_code_bare_glm_byok_routable_now_allowed",
|
||||
runtime: "claude-code",
|
||||
model: "GLM-4.6",
|
||||
wantOK: true,
|
||||
},
|
||||
{
|
||||
// Genuinely UNROUTABLE id: no native hermes arm prefix-owns bare
|
||||
// gpt-4o (the platform-shared openai vendor is NOT wired into hermes
|
||||
// — billing guardrail), so DeriveProvider errors → still 422.
|
||||
name: "genuinely_unroutable_still_rejected",
|
||||
runtime: "hermes",
|
||||
model: "gpt-4o",
|
||||
wantOK: false,
|
||||
},
|
||||
{
|
||||
// A namespaced vendor id NOW routable on hermes via the dedicated
|
||||
// byok-openai provider (cp#529 BYOK-vendor arms): routes with the
|
||||
// tenant's OPENAI_API_KEY → BYOK billing, never the platform key.
|
||||
name: "byok_openai_namespaced_routable_now_allowed",
|
||||
runtime: "hermes",
|
||||
model: "openai/gpt-4o",
|
||||
wantOK: true,
|
||||
},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
@@ -133,163 +80,3 @@ func TestValidateRegisteredModelForRuntime(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateDerivedProviderInRegistry(t *testing.T) {
|
||||
type tc struct {
|
||||
name string
|
||||
runtime string
|
||||
model string
|
||||
wantOK bool
|
||||
// wantReasonContains: a substring the rejection reason must include
|
||||
// (skipped for OK cases). Pins the actionable list / derivation pointer
|
||||
// so the caller knows which provider was missing and what the valid
|
||||
// set looks like — this is the fix that distinguishes the new gate
|
||||
// from the boot-time "provider=X not in providers registry" string
|
||||
// it replaces.
|
||||
wantReasonContains string
|
||||
}
|
||||
cases := []tc{
|
||||
// PASS — every native (runtime, model) in the catalog derives to a
|
||||
// provider that IS in the providers list. These are the live corpus
|
||||
// entries; the test pins the registry-consistency invariant.
|
||||
{
|
||||
name: "claude_code_anthropic_api_native",
|
||||
runtime: "claude-code",
|
||||
model: "claude-sonnet-4-6",
|
||||
wantOK: true,
|
||||
},
|
||||
{
|
||||
name: "claude_code_kimi_coding_native",
|
||||
runtime: "claude-code",
|
||||
model: "kimi-for-coding",
|
||||
wantOK: true,
|
||||
},
|
||||
{
|
||||
name: "claude_code_minimax_native",
|
||||
runtime: "claude-code",
|
||||
model: "MiniMax-M2.7",
|
||||
wantOK: true,
|
||||
},
|
||||
{
|
||||
name: "claude_code_platform_namespaced",
|
||||
runtime: "claude-code",
|
||||
model: "moonshot/kimi-k2.6",
|
||||
wantOK: true,
|
||||
},
|
||||
{
|
||||
name: "codex_openai_subscription_default_arm",
|
||||
runtime: "codex",
|
||||
model: "gpt-5.5",
|
||||
wantOK: true,
|
||||
},
|
||||
{
|
||||
name: "codex_platform_namespaced",
|
||||
runtime: "codex",
|
||||
model: "openai/gpt-5.4-mini",
|
||||
wantOK: true,
|
||||
},
|
||||
{
|
||||
name: "hermes_kimi_coding",
|
||||
runtime: "hermes",
|
||||
model: "kimi-coding/kimi-k2",
|
||||
wantOK: true,
|
||||
},
|
||||
{
|
||||
name: "hermes_platform_namespaced",
|
||||
runtime: "hermes",
|
||||
model: "moonshot/kimi-k2.6",
|
||||
wantOK: true,
|
||||
},
|
||||
{
|
||||
name: "openclaw_kimi_coding",
|
||||
runtime: "openclaw",
|
||||
model: "moonshot:kimi-k2.6",
|
||||
wantOK: true,
|
||||
},
|
||||
// FAIL — model-side validator catches this, but the provider-side
|
||||
// gate is called AFTER it in Create and inherits the fail-open
|
||||
// contract for "model is not native to runtime" (DeriveProvider
|
||||
// errors → allow, letting the model-side response own the message).
|
||||
// This is the deliberate "don't double-reject" decision.
|
||||
{
|
||||
name: "unregistered_model_pass_through_to_model_side",
|
||||
runtime: "claude-code",
|
||||
model: "totally-made-up-model-xyz",
|
||||
wantOK: true, // pass-through: model-side validator owns the rejection
|
||||
},
|
||||
// Federation contract — mirror of the model-side test above.
|
||||
{
|
||||
name: "langgraph_runtime_failopen",
|
||||
runtime: "langgraph",
|
||||
model: "anything-goes",
|
||||
wantOK: true,
|
||||
},
|
||||
{
|
||||
name: "external_runtime_failopen",
|
||||
runtime: "external",
|
||||
model: "whatever",
|
||||
wantOK: true,
|
||||
},
|
||||
// Empty model — MODEL_REQUIRED owns it; allow.
|
||||
{
|
||||
name: "empty_model_allowed_other_gate_owns_it",
|
||||
runtime: "claude-code",
|
||||
model: "",
|
||||
wantOK: true,
|
||||
},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
ok, why := validateDerivedProviderInRegistry(c.runtime, c.model)
|
||||
if ok != c.wantOK {
|
||||
t.Errorf("validateDerivedProviderInRegistry(%q,%q) ok=%v want %v (reason=%q)",
|
||||
c.runtime, c.model, ok, c.wantOK, why)
|
||||
}
|
||||
if !c.wantOK && c.wantReasonContains != "" && !strings.Contains(why, c.wantReasonContains) {
|
||||
t.Errorf("rejection reason missing %q: got %q", c.wantReasonContains, why)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestRegistryConsistency_AllNativeModelsDeriveToKnownProvider walks every
|
||||
// (runtime, model) pair in the registry's native model sets and asserts each
|
||||
// one derives to a provider that IS in the providers list. This is the
|
||||
// static regression gate the issue calls for ("a CI test fails if any shipped
|
||||
// demo/template config references an unregistered provider") — generalized
|
||||
// to the catalog as a whole: if anyone edits providers.yaml such that a
|
||||
// `runtimes:` block names a provider absent from `providers:`, this test
|
||||
// fires before the bad config can reach a customer workspace.
|
||||
//
|
||||
// By construction this invariant should always hold (DeriveProvider only
|
||||
// returns a Provider that was looked up by name from `providers:`), so the
|
||||
// test primarily guards against future federation merges that introduce a
|
||||
// runtime ref pointing at a contributed provider absent from the core
|
||||
// catalog — exactly the failure shape the adk-demo Assistant wedge
|
||||
// belongs to.
|
||||
func TestRegistryConsistency_AllNativeModelsDeriveToKnownProvider(t *testing.T) {
|
||||
m, err := providerRegistry()
|
||||
if err != nil || m == nil {
|
||||
t.Skipf("providerRegistry unavailable in test env (err=%v); skipping consistency walk", err)
|
||||
}
|
||||
providerNames := make(map[string]struct{}, len(m.Providers))
|
||||
for _, p := range m.Providers {
|
||||
providerNames[p.Name] = struct{}{}
|
||||
}
|
||||
for runtimeName, runtime := range m.Runtimes {
|
||||
for _, ref := range runtime.Providers {
|
||||
for _, modelID := range ref.Models {
|
||||
p, err := m.DeriveProvider(runtimeName, modelID, nil)
|
||||
if err != nil {
|
||||
t.Errorf("catalog invariant broken: runtime=%q model=%q failed DeriveProvider: %v",
|
||||
runtimeName, modelID, err)
|
||||
continue
|
||||
}
|
||||
if _, ok := providerNames[p.Name]; !ok {
|
||||
t.Errorf("catalog invariant broken: runtime=%q model=%q derives to provider %q which is not in the providers list (refs=%q)",
|
||||
runtimeName, modelID, p.Name, ref.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user