Compare commits
50 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 264a924c6d | |||
| eb19bb4882 | |||
| 27015b7c75 | |||
| ee4001342d | |||
| de405023c6 | |||
| d326b474fb | |||
| 251df965e9 | |||
| d417a7e52d | |||
| fcc7934b89 | |||
| b6112a62b3 | |||
| 4e4dba3852 | |||
| f874419489 | |||
| 6b16d99655 | |||
| 2584a18862 | |||
| cbd98adc6d | |||
| b9dd026341 | |||
| 82a3f23540 | |||
| c69193ae3e | |||
| 07040361b2 | |||
| 7822105058 | |||
| b5765148a4 | |||
| a29b6c8c38 | |||
| e4d8229877 | |||
| e9dea8233b | |||
| 42f77aba28 | |||
| 6c9cc581c9 | |||
| 09b1ffb5cc | |||
| 312168aefc | |||
| c8474fdc26 | |||
| 98f08397d0 | |||
| b1c623210c | |||
| 7a80cc064a | |||
| b7282b41f8 | |||
| c8932a47a6 | |||
| 1a88e9aeac | |||
| 9fde1b5506 | |||
| a7bdb8d860 | |||
| a342a0218e | |||
| b4a7933ddb | |||
| ea43f26ea4 | |||
| 35f5b91f5d | |||
| 675ab9df83 | |||
| 42af316a84 | |||
| 3dd310bfe7 | |||
| 00d2023d9c | |||
| 9fe7eb9a8e | |||
| 44ab45720f | |||
| 4f0f7b24c3 | |||
| 7c1a856f45 | |||
| d3c18384bd |
@@ -24,6 +24,17 @@ Three failure classes:
|
||||
F3 (B) and (C) are not set-equal. Audit env wider than protection
|
||||
→ audit flags non-force-merges as force; narrower → real
|
||||
force-merges are missed.
|
||||
F4 Context in (B) is emitted by NO workflow in .gitea/workflows/ at
|
||||
all (repo-wide, case-correct generalization of F2, which only
|
||||
covers `ci / `-prefixed names). This is the inverse-of-F2 hole and
|
||||
the one that makes the `CI / all-required` aggregator's
|
||||
name-vs-coverage gap safe: `all-required` is fail-closed over CI's
|
||||
OWN jobs but CANNOT cover sibling required workflows
|
||||
(`E2E API Smoke Test`, `Handlers Postgres Integration` — Gitea has
|
||||
no cross-workflow `needs:`). F4 verifies each cross-workflow
|
||||
required context still has a live emitter, so a renamed/deleted
|
||||
sibling workflow that BP still requires is caught instead of
|
||||
degrading to a silent absent-as-pending advisory gate.
|
||||
|
||||
Idempotency:
|
||||
Searches OPEN issues by exact title prefix
|
||||
@@ -380,6 +391,68 @@ def expected_context(job_key: str, workflow_name: str = "ci") -> str:
|
||||
return f"{workflow_name} / {job_key} (pull_request)"
|
||||
|
||||
|
||||
def workflow_emitted_contexts(wf_doc: dict) -> set[str]:
|
||||
"""The set of `pull_request` status-check contexts a SINGLE workflow
|
||||
emits, computed from its real `name:` + each job's `name or key`.
|
||||
|
||||
Gitea reports a context as `{workflow.name} / {job.name|job.key}
|
||||
(pull_request)`. Unlike `expected_context()` (which hard-codes the
|
||||
lowercase literal `ci` and the bare job-KEY — a shape that does NOT
|
||||
match this repo, whose workflow is `name: CI` and whose CI jobs DO
|
||||
set per-job `name:`), this reads the authoritative names straight
|
||||
from the parsed YAML, so the contexts it produces are byte-equal to
|
||||
what BP records. Used by F4 (cross-workflow emitter existence).
|
||||
|
||||
Jobs whose `if:` gates on `github.event_name`/`github.ref` are still
|
||||
emitters on the events they DO run — they remain in the set; F4 only
|
||||
asserts *existence of an emitter*, never that it ran on a given
|
||||
trigger."""
|
||||
name = wf_doc.get("name")
|
||||
if not isinstance(name, str) or not name:
|
||||
return set()
|
||||
jobs = wf_doc.get("jobs")
|
||||
if not isinstance(jobs, dict):
|
||||
return set()
|
||||
out: set[str] = set()
|
||||
for key, spec in jobs.items():
|
||||
job_name = key
|
||||
if isinstance(spec, dict) and isinstance(spec.get("name"), str) and spec["name"]:
|
||||
job_name = spec["name"]
|
||||
out.add(f"{name} / {job_name} (pull_request)")
|
||||
return out
|
||||
|
||||
|
||||
def all_emitted_contexts(workflows_dir: str = ".gitea/workflows") -> set[str]:
|
||||
"""Union of `pull_request` contexts emitted by EVERY workflow in the
|
||||
repo. F4 uses this to assert that each BP-required
|
||||
`status_check_contexts` entry corresponds to a real emitting
|
||||
workflow+job — closing the inverse-of-F2 hole where BP requires a
|
||||
context that NO workflow produces (e.g. a sibling workflow like
|
||||
`E2E API Smoke Test` or `Handlers Postgres Integration` was renamed
|
||||
or deleted while still required, leaving BP demanding a green it can
|
||||
never receive; Gitea treats absent-as-pending → silent advisory
|
||||
gate). This is what makes the misleadingly-named `CI / all-required`
|
||||
aggregator safe at the repo level: it only covers CI's own jobs, but
|
||||
F4 guarantees the cross-workflow required contexts it CANNOT cover
|
||||
are real and present."""
|
||||
import glob as _glob
|
||||
|
||||
emitted: set[str] = set()
|
||||
for path in sorted(_glob.glob(os.path.join(workflows_dir, "*.yml"))):
|
||||
try:
|
||||
with open(path, encoding="utf-8") as f:
|
||||
doc = yaml.safe_load(f)
|
||||
except (OSError, yaml.YAMLError):
|
||||
# A single unparseable sibling workflow must not blind F4 to
|
||||
# the rest. Skip it loudly; lint-workflow-yaml gates parse
|
||||
# validity separately.
|
||||
sys.stderr.write(f"::warning::F4: could not parse {path}, skipping\n")
|
||||
continue
|
||||
if isinstance(doc, dict):
|
||||
emitted |= workflow_emitted_contexts(doc)
|
||||
return emitted
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Drift detection
|
||||
# --------------------------------------------------------------------------
|
||||
@@ -531,6 +604,36 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
|
||||
+ "\n".join(f" - {c}" for c in stale_protection)
|
||||
)
|
||||
|
||||
# ----- F4: cross-workflow required context has no emitting workflow -----
|
||||
# F2 (above) is scoped to `ci / `-prefixed contexts ONLY, and built
|
||||
# from the hard-coded lowercase literal `ci` + bare job-keys — a shape
|
||||
# that does NOT match this repo (workflow is `name: CI`, jobs set their
|
||||
# own `name:`), so F2 is effectively dormant here. F4 is the
|
||||
# case-correct, REPO-WIDE generalization: it parses every workflow's
|
||||
# real `name:` + job `name|key` and asserts that EVERY BP-required
|
||||
# context is actually emitted by some workflow.
|
||||
#
|
||||
# This is the gate that makes the `CI / all-required` aggregator's
|
||||
# name-vs-coverage gap safe. `all-required` is fail-closed over CI's
|
||||
# OWN jobs but — by Gitea's design (no cross-workflow `needs:`) — it
|
||||
# CANNOT and does not cover sibling required workflows
|
||||
# (`E2E API Smoke Test`, `Handlers Postgres Integration`). Those MUST
|
||||
# be listed in BP independently. F4 verifies each such BP context
|
||||
# still has a live emitter, so the inverse-of-F2 hole — BP requires a
|
||||
# context that no workflow produces (rename/delete a sibling workflow
|
||||
# while still required → Gitea treats absent-as-pending → silent
|
||||
# advisory gate, and a red PR can look mergeable) — is caught.
|
||||
repo_emitted = all_emitted_contexts(os.path.dirname(CI_WORKFLOW_PATH))
|
||||
unemitted = sorted(c for c in contexts if c not in repo_emitted)
|
||||
if unemitted:
|
||||
findings.append(
|
||||
"F4 — branch_protections/{br}.status_check_contexts entries that "
|
||||
"NO workflow in .gitea/workflows/ emits "
|
||||
"(stale required name → silent advisory gate; a red PR can look "
|
||||
"mergeable):\n".format(br=branch)
|
||||
+ "\n".join(f" - {c}" for c in unemitted)
|
||||
)
|
||||
|
||||
# ----- F3: audit env vs protection contexts (set-equal) -----
|
||||
only_in_env = sorted(env_set - contexts)
|
||||
only_in_protection = sorted(contexts - env_set)
|
||||
@@ -556,6 +659,7 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
|
||||
"protection_contexts": sorted(contexts),
|
||||
"audit_env_checks": sorted(env_set),
|
||||
"expected_contexts": sorted(emitted_contexts),
|
||||
"repo_emitted_contexts": sorted(repo_emitted),
|
||||
}
|
||||
return findings, debug
|
||||
|
||||
|
||||
+35
@@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env bash
|
||||
# Drift-prevention guard: SEV #2499 class (KI-013 container/volume naming).
|
||||
#
|
||||
# KI-013 removed 12-char UUID truncation from container/volume names.
|
||||
# E2E scripts must use FULL workspace IDs when referencing containers
|
||||
# and volumes. Any :0:12 substring-match truncation is a regression risk.
|
||||
#
|
||||
# Scans ALL .sh files under tests/e2e/ (including lib/ and subdirs).
|
||||
# Run: bash .gitea/scripts/lint-e2e-ki013-container-names.sh
|
||||
set -euo pipefail
|
||||
|
||||
PAT=':0:12([^0-9]|$)'
|
||||
ERR=0
|
||||
|
||||
# Use find to recurse into tests/e2e subdirs (lib/, cron/, etc.)
|
||||
while IFS= read -r -d '' f; do
|
||||
MATCHES=$(grep -nE "$PAT" "$f" 2>/dev/null || true)
|
||||
if [ -n "$MATCHES" ]; then
|
||||
echo "::error::SEV-2499 drift guard: truncated workspace ID (:0:12) in E2E script"
|
||||
echo "::error::file=$f"
|
||||
echo "$MATCHES" | while read -r line; do
|
||||
echo "::error:: $line"
|
||||
done
|
||||
ERR=1
|
||||
fi
|
||||
done < <(find tests/e2e -type f -name '*.sh' -print0)
|
||||
|
||||
if [ "$ERR" -ne 0 ]; then
|
||||
echo ""
|
||||
echo "FAIL: E2E scripts use 12-char truncated IDs (:0:12)."
|
||||
echo " KI-013 requires FULL workspace IDs. Update the flagged lines."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "PASS: No truncated workspace IDs in E2E scripts."
|
||||
@@ -66,6 +66,14 @@ def build_plan(env: dict[str, str]) -> dict:
|
||||
"target_tag": target_tag,
|
||||
"soak_seconds": _int_env(env, "PROD_AUTO_DEPLOY_SOAK_SECONDS", 60, minimum=0),
|
||||
"batch_size": _int_env(env, "PROD_AUTO_DEPLOY_BATCH_SIZE", 3),
|
||||
# Tolerate a small minority of individually-stuck tenants (e.g. a wedged
|
||||
# data volume that won't recreate). They are QUARANTINED — shipped past
|
||||
# so the healthy majority still lands the build — and reported for
|
||||
# separate recovery, instead of one stuck tenant blocking the whole
|
||||
# fleet deploy. The canary still must pass, the CP halts a batch the
|
||||
# moment failures exceed this, and the cross-batch coverage gate below
|
||||
# enforces the same tolerance globally. Default 1.
|
||||
"max_stragglers": _int_env(env, "PROD_AUTO_DEPLOY_MAX_STRAGGLERS", 1, minimum=0),
|
||||
"dry_run": truthy_flag(env.get("PROD_AUTO_DEPLOY_DRY_RUN", "")),
|
||||
# confirm:true ack required by CP /cp/admin/tenants/redeploy-fleet
|
||||
# contract (cp#228 / task #308) for fleet-wide intent. Empty body
|
||||
@@ -251,26 +259,41 @@ def rollout_stragglers(enumerated: list[str], results: list[dict]) -> list[str]:
|
||||
return sorted(s for s in dict.fromkeys(enumerated) if s not in verified)
|
||||
|
||||
|
||||
def assert_full_coverage(enumerated: list[str], aggregate: dict, dry_run: bool) -> None:
|
||||
"""Fail the rollout if any enumerated tenant is not on the target build.
|
||||
def assert_full_coverage(
|
||||
enumerated: list[str], aggregate: dict, dry_run: bool, max_stragglers: int = 0
|
||||
) -> None:
|
||||
"""Gate the rollout on coverage, tolerating a quarantined straggler minority.
|
||||
|
||||
This is the no-silent-skip gate (internal#724). A dry run proves
|
||||
nothing landed, so coverage is not asserted for it.
|
||||
This is the no-silent-skip gate (internal#724) made resilient: every
|
||||
enumerated tenant must be PROVEN on the target build, EXCEPT up to
|
||||
``max_stragglers`` individually-stuck tenants which are quarantined (shipped
|
||||
past) and reported for separate recovery instead of blocking the whole
|
||||
fleet deploy. Exceeding the tolerance is a systemic failure → RolloutFailed.
|
||||
A dry run proves nothing landed, so coverage is not asserted for it.
|
||||
"""
|
||||
|
||||
if dry_run:
|
||||
return
|
||||
stragglers = rollout_stragglers(enumerated, aggregate.get("results") or [])
|
||||
if stragglers:
|
||||
if not stragglers:
|
||||
return
|
||||
# Surface the stragglers (for the step summary + recovery), gate or not.
|
||||
aggregate["stragglers"] = stragglers
|
||||
if len(stragglers) > max_stragglers:
|
||||
msg = (
|
||||
f"incomplete rollout: {len(stragglers)} tenant(s) not verified on target "
|
||||
f"after redeploy-fleet: {', '.join(stragglers)} "
|
||||
f"after redeploy-fleet (max tolerated {max_stragglers}): {', '.join(stragglers)} "
|
||||
f"(enumerated {len(set(enumerated))})"
|
||||
)
|
||||
aggregate["ok"] = False
|
||||
aggregate["error"] = msg
|
||||
aggregate["stragglers"] = stragglers
|
||||
raise RolloutFailed(msg, aggregate)
|
||||
# Within tolerance: shipped to the healthy majority; quarantine is loud,
|
||||
# not fatal. The deploy succeeds; the stragglers need individual recovery.
|
||||
print(
|
||||
f"::warning::quarantined {len(stragglers)} straggler(s) (<= max {max_stragglers}); "
|
||||
f"shipped to the rest of the fleet — these need recovery: {', '.join(stragglers)}"
|
||||
)
|
||||
|
||||
|
||||
def execute_scoped_rollout(
|
||||
@@ -325,7 +348,8 @@ def execute_scoped_rollout(
|
||||
# or one enumerated but never batched, is a straggler. Surfacing it as
|
||||
# a RolloutFailed makes the deploy step exit non-zero instead of
|
||||
# silently reporting success (the exact agents-team failure mode).
|
||||
assert_full_coverage(all_slugs, aggregate, dry_run)
|
||||
max_stragglers = int(base_body.get("max_stragglers") or 0)
|
||||
assert_full_coverage(all_slugs, aggregate, dry_run, max_stragglers)
|
||||
|
||||
return aggregate
|
||||
|
||||
|
||||
@@ -351,7 +351,8 @@ def compute_ack_state(
|
||||
latest_directive[(user, slug)] = kind
|
||||
|
||||
# Step 2: build candidate ackers per slug.
|
||||
# Filter out self-acks and unknown slugs.
|
||||
# Filter out self-acks and unknown slugs. Author self-ack is forbidden
|
||||
# per .gitea/sop-checklist-config.yaml — a non-author peer must ack.
|
||||
ackers_per_slug: dict[str, list[str]] = {s: [] for s in items_by_slug}
|
||||
rejected_self: dict[str, list[str]] = {s: [] for s in items_by_slug}
|
||||
pending_team_check: dict[str, list[str]] = {s: [] for s in items_by_slug}
|
||||
|
||||
@@ -275,3 +275,125 @@ def test_detect_drift_no_f1_when_needs_empty_even_with_jobs():
|
||||
findings, _ = drift.detect_drift("main")
|
||||
|
||||
assert not any("F1 —" in f for f in findings)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# F4 — cross-workflow required-context emitter existence
|
||||
# (closes the `CI / all-required` name-vs-coverage hole: the sentinel is
|
||||
# fail-closed over CI's own jobs but CANNOT cover sibling required
|
||||
# workflows — Gitea has no cross-workflow `needs:` — so F4 guarantees each
|
||||
# BP-required context still has a live emitting workflow.)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_workflow_emitted_contexts_uses_job_name_over_key():
|
||||
"""Job `name:` wins over key; missing name falls back to key."""
|
||||
doc = {
|
||||
"name": "E2E API Smoke Test",
|
||||
"jobs": {
|
||||
"detect-changes": {}, # no name -> key
|
||||
"e2e-api": {"name": "E2E API Smoke Test"},
|
||||
},
|
||||
}
|
||||
got = drift.workflow_emitted_contexts(doc)
|
||||
assert got == {
|
||||
"E2E API Smoke Test / detect-changes (pull_request)",
|
||||
"E2E API Smoke Test / E2E API Smoke Test (pull_request)",
|
||||
}
|
||||
|
||||
|
||||
def test_workflow_emitted_contexts_empty_when_no_name():
|
||||
"""A workflow with no top-level `name:` emits nothing F4 can match."""
|
||||
assert drift.workflow_emitted_contexts({"jobs": {"x": {}}}) == set()
|
||||
|
||||
|
||||
def test_all_emitted_contexts_unions_workflow_dir(tmp_path):
|
||||
"""all_emitted_contexts globs *.yml and unions their emitter sets."""
|
||||
wf = tmp_path / "wf"
|
||||
wf.mkdir()
|
||||
(wf / "a.yml").write_text(
|
||||
"name: CI\njobs:\n all-required:\n runs-on: x\n", encoding="utf-8"
|
||||
)
|
||||
(wf / "b.yml").write_text(
|
||||
"name: Handlers Postgres Integration\n"
|
||||
"jobs:\n integration:\n name: Handlers Postgres Integration\n"
|
||||
" runs-on: x\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
got = drift.all_emitted_contexts(str(wf))
|
||||
assert "CI / all-required (pull_request)" in got
|
||||
assert "Handlers Postgres Integration / Handlers Postgres Integration (pull_request)" in got
|
||||
|
||||
|
||||
def test_all_emitted_contexts_skips_unparseable(tmp_path):
|
||||
"""A single broken sibling workflow must not blind F4 to the rest."""
|
||||
wf = tmp_path / "wf"
|
||||
wf.mkdir()
|
||||
(wf / "good.yml").write_text("name: CI\njobs:\n j:\n runs-on: x\n", encoding="utf-8")
|
||||
(wf / "bad.yml").write_text("name: [unterminated\n : : :\n", encoding="utf-8")
|
||||
got = drift.all_emitted_contexts(str(wf))
|
||||
assert "CI / j (pull_request)" in got
|
||||
|
||||
|
||||
# A BP fixture that includes the two cross-workflow required contexts.
|
||||
_BP_WITH_SIBLINGS = {
|
||||
"status_check_contexts": [
|
||||
"CI / all-required (pull_request)",
|
||||
"E2E API Smoke Test / E2E API Smoke Test (pull_request)",
|
||||
"Handlers Postgres Integration / Handlers Postgres Integration (pull_request)",
|
||||
]
|
||||
}
|
||||
|
||||
# The matching set of repo-wide emitted contexts (what a correct repo produces).
|
||||
_EMITTED_OK = {
|
||||
"CI / all-required (pull_request)",
|
||||
"E2E API Smoke Test / E2E API Smoke Test (pull_request)",
|
||||
"Handlers Postgres Integration / Handlers Postgres Integration (pull_request)",
|
||||
}
|
||||
|
||||
|
||||
def test_detect_drift_f4_silent_when_all_contexts_emitted():
|
||||
"""No F4 when every BP context has a live emitting workflow."""
|
||||
ci = _make_ci_doc({"all-required": {}})
|
||||
audit = _make_audit_doc(sorted(_BP_WITH_SIBLINGS["status_check_contexts"]))
|
||||
with patch.object(drift, "load_yaml", side_effect=[ci, audit]):
|
||||
with patch.object(drift, "api", return_value=(200, _BP_WITH_SIBLINGS)):
|
||||
with patch.object(drift, "all_emitted_contexts", return_value=set(_EMITTED_OK)):
|
||||
findings, debug = drift.detect_drift("main")
|
||||
assert not any("F4 —" in f for f in findings)
|
||||
assert debug["repo_emitted_contexts"] == sorted(_EMITTED_OK)
|
||||
|
||||
|
||||
def test_detect_drift_f4_fires_on_stale_cross_workflow_context():
|
||||
"""The core gate-hole regression: BP requires a cross-workflow context
|
||||
(e.g. a renamed/deleted sibling workflow) that NO workflow emits.
|
||||
F4 must fire — this is the inverse-of-F2 hole that makes a red PR look
|
||||
mergeable if BP is ever trimmed/renamed around `CI / all-required`."""
|
||||
ci = _make_ci_doc({"all-required": {}})
|
||||
audit = _make_audit_doc(sorted(_BP_WITH_SIBLINGS["status_check_contexts"]))
|
||||
# Handlers workflow got renamed -> its OLD BP context now has no emitter.
|
||||
emitted_after_rename = {
|
||||
"CI / all-required (pull_request)",
|
||||
"E2E API Smoke Test / E2E API Smoke Test (pull_request)",
|
||||
# Handlers context absent (renamed away)
|
||||
}
|
||||
with patch.object(drift, "load_yaml", side_effect=[ci, audit]):
|
||||
with patch.object(drift, "api", return_value=(200, _BP_WITH_SIBLINGS)):
|
||||
with patch.object(drift, "all_emitted_contexts", return_value=emitted_after_rename):
|
||||
findings, _ = drift.detect_drift("main")
|
||||
assert any("F4 —" in f for f in findings)
|
||||
assert any("Handlers Postgres Integration" in f for f in findings)
|
||||
|
||||
|
||||
def test_detect_drift_f4_catches_all_required_only_trim():
|
||||
"""If BP is trimmed to JUST `CI / all-required` but E2E/Handlers are
|
||||
still real workflows, F4 does NOT fire (no stale context) — but F3b
|
||||
(env vs BP) / operator policy must keep them required. This asserts F4
|
||||
does not false-positive on a correctly-emitted lone context."""
|
||||
bp = {"status_check_contexts": ["CI / all-required (pull_request)"]}
|
||||
ci = _make_ci_doc({"all-required": {}})
|
||||
audit = _make_audit_doc(["CI / all-required (pull_request)"])
|
||||
with patch.object(drift, "load_yaml", side_effect=[ci, audit]):
|
||||
with patch.object(drift, "api", return_value=(200, bp)):
|
||||
with patch.object(drift, "all_emitted_contexts", return_value=set(_EMITTED_OK)):
|
||||
findings, _ = drift.detect_drift("main")
|
||||
assert not any("F4 —" in f for f in findings)
|
||||
|
||||
@@ -35,6 +35,9 @@ def test_build_plan_defaults_to_staging_sha_target_and_prod_cp():
|
||||
"canary_slug": "hongming",
|
||||
"soak_seconds": 60,
|
||||
"batch_size": 3,
|
||||
# quarantine up to 1 individually-stuck tenant rather than blocking the
|
||||
# whole fleet deploy (default).
|
||||
"max_stragglers": 1,
|
||||
"dry_run": False,
|
||||
# cp#228 / task #308: fleet-wide intent must carry confirm:true.
|
||||
"confirm": True,
|
||||
@@ -470,6 +473,72 @@ def test_scoped_rollout_passes_when_all_tenants_verified_on_target():
|
||||
assert "stragglers" not in aggregate
|
||||
|
||||
|
||||
def test_scoped_rollout_quarantines_straggler_within_tolerance():
|
||||
# reno-stars never verifies on target; max_stragglers=1 tolerates it — the
|
||||
# rollout still succeeds (ships to the healthy majority) and reports the
|
||||
# quarantined straggler instead of failing the whole deploy.
|
||||
def fake_redeploy(_cp_url, _token, body):
|
||||
return 200, {
|
||||
"ok": True,
|
||||
"results": [
|
||||
{"slug": s, "verified_on_target": (s != "reno-stars")}
|
||||
for s in body["only_slugs"]
|
||||
],
|
||||
}
|
||||
|
||||
aggregate = prod.execute_scoped_rollout(
|
||||
{
|
||||
"cp_url": "https://api.moleculesai.app",
|
||||
"body": {
|
||||
"target_tag": "staging-new",
|
||||
"batch_size": 5,
|
||||
"dry_run": False,
|
||||
"confirm": True,
|
||||
"max_stragglers": 1,
|
||||
},
|
||||
},
|
||||
token="secret",
|
||||
list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
|
||||
redeploy=fake_redeploy,
|
||||
sleep=lambda _s: None,
|
||||
)
|
||||
assert aggregate["ok"] is True
|
||||
assert aggregate["stragglers"] == ["reno-stars"]
|
||||
|
||||
|
||||
def test_scoped_rollout_fails_when_stragglers_exceed_tolerance():
|
||||
# Two tenants never verify; with max_stragglers=1 that is systemic → fail.
|
||||
def fake_redeploy(_cp_url, _token, body):
|
||||
return 200, {
|
||||
"ok": True,
|
||||
"results": [
|
||||
{"slug": s, "verified_on_target": (s == "hongming")}
|
||||
for s in body["only_slugs"]
|
||||
],
|
||||
}
|
||||
|
||||
try:
|
||||
prod.execute_scoped_rollout(
|
||||
{
|
||||
"cp_url": "https://api.moleculesai.app",
|
||||
"body": {
|
||||
"target_tag": "staging-new",
|
||||
"batch_size": 5,
|
||||
"dry_run": False,
|
||||
"confirm": True,
|
||||
"max_stragglers": 1,
|
||||
},
|
||||
},
|
||||
token="secret",
|
||||
list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
|
||||
redeploy=fake_redeploy,
|
||||
sleep=lambda _s: None,
|
||||
)
|
||||
raise AssertionError("expected RolloutFailed when stragglers exceed tolerance")
|
||||
except prod.RolloutFailed as exc:
|
||||
assert "max tolerated 1" in str(exc)
|
||||
|
||||
|
||||
def test_scoped_rollout_dry_run_does_not_assert_coverage():
|
||||
# A dry run proves nothing landed; coverage must NOT be asserted or
|
||||
# every plan would fail.
|
||||
|
||||
@@ -291,7 +291,8 @@ class TestComputeAckState(unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(state["comprehensive-testing"]["ackers"], ["bob"])
|
||||
|
||||
def test_self_ack_rejected(self):
|
||||
def test_self_ack_rejected_when_author_in_team(self):
|
||||
# Author self-acks are forbidden — a non-author peer must ack.
|
||||
comments = [_comment("alice", "/sop-ack comprehensive-testing")]
|
||||
state = sop.compute_ack_state(
|
||||
comments, "alice", self.items, self.aliases, self._approve_all
|
||||
@@ -722,16 +723,16 @@ class TestRootCauseAckEligibilityWidened(unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(state["root-cause"]["ackers"], ["hongming"])
|
||||
|
||||
def test_self_ack_still_forbidden_even_with_widened_eligibility(self):
|
||||
# Author cannot self-ack — widening teams must NOT weaken
|
||||
# the non-author rule.
|
||||
def test_self_ack_rejected_with_widened_eligibility(self):
|
||||
# Author self-acks are forbidden even when the author is in the
|
||||
# required team — a non-author peer must ack.
|
||||
comments = [_comment("alice", "/sop-ack root-cause")]
|
||||
probe = self._approve_only({"alice"})
|
||||
state = sop.compute_ack_state(
|
||||
comments, "alice", self.items, self.aliases, probe, high_risk=False
|
||||
)
|
||||
self.assertEqual(state["root-cause"]["ackers"], [])
|
||||
self.assertIn("alice", state["root-cause"]["rejected"]["self_ack"])
|
||||
self.assertEqual(state["root-cause"]["rejected"]["self_ack"], ["alice"])
|
||||
|
||||
|
||||
class TestHighRiskClassUsesElevatedListInConfig(unittest.TestCase):
|
||||
|
||||
@@ -394,6 +394,14 @@ jobs:
|
||||
# a revert of the zero-validated→RED logic goes red on every PR.
|
||||
bash tests/e2e/test_require_live_priority_gate_unit.sh
|
||||
|
||||
- if: ${{ needs.changes.outputs.scripts == 'true' }}
|
||||
name: Drift guard — KI-013 container/volume naming (SEV #2499)
|
||||
# KI-013 removed 12-char UUID truncation from container/volume names.
|
||||
# E2E scripts must use FULL workspace IDs. This fail-closed guard
|
||||
# prevents regressions where a new/modified script reintroduces the
|
||||
# old truncated-name pattern (the root cause of SEV #2499).
|
||||
run: bash .gitea/scripts/lint-e2e-ki013-container-names.sh
|
||||
|
||||
- if: ${{ needs.changes.outputs.scripts == 'true' }}
|
||||
name: Test ECR promote-tenant-image script (mock-driven, no live infra)
|
||||
# Covers scripts/promote-tenant-image.sh — the codified
|
||||
@@ -404,6 +412,14 @@ jobs:
|
||||
run: |
|
||||
bash scripts/test-promote-tenant-image.sh
|
||||
|
||||
- if: ${{ needs.changes.outputs.scripts == 'true' }}
|
||||
name: Drift guard — KI-013 container/volume naming (SEV #2499)
|
||||
# KI-013 removed 12-char UUID truncation from container/volume names.
|
||||
# E2E scripts must use FULL workspace IDs. This fail-closed guard
|
||||
# prevents regressions where a new/modified script reintroduces the
|
||||
# old truncated-name pattern (the root cause of SEV #2499).
|
||||
run: bash .gitea/scripts/lint-e2e-ki013-container-names.sh
|
||||
|
||||
- if: ${{ needs.changes.outputs.scripts == 'true' }}
|
||||
name: Shellcheck promote-tenant-image script
|
||||
# scripts/ is excluded from the bulk shellcheck pass above (legacy
|
||||
@@ -500,6 +516,27 @@ jobs:
|
||||
all-required:
|
||||
# Aggregator sentinel — RFC internal#219 §2 (Phase 4 — closes internal#286).
|
||||
#
|
||||
# ── SCOPE (read before trusting the name) ─────────────────────────
|
||||
# "all-required" means "all of THIS workflow's (CI's) required jobs"
|
||||
# — NOT "all of branch-protection's required checks". It is fail-
|
||||
# closed over its `needs:` (the CI jobs below), but Gitea Actions has
|
||||
# NO cross-workflow `needs:`, so this sentinel STRUCTURALLY CANNOT and
|
||||
# does not cover sibling required workflows that live in their own
|
||||
# files — notably:
|
||||
# • `E2E API Smoke Test` (.gitea/workflows/e2e-api.yml)
|
||||
# • `Handlers Postgres Integration`(.gitea/workflows/handlers-postgres-integration.yml)
|
||||
# Those emit their OWN status contexts and MUST be listed in branch
|
||||
# protection `status_check_contexts` INDEPENDENTLY of this sentinel.
|
||||
# They are today; do NOT trim BP down to just `CI / all-required` on
|
||||
# the assumption that it covers them — it does not, and a red E2E /
|
||||
# Handlers run would then look mergeable (observed: core PR #1086 @
|
||||
# 9136d05a — `CI / all-required` green while E2E (id 48) + Handlers
|
||||
# (id 47) were red; not exploitable only because BP still requires
|
||||
# all three). The cross-workflow coverage is enforced separately by
|
||||
# ci-required-drift.py's F4 check (every BP context must have a live
|
||||
# emitting workflow), which is what keeps this name honest.
|
||||
# ──────────────────────────────────────────────────────────────────
|
||||
#
|
||||
# Emits `CI / all-required (<event>)` where <event> is the workflow trigger
|
||||
# (e.g. `CI / all-required (pull_request)`, `CI / all-required (push)`).
|
||||
# Branch protection requires the event-suffixed name —
|
||||
|
||||
@@ -165,6 +165,28 @@ jobs:
|
||||
cache: 'npm'
|
||||
cache-dependency-path: canvas/package-lock.json
|
||||
|
||||
- name: Sweep stale e2e-chat testcontainers (self-heal prior leaks)
|
||||
if: needs.detect-changes.outputs.chat == 'true'
|
||||
run: |
|
||||
# Prior e2e-chat runs that were cancelled/killed — or whose always()
|
||||
# cleanup hit a wedged docker daemon — leak their pg-/redis-e2e-chat-*
|
||||
# containers, which then pile up on the shared runner host (observed: 13
|
||||
# such containers, up to 2 weeks old, on the operator daemon). Reap any
|
||||
# e2e-chat container older than the job window so leaks self-heal every
|
||||
# run instead of relying on each run's own cleanup succeeding. Age-based
|
||||
# (>2h, well beyond the 15m job) so a CONCURRENT e2e-chat job's fresh
|
||||
# containers are never touched. See controlplane#646.
|
||||
now=$(date -u +%s)
|
||||
docker ps -a --filter name=e2e-chat --format '{{.Names}}' | while read -r c; do
|
||||
[ -n "$c" ] || continue
|
||||
created=$(docker inspect -f '{{.Created}}' "$c" 2>/dev/null) || continue
|
||||
cts=$(date -u -d "$created" +%s 2>/dev/null) || continue
|
||||
if [ $(( now - cts )) -gt 7200 ]; then
|
||||
echo "sweeping stale e2e-chat container $c (created $created)"
|
||||
timeout 30 docker rm -f "$c" >/dev/null 2>&1 || true
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Start Postgres (docker)
|
||||
if: needs.detect-changes.outputs.chat == 'true'
|
||||
run: |
|
||||
@@ -430,5 +452,7 @@ jobs:
|
||||
- name: Stop service containers
|
||||
if: always() && needs.detect-changes.outputs.chat == 'true'
|
||||
run: |
|
||||
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
||||
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
|
||||
# timeout-wrap so a wedged docker daemon can't hang this always() step
|
||||
# (a hung rm here is one way containers leak in the first place).
|
||||
timeout 30 docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
||||
timeout 30 docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
|
||||
|
||||
@@ -78,6 +78,12 @@ jobs:
|
||||
# even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
|
||||
MOLECULE_ENV: development
|
||||
SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
|
||||
# act_runner runs the job inside a Docker container, so /.dockerenv exists
|
||||
# and the platform auto-detects platformInDocker=true. But the job container
|
||||
# is NOT on molecule-core-net, so it cannot resolve workspace container
|
||||
# hostnames (ws-<id>:8000). Force false so the proxy keeps using the
|
||||
# host-mapped 127.0.0.1:<ephemeral_port> URL, which IS reachable.
|
||||
MOLECULE_IN_DOCKER: false
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
||||
@@ -132,7 +138,29 @@ jobs:
|
||||
# jobs or stale processes from prior cancelled runs (see #2450).
|
||||
PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
|
||||
echo "PORT=${PORT}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://127.0.0.1:${PORT}" >> "$GITHUB_ENV"
|
||||
# Discover an IP that Docker containers can use to reach the host platform.
|
||||
# host.docker.internal is not reliably available on Linux (act_runner), so
|
||||
# workspace containers cannot resolve it and fail to register/heartbeat.
|
||||
# Workspace containers join molecule-core-net; the host is reachable via that
|
||||
# network's gateway. Ensure the network exists first (the provisioner creates
|
||||
# it lazily, but we need the gateway BEFORE starting the platform).
|
||||
docker network inspect molecule-core-net >/dev/null 2>&1 || docker network create molecule-core-net >/dev/null
|
||||
# Parse Gateway from raw JSON because --format '{{.IPAM.Config}}' is
|
||||
# inconsistent across Docker versions (sometimes omits Gateway field).
|
||||
PLATFORM_HOST_IP=$(docker network inspect molecule-core-net 2>/dev/null | sed -n 's/.*"Gateway": "\([^"]*\)".*/\1/p' | head -1)
|
||||
if [ -z "$PLATFORM_HOST_IP" ]; then
|
||||
PLATFORM_HOST_IP=$(docker network inspect bridge 2>/dev/null | sed -n 's/.*"Gateway": "\([^"]*\)".*/\1/p' | head -1)
|
||||
fi
|
||||
if [ -z "$PLATFORM_HOST_IP" ]; then
|
||||
PLATFORM_HOST_IP=$(ip route | awk '/default/ {print $3}' | head -1 || true)
|
||||
fi
|
||||
if [ -z "$PLATFORM_HOST_IP" ]; then
|
||||
echo "::error::Could not determine PLATFORM_HOST_IP for Docker containers to reach the platform"
|
||||
exit 1
|
||||
fi
|
||||
echo "PLATFORM_HOST_IP=${PLATFORM_HOST_IP}"
|
||||
echo "PLATFORM_URL=http://${PLATFORM_HOST_IP}:${PORT}" >> "$GITHUB_ENV"
|
||||
# Deterministic admin token: the script sends MOLECULE_ADMIN_TOKEN as the
|
||||
# bearer; the platform checks ADMIN_TOKEN. Set both to the same value.
|
||||
T="lpe2e-admin-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
@@ -173,8 +201,10 @@ jobs:
|
||||
run: |
|
||||
# Bind to the dynamically allocated port (see #2450).
|
||||
# DATABASE_URL/REDIS_URL/ADMIN_TOKEN/MOLECULE_ENV are inherited from
|
||||
# $GITHUB_ENV.
|
||||
PORT=$PORT ./platform-server > platform.log 2>&1 &
|
||||
# $GITHUB_ENV. PLATFORM_URL is also passed explicitly because
|
||||
# $GITHUB_ENV propagation can be flaky on act_runner (#2468 RCA).
|
||||
echo "starting platform with PLATFORM_URL=${PLATFORM_URL:-<fallback>} PORT=$PORT BIND_ADDR=0.0.0.0"
|
||||
PORT=$PORT BIND_ADDR=0.0.0.0 PLATFORM_URL="${PLATFORM_URL:-http://host.docker.internal:$PORT}" ./platform-server > platform.log 2>&1 &
|
||||
echo $! > platform.pid
|
||||
|
||||
- name: Wait for /health (+ migrations applied)
|
||||
@@ -198,6 +228,11 @@ jobs:
|
||||
sleep 1
|
||||
done
|
||||
|
||||
- name: Verify platform reachable from molecule-core-net
|
||||
run: |
|
||||
echo "Testing platform reachability from molecule-core-net container..."
|
||||
docker run --rm --network molecule-core-net alpine:latest sh -c "wget -qO- http://${PLATFORM_URL#http://}/health" || echo "WARN: platform not reachable from molecule-core-net"
|
||||
|
||||
- name: Run local-provision lifecycle E2E (stub — REQUIRED)
|
||||
run: bash tests/e2e/test_local_provision_lifecycle_e2e.sh
|
||||
|
||||
@@ -205,6 +240,15 @@ jobs:
|
||||
if: failure()
|
||||
run: cat workspace-server/platform.log || true
|
||||
|
||||
- name: Dump workspace container logs on failure
|
||||
if: failure()
|
||||
run: |
|
||||
WS_NAME=$(docker ps --filter "name=ws-" --format '{{.Names}}' | head -1 || true)
|
||||
if [ -n "$WS_NAME" ]; then
|
||||
echo "=== Workspace container logs for $WS_NAME ==="
|
||||
docker logs "$WS_NAME" 2>&1 | tail -n 80 || true
|
||||
fi
|
||||
|
||||
- name: Stop platform
|
||||
if: always()
|
||||
run: |
|
||||
@@ -248,6 +292,12 @@ jobs:
|
||||
# even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
|
||||
MOLECULE_ENV: development
|
||||
SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
|
||||
# act_runner runs the job inside a Docker container, so /.dockerenv exists
|
||||
# and the platform auto-detects platformInDocker=true. But the job container
|
||||
# is NOT on molecule-core-net, so it cannot resolve workspace container
|
||||
# hostnames (ws-<id>:8000). Force false so the proxy keeps using the
|
||||
# host-mapped 127.0.0.1:<ephemeral_port> URL, which IS reachable.
|
||||
MOLECULE_IN_DOCKER: false
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
||||
@@ -297,7 +347,29 @@ jobs:
|
||||
# jobs or stale processes from prior cancelled runs (see #2450).
|
||||
PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
|
||||
echo "PORT=${PORT}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://127.0.0.1:${PORT}" >> "$GITHUB_ENV"
|
||||
# Discover an IP that Docker containers can use to reach the host platform.
|
||||
# host.docker.internal is not reliably available on Linux (act_runner), so
|
||||
# workspace containers cannot resolve it and fail to register/heartbeat.
|
||||
# Workspace containers join molecule-core-net; the host is reachable via that
|
||||
# network's gateway. Ensure the network exists first (the provisioner creates
|
||||
# it lazily, but we need the gateway BEFORE starting the platform).
|
||||
docker network inspect molecule-core-net >/dev/null 2>&1 || docker network create molecule-core-net >/dev/null
|
||||
# Parse Gateway from raw JSON because --format '{{.IPAM.Config}}' is
|
||||
# inconsistent across Docker versions (sometimes omits Gateway field).
|
||||
PLATFORM_HOST_IP=$(docker network inspect molecule-core-net 2>/dev/null | sed -n 's/.*"Gateway": "\([^"]*\)".*/\1/p' | head -1)
|
||||
if [ -z "$PLATFORM_HOST_IP" ]; then
|
||||
PLATFORM_HOST_IP=$(docker network inspect bridge 2>/dev/null | sed -n 's/.*"Gateway": "\([^"]*\)".*/\1/p' | head -1)
|
||||
fi
|
||||
if [ -z "$PLATFORM_HOST_IP" ]; then
|
||||
PLATFORM_HOST_IP=$(ip route | awk '/default/ {print $3}' | head -1 || true)
|
||||
fi
|
||||
if [ -z "$PLATFORM_HOST_IP" ]; then
|
||||
echo "::error::Could not determine PLATFORM_HOST_IP for Docker containers to reach the platform"
|
||||
exit 1
|
||||
fi
|
||||
echo "PLATFORM_HOST_IP=${PLATFORM_HOST_IP}"
|
||||
echo "PLATFORM_URL=http://${PLATFORM_HOST_IP}:${PORT}" >> "$GITHUB_ENV"
|
||||
T="lpe2e-real-admin-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
|
||||
echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
|
||||
@@ -329,7 +401,8 @@ jobs:
|
||||
- name: Start platform (background)
|
||||
working-directory: workspace-server
|
||||
run: |
|
||||
PORT=$PORT ./platform-server > platform.log 2>&1 &
|
||||
echo "starting platform with PLATFORM_URL=${PLATFORM_URL:-<fallback>} PORT=$PORT BIND_ADDR=0.0.0.0"
|
||||
PORT=$PORT BIND_ADDR=0.0.0.0 PLATFORM_URL="${PLATFORM_URL:-http://host.docker.internal:$PORT}" ./platform-server > platform.log 2>&1 &
|
||||
echo $! > platform.pid
|
||||
|
||||
- name: Wait for /health (+ migrations applied)
|
||||
@@ -351,6 +424,11 @@ jobs:
|
||||
sleep 1
|
||||
done
|
||||
|
||||
- name: Verify platform reachable from molecule-core-net
|
||||
run: |
|
||||
echo "Testing platform reachability from molecule-core-net container..."
|
||||
docker run --rm --network molecule-core-net alpine:latest sh -c "wget -qO- http://${PLATFORM_URL#http://}/health" || echo "WARN: platform not reachable from molecule-core-net"
|
||||
|
||||
- name: Run local-provision lifecycle E2E (real image + MiniMax LLM — ADVISORY)
|
||||
env:
|
||||
# LIFECYCLE_LLM=minimax: provision the REAL claude-code template image
|
||||
@@ -375,6 +453,15 @@ jobs:
|
||||
if: failure()
|
||||
run: cat workspace-server/platform.log || true
|
||||
|
||||
- name: Dump workspace container logs on failure
|
||||
if: failure()
|
||||
run: |
|
||||
WS_NAME=$(docker ps --filter "name=ws-" --format '{{.Names}}' | head -1 || true)
|
||||
if [ -n "$WS_NAME" ]; then
|
||||
echo "=== Workspace container logs for $WS_NAME ==="
|
||||
docker logs "$WS_NAME" 2>&1 | tail -n 80 || true
|
||||
fi
|
||||
|
||||
- name: Stop platform
|
||||
if: always()
|
||||
run: |
|
||||
|
||||
@@ -190,6 +190,26 @@ jobs:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
|
||||
|
||||
# Docker layer cache (registry-backed). Each builder created by
|
||||
# setup-buildx-action is an EPHEMERAL docker-container builder (fresh
|
||||
# buildkit state every run), so without an external cache every main
|
||||
# push rebuilds `go mod download` / npm layers from scratch — this job
|
||||
# class is the slowest in CI (p50 228s, ~175 runs/wk). We export the
|
||||
# build cache to a dedicated moving ECR tag (`:buildcache`, never a
|
||||
# deploy tag) and import it on the next run, regardless of which
|
||||
# runner/builder picks the job.
|
||||
# - mode=max: caches intermediate (builder-stage) layers too — the
|
||||
# final stage is a tiny alpine/distroless copy, so min mode would
|
||||
# cache nothing useful.
|
||||
# - image-manifest=true,oci-mediatypes=true: required for ECR, which
|
||||
# rejects the raw buildkit cache-manifest mediatype. Verified by a
|
||||
# real export+import round-trip against ECR on the publish host
|
||||
# (2026-06-09) before this change.
|
||||
# - ignore-error=true on cache-to: a cache EXPORT failure must never
|
||||
# fail the publish lane; worst case the next run is cold.
|
||||
# - cache-from on a missing tag (first run) is a warning, not an error.
|
||||
# - Concurrent publishes overwrite :buildcache last-writer-wins —
|
||||
# same best-effort semantics as :staging-latest.
|
||||
- name: Build & push platform image to ECR (staging-<sha> + staging-latest)
|
||||
env:
|
||||
IMAGE_NAME: ${{ env.IMAGE_NAME }}
|
||||
@@ -212,6 +232,8 @@ jobs:
|
||||
--label "org.opencontainers.image.revision=${GIT_SHA}" \
|
||||
--label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
|
||||
--label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
|
||||
--cache-from "type=registry,ref=${IMAGE_NAME}:buildcache" \
|
||||
--cache-to "type=registry,ref=${IMAGE_NAME}:buildcache,mode=max,image-manifest=true,oci-mediatypes=true,ignore-error=true" \
|
||||
--tag "${IMAGE_NAME}:${TAG_SHA}" \
|
||||
--tag "${IMAGE_NAME}:${TAG_LATEST}" \
|
||||
--push .
|
||||
@@ -251,6 +273,11 @@ jobs:
|
||||
# Retry loop: buildkit EOF (internal#2468) is often transient on the
|
||||
# publish runner under memory pressure. Up to 3 attempts with a fresh
|
||||
# builder each time so a crashed buildkit doesn't poison the next try.
|
||||
# Registry layer cache (see platform-image step comment for the full
|
||||
# rationale): the fresh-builder-per-attempt pattern means there is
|
||||
# NEVER local cache here — cache-from gives retries AND the next run
|
||||
# a warm start. Cache lives on the PRIMARY ECR only (the staging
|
||||
# mirror is a push target, not a cache source).
|
||||
for attempt in 1 2 3; do
|
||||
echo "::notice::Tenant image build attempt ${attempt}/3 ..."
|
||||
builder="tenant-builder-${GITHUB_RUN_ID}-${attempt}"
|
||||
@@ -264,6 +291,8 @@ jobs:
|
||||
--label "org.opencontainers.image.revision=${GIT_SHA}" \
|
||||
--label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
|
||||
--label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
|
||||
--cache-from "type=registry,ref=${TENANT_IMAGE_NAME}:buildcache" \
|
||||
--cache-to "type=registry,ref=${TENANT_IMAGE_NAME}:buildcache,mode=max,image-manifest=true,oci-mediatypes=true,ignore-error=true" \
|
||||
"${build_tags[@]}" \
|
||||
--push .; then
|
||||
docker buildx rm "${builder}" >/dev/null 2>&1 || true
|
||||
@@ -530,7 +559,20 @@ jobs:
|
||||
STALE_COUNT=0
|
||||
UNREACHABLE_COUNT=0
|
||||
UNHEALTHY_COUNT=0
|
||||
QUARANTINED_COUNT=0
|
||||
# Quarantined stragglers: the CP shipped the build to the healthy
|
||||
# majority and quarantined a small minority within tolerance
|
||||
# (max_stragglers). They are reported + recovered SEPARATELY, so they
|
||||
# must not red the strict per-tenant verify — otherwise one stuck
|
||||
# tenant blocks the whole deploy, the all-or-nothing trap this fixes.
|
||||
STRAGGLERS_LIST="$(jq -r '(.stragglers // [])[]' "$RESP" 2>/dev/null || true)"
|
||||
is_straggler() { printf '%s\n' "$STRAGGLERS_LIST" | grep -qxF "$1"; }
|
||||
for slug in "${SLUGS[@]}"; do
|
||||
if is_straggler "$slug"; then
|
||||
echo "::warning::$slug is a QUARANTINED straggler — build shipped to the rest of the fleet; this tenant needs individual recovery. Skipping strict verify."
|
||||
QUARANTINED_COUNT=$((QUARANTINED_COUNT + 1))
|
||||
continue
|
||||
fi
|
||||
healthz_ok="$(jq -r --arg slug "$slug" '.results[]? | select(.slug == $slug) | .healthz_ok' "$RESP" | tail -1)"
|
||||
if [ "$healthz_ok" != "true" ]; then
|
||||
echo "::error::$slug did not report healthz_ok=true in redeploy-fleet response."
|
||||
@@ -580,6 +622,7 @@ jobs:
|
||||
echo "Stale tenants: $STALE_COUNT"
|
||||
echo "Unhealthy tenants: $UNHEALTHY_COUNT"
|
||||
echo "Unreachable tenants: $UNREACHABLE_COUNT"
|
||||
echo "Quarantined stragglers (shipped past; need recovery): $QUARANTINED_COUNT"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
if [ "$STALE_COUNT" -gt 0 ] || [ "$UNHEALTHY_COUNT" -gt 0 ] || [ "$UNREACHABLE_COUNT" -gt 0 ]; then
|
||||
|
||||
@@ -40,14 +40,24 @@ export function FlightEnvelope({
|
||||
if (!el || typeof el.animate !== "function") return;
|
||||
const dx = to.x - from.x;
|
||||
const dy = to.y - from.y;
|
||||
// Launch small from the source dot, GROW BIG as it crosses the gap (peak
|
||||
// mid-flight), then SHRINK small as it lands on the target dot — reads as an
|
||||
// envelope flung from one agent and received by the other. translate tracks
|
||||
// the straight path (fraction == keyframe offset); scale arcs independently.
|
||||
const at = (frac: number, scale: number, opacity: number, offset?: number) => ({
|
||||
transform: `translate(-50%,-50%) translate(${dx * frac}px,${dy * frac}px) scale(${scale})`,
|
||||
opacity,
|
||||
...(offset === undefined ? {} : { offset }),
|
||||
});
|
||||
const anim = el.animate(
|
||||
[
|
||||
{ transform: "translate(-50%,-50%) translate(0px,0px) scale(0.45)", opacity: 0 },
|
||||
{ opacity: 1, offset: 0.16 },
|
||||
{ opacity: 1, offset: 0.8 },
|
||||
{ transform: `translate(-50%,-50%) translate(${dx}px,${dy}px) scale(1)`, opacity: 0 },
|
||||
at(0, 0.5, 0),
|
||||
at(0.2, 1.25, 1, 0.2), // faded in + grown
|
||||
at(0.5, 1.7, 1, 0.5), // BIG at mid-flight
|
||||
at(0.82, 1.05, 1, 0.82), // shrinking on approach
|
||||
at(1, 0.5, 0), // small + faded out, arrived on the target dot
|
||||
],
|
||||
{ duration: FLIGHT_DURATION_MS, easing: "cubic-bezier(0.45, 0, 0.25, 1)", fill: "forwards" },
|
||||
{ duration: FLIGHT_DURATION_MS, easing: "ease-in-out", fill: "forwards" },
|
||||
);
|
||||
return () => anim.cancel();
|
||||
}, [from.x, from.y, to.x, to.y]);
|
||||
|
||||
@@ -4,17 +4,25 @@
|
||||
* Mounted INSIDE <ReactFlow> so its ViewportPortal places the envelope in flow
|
||||
* coordinates; it therefore pans and zooms with the canvas for free. The
|
||||
* flight lifecycle (which events become envelopes, reduced-motion opt-out,
|
||||
* expiry) lives in useA2AFlights — this component only resolves node centres
|
||||
* and renders. */
|
||||
import { ViewportPortal, type Node } from "@xyflow/react";
|
||||
* expiry) lives in useA2AFlights — this component only resolves endpoints and
|
||||
* renders.
|
||||
*
|
||||
* Endpoints anchor on each workspace's STATUS DOT (the green/glowing presence
|
||||
* indicator), not the card's geometric centre — so an envelope visibly leaves
|
||||
* the source agent's dot and lands on the target agent's dot. The dot carries
|
||||
* `data-flight-anchor`; we read its rendered rect and convert screen→flow via
|
||||
* React Flow, falling back to the card centre only when the dot isn't in the
|
||||
* DOM yet (node just mounted / scrolled out). */
|
||||
import { useRef } from "react";
|
||||
import { ViewportPortal, useReactFlow, type Node } from "@xyflow/react";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import { useA2AFlights } from "@/hooks/useA2AFlights";
|
||||
import { useA2AFlights, type A2AFlight } from "@/hooks/useA2AFlights";
|
||||
import { FlightEnvelope, type Point } from "./FlightEnvelope";
|
||||
import type { WorkspaceNodeData } from "@/store/canvas";
|
||||
|
||||
// Fallback node footprint when React Flow has not measured a node yet. Matches
|
||||
// WorkspaceNode's leaf size (w-[300px] min-h-[176px]); a slightly-off centre
|
||||
// for the first frame after mount is invisible at flight scale.
|
||||
// WorkspaceNode's leaf size (w-[300px] min-h-[176px]); a slightly-off centre for
|
||||
// the first frame after mount is invisible at flight scale.
|
||||
const DEFAULT_W = 300;
|
||||
const DEFAULT_H = 176;
|
||||
|
||||
@@ -24,23 +32,76 @@ function nodeCenter(n: Node<WorkspaceNodeData>): Point {
|
||||
return { x: n.position.x + w / 2, y: n.position.y + h / 2 };
|
||||
}
|
||||
|
||||
/** Resolve a node's status-dot centre in FLOW coordinates. Reads the dot's
|
||||
* rendered screen rect (it carries data-flight-anchor) and converts it back to
|
||||
* flow space, so the anchor is exact regardless of pan/zoom and survives any
|
||||
* header-layout change. Falls back to the card centre when the dot isn't
|
||||
* rendered. */
|
||||
function dotAnchor(
|
||||
n: Node<WorkspaceNodeData>,
|
||||
screenToFlowPosition: (p: Point) => Point,
|
||||
): Point {
|
||||
if (typeof document !== "undefined") {
|
||||
const id =
|
||||
typeof CSS !== "undefined" && typeof CSS.escape === "function" ? CSS.escape(n.id) : n.id;
|
||||
const el = document.querySelector<HTMLElement>(
|
||||
`.react-flow__node[data-id="${id}"] [data-flight-anchor]`,
|
||||
);
|
||||
if (el) {
|
||||
const r = el.getBoundingClientRect();
|
||||
if (r.width > 0 && r.height > 0) {
|
||||
return screenToFlowPosition({ x: r.left + r.width / 2, y: r.top + r.height / 2 });
|
||||
}
|
||||
}
|
||||
}
|
||||
return nodeCenter(n);
|
||||
}
|
||||
|
||||
/** One flight. Captures the source/target dot anchors ONCE on mount (a ref, not
|
||||
* per-render) so a pan/zoom or re-render mid-flight doesn't restart the
|
||||
* animation — mirrors HomeFlight's capture-once contract. */
|
||||
function CanvasFlight({
|
||||
flight,
|
||||
nodes,
|
||||
screenToFlowPosition,
|
||||
}: {
|
||||
flight: A2AFlight;
|
||||
nodes: Node<WorkspaceNodeData>[];
|
||||
screenToFlowPosition: (p: Point) => Point;
|
||||
}) {
|
||||
const pos = useRef<{ from: Point; to: Point } | null>(null);
|
||||
if (pos.current === null) {
|
||||
const src = nodes.find((n) => n.id === flight.sourceId);
|
||||
const dst = nodes.find((n) => n.id === flight.targetId);
|
||||
// Both endpoints must be on-canvas to draw a path between them.
|
||||
if (src && dst) {
|
||||
pos.current = {
|
||||
from: dotAnchor(src, screenToFlowPosition),
|
||||
to: dotAnchor(dst, screenToFlowPosition),
|
||||
};
|
||||
}
|
||||
}
|
||||
if (!pos.current) return null;
|
||||
return <FlightEnvelope from={pos.current.from} to={pos.current.to} kind={flight.kind} />;
|
||||
}
|
||||
|
||||
export function MessageFlightLayer() {
|
||||
const flights = useA2AFlights();
|
||||
const nodes = useCanvasStore((s) => s.nodes);
|
||||
const nodes = useCanvasStore((s) => s.nodes) as Node<WorkspaceNodeData>[];
|
||||
const { screenToFlowPosition } = useReactFlow();
|
||||
|
||||
if (flights.length === 0) return null;
|
||||
|
||||
return (
|
||||
<ViewportPortal>
|
||||
{flights.map((f) => {
|
||||
const src = nodes.find((n) => n.id === f.sourceId);
|
||||
const dst = nodes.find((n) => n.id === f.targetId);
|
||||
// Both endpoints must be on-canvas to draw a path between them.
|
||||
if (!src || !dst) return null;
|
||||
return (
|
||||
<FlightEnvelope key={f.key} from={nodeCenter(src)} to={nodeCenter(dst)} kind={f.kind} />
|
||||
);
|
||||
})}
|
||||
{flights.map((f) => (
|
||||
<CanvasFlight
|
||||
key={f.key}
|
||||
flight={f}
|
||||
nodes={nodes}
|
||||
screenToFlowPosition={screenToFlowPosition}
|
||||
/>
|
||||
))}
|
||||
</ViewportPortal>
|
||||
);
|
||||
}
|
||||
|
||||
@@ -215,7 +215,7 @@ export function WorkspaceNode({ id, data }: NodeProps<Node<WorkspaceNodeData>>)
|
||||
{/* Header row */}
|
||||
<div className="flex items-center justify-between gap-2 mb-2.5">
|
||||
<div className="flex items-center gap-2.5 min-w-0">
|
||||
<div className={`w-2.5 h-2.5 rounded-full shrink-0 ${statusCfg.dot} ${statusCfg.glow} shadow-sm`} />
|
||||
<div data-flight-anchor className={`w-2.5 h-2.5 rounded-full shrink-0 ${statusCfg.dot} ${statusCfg.glow} shadow-sm`} />
|
||||
<span className="text-[15px] font-semibold text-ink truncate leading-tight">
|
||||
{data.name}
|
||||
</span>
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
// @vitest-environment jsdom
|
||||
/**
|
||||
* Tests for FlightEnvelope — the envelope that animates from `from` to `to`.
|
||||
*
|
||||
* Locks the render contract the canvas + concierge-home both depend on:
|
||||
* - the envelope is positioned at the `from` point (its launch anchor),
|
||||
* - it is coloured by activity kind,
|
||||
* - it degrades gracefully when Element.animate is unavailable (jsdom / SSR).
|
||||
*
|
||||
* The grow→shrink scale arc itself uses the Web Animations API, which jsdom
|
||||
* does not implement, so we assert the static render + graceful degradation
|
||||
* rather than keyframe values.
|
||||
*/
|
||||
import React from "react";
|
||||
import { render, cleanup } from "@testing-library/react";
|
||||
import { afterEach, describe, expect, it } from "vitest";
|
||||
import { FlightEnvelope } from "../FlightEnvelope";
|
||||
|
||||
afterEach(cleanup);
|
||||
|
||||
describe("FlightEnvelope", () => {
|
||||
it("positions the envelope at the `from` launch point", () => {
|
||||
const { getByTestId } = render(
|
||||
<FlightEnvelope from={{ x: 120, y: 240 }} to={{ x: 400, y: 60 }} kind="send" />,
|
||||
);
|
||||
const el = getByTestId("flight-envelope");
|
||||
expect(el.style.left).toBe("120px");
|
||||
expect(el.style.top).toBe("240px");
|
||||
expect(el.querySelector("svg")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("colours the envelope by activity kind", () => {
|
||||
const stroke = (kind: "send" | "receive" | "task") => {
|
||||
const { container } = render(
|
||||
<FlightEnvelope from={{ x: 0, y: 0 }} to={{ x: 10, y: 10 }} kind={kind} />,
|
||||
);
|
||||
const s = container.querySelector("rect")?.getAttribute("stroke");
|
||||
cleanup();
|
||||
return s;
|
||||
};
|
||||
expect(stroke("send")).toBe("#22d3ee");
|
||||
expect(stroke("receive")).toBe("#8b5cf6");
|
||||
expect(stroke("task")).toBe("#f5a623");
|
||||
});
|
||||
|
||||
it("degrades to a static render (no throw) when Element.animate is unavailable", () => {
|
||||
// jsdom does not implement Element.animate — the component must still render.
|
||||
expect(typeof document.createElement("div").animate).not.toBe("function");
|
||||
const { getByTestId } = render(
|
||||
<FlightEnvelope from={{ x: 0, y: 0 }} to={{ x: 1, y: 1 }} kind="task" />,
|
||||
);
|
||||
expect(getByTestId("flight-envelope")).toBeTruthy();
|
||||
});
|
||||
});
|
||||
@@ -82,8 +82,19 @@
|
||||
/* ===== MAIN ===== */
|
||||
.main { flex: 1; display: flex; flex-direction: column; min-width: 0; }
|
||||
.topbar { height: 56px; flex: 0 0 56px; border-bottom: 1px solid var(--hair); background: var(--panel); display: flex; align-items: center; justify-content: space-between; padding: 0 18px 0 20px; }
|
||||
.org { display: flex; align-items: center; gap: 10px; cursor: pointer; padding: 6px 10px; border-radius: 9px; transition: .16s; margin-left: -6px; }
|
||||
.org { position: relative; display: flex; align-items: center; gap: 10px; cursor: pointer; padding: 6px 10px; border-radius: 9px; transition: .16s; margin-left: -6px; }
|
||||
.org:hover { background: var(--hair); }
|
||||
/* Org switcher dropdown */
|
||||
.orgMenu { position: absolute; top: calc(100% + 6px); left: 0; min-width: 220px; max-height: 320px; overflow-y: auto; padding: 5px; background: var(--bg-1, #1a1a22); border: 1px solid var(--hair-2); border-radius: 11px; box-shadow: 0 12px 32px rgba(0,0,0,.4); z-index: 50; }
|
||||
.orgMenuItem { width: 100%; display: flex; align-items: center; gap: 9px; padding: 7px 9px; border: none; background: transparent; border-radius: 8px; cursor: pointer; color: var(--tx-1); font-size: 13.5px; font-weight: 500; text-align: left; transition: .12s; }
|
||||
.orgMenuItem:hover { background: var(--hair); }
|
||||
.orgMenuCurrent { font-weight: 700; }
|
||||
.orgMenuBadge { width: 20px; height: 20px; border-radius: 6px; display: grid; place-items: center; background: linear-gradient(150deg,#2d2d36,#3a3a46); font-size: 11px; font-weight: 700; color: #d8d8e2; border: 1px solid var(--hair-2); flex: 0 0 auto; }
|
||||
:global([data-theme="light"]) .orgMenuBadge { background: linear-gradient(150deg,#7c3aed,#a78bfa); color: #fff; border: none; }
|
||||
.orgMenuName { flex: 1 1 auto; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
|
||||
.orgMenuTick { color: var(--accent, #a78bfa); display: flex; flex: 0 0 auto; }
|
||||
.orgMenuTick svg { width: 14px; height: 14px; }
|
||||
.orgMenuEmpty { padding: 9px 11px; color: var(--tx-3); font-size: 13px; }
|
||||
.orgBadge { width: 24px; height: 24px; border-radius: 7px; display: grid; place-items: center; background: linear-gradient(150deg,#2d2d36,#3a3a46); font-size: 12px; font-weight: 700; color: #d8d8e2; border: 1px solid var(--hair-2); }
|
||||
:global([data-theme="light"]) .orgBadge { background: linear-gradient(150deg,#7c3aed,#a78bfa); color: #fff; border: none; }
|
||||
.orgName { font-weight: 600; font-size: 14.5px; letter-spacing: -.01em; }
|
||||
|
||||
@@ -4,7 +4,8 @@ import { useCallback, useEffect, useMemo, useState } from "react";
|
||||
import { useCanvasStore, type TopView } from "@/store/canvas";
|
||||
import { WORKSPACE_KIND } from "@/lib/workspace-kind";
|
||||
import { useTheme } from "@/lib/theme-provider";
|
||||
import { api } from "@/lib/api";
|
||||
import { api, PLATFORM_URL } from "@/lib/api";
|
||||
import { switchOrgUrl } from "@/lib/org-switch";
|
||||
import { showToast } from "@/components/Toaster";
|
||||
import type { ActivityEntry } from "@/types/activity";
|
||||
import { Canvas } from "@/components/Canvas";
|
||||
@@ -89,6 +90,25 @@ function activityText(a: ActivityEntry): string {
|
||||
return a.method ? `${verb} · ${a.method}` : verb;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* resolveHomeChatTarget — which agent the Home chat panel talks to: the
|
||||
* sidebar-selected node when it still exists, else the org root (concierge),
|
||||
* else null. Resolving against live nodes means a deleted/vanished selection
|
||||
* degrades to the root instead of a dead chat. Exported for unit tests.
|
||||
*/
|
||||
export function resolveHomeChatTarget<N extends { id: string }>(
|
||||
nodes: N[],
|
||||
selectedNodeId: string | null,
|
||||
platformRoot: N | null,
|
||||
): N | null {
|
||||
if (selectedNodeId) {
|
||||
const selected = nodes.find((n) => n.id === selectedNodeId);
|
||||
if (selected) return selected;
|
||||
}
|
||||
return platformRoot ?? null;
|
||||
}
|
||||
|
||||
export function ConciergeShell() {
|
||||
const nodes = useCanvasStore((st) => st.nodes);
|
||||
const topView = useCanvasStore((st) => st.topView);
|
||||
@@ -108,13 +128,18 @@ export function ConciergeShell() {
|
||||
// returns an empty name, so the topbar never breaks before the backend
|
||||
// lands.
|
||||
const [orgName, setOrgName] = useState("Molecule AI");
|
||||
// Current org slug (from GET /org/identity) — used to highlight the active
|
||||
// org in the switcher and to derive the apex domain for cross-org navigation.
|
||||
const [orgSlug, setOrgSlug] = useState("");
|
||||
useEffect(() => {
|
||||
let cancelled = false;
|
||||
api
|
||||
.get<{ name?: string }>("/org/identity")
|
||||
.get<{ name?: string; slug?: string }>("/org/identity")
|
||||
.then((r) => {
|
||||
const name = (r?.name || "").trim();
|
||||
if (!cancelled && name) setOrgName(name);
|
||||
const slug = (r?.slug || "").trim();
|
||||
if (!cancelled && slug) setOrgSlug(slug);
|
||||
})
|
||||
.catch(() => {
|
||||
// No endpoint / not reachable — keep the "Molecule AI" fallback.
|
||||
@@ -124,6 +149,47 @@ export function ConciergeShell() {
|
||||
};
|
||||
}, []);
|
||||
|
||||
// --- Org switcher (topbar dropdown) ---
|
||||
// Each org is its own tenant subdomain, so "switch" = navigate to
|
||||
// <slug>.<apex>. The org list comes from the control plane (cross-origin,
|
||||
// cookie-auth), fetched lazily the first time the menu opens.
|
||||
const [orgMenuOpen, setOrgMenuOpen] = useState(false);
|
||||
const [orgs, setOrgs] = useState<Array<{ slug: string; name?: string; id?: string }> | null>(null);
|
||||
const toggleOrgMenu = useCallback(() => {
|
||||
setOrgMenuOpen((open) => {
|
||||
const next = !open;
|
||||
if (next && orgs === null) {
|
||||
fetch(`${PLATFORM_URL}/cp/orgs`, {
|
||||
credentials: "include",
|
||||
signal: AbortSignal.timeout(15_000),
|
||||
})
|
||||
.then((res) => (res.ok ? res.json() : Promise.reject(new Error(String(res.status)))))
|
||||
.then((body: { orgs?: Array<{ slug: string; name?: string; id?: string }> } | Array<{ slug: string; name?: string; id?: string }>) => {
|
||||
const list = Array.isArray(body) ? body : body.orgs ?? [];
|
||||
setOrgs(list.filter((o) => o && o.slug));
|
||||
})
|
||||
.catch(() => setOrgs([])); // no list / not reachable → render "no other orgs"
|
||||
}
|
||||
return next;
|
||||
});
|
||||
}, [orgs]);
|
||||
const switchOrg = useCallback(
|
||||
(slug: string) => {
|
||||
setOrgMenuOpen(false);
|
||||
if (typeof window === "undefined") return;
|
||||
const url = switchOrgUrl(window.location.hostname, window.location.protocol, orgSlug, slug);
|
||||
if (url) window.location.href = url;
|
||||
},
|
||||
[orgSlug]
|
||||
);
|
||||
// Close the menu on any outside click.
|
||||
useEffect(() => {
|
||||
if (!orgMenuOpen) return;
|
||||
const onDoc = () => setOrgMenuOpen(false);
|
||||
document.addEventListener("click", onDoc);
|
||||
return () => document.removeEventListener("click", onDoc);
|
||||
}, [orgMenuOpen]);
|
||||
|
||||
// Build the agent hierarchy from live nodes.
|
||||
const { roots, childrenOf } = useMemo(() => {
|
||||
const childrenOf = new Map<string, typeof nodes>();
|
||||
@@ -157,6 +223,16 @@ export function ConciergeShell() {
|
||||
|
||||
const platformId = platformRoot?.id ?? null;
|
||||
|
||||
// Home chat target: the agent SELECTED in the sidebar, falling back to the
|
||||
// org root (the concierge). Pre-fix the panel was hard-pointed at the root,
|
||||
// so clicking another agent highlighted it but the chat never switched.
|
||||
const chatNode = useMemo(
|
||||
() => resolveHomeChatTarget(nodes, selectedNodeId, platformRoot),
|
||||
[nodes, selectedNodeId, platformRoot],
|
||||
);
|
||||
const chatId = chatNode?.id ?? null;
|
||||
const chatIsRoot = chatId !== null && chatId === platformId;
|
||||
|
||||
// ── live data: approvals + user-tasks (org-wide), activity (platform agent) ──
|
||||
const [approvals, setApprovals] = useState<PendingApproval[]>([]);
|
||||
const [userTasks, setUserTasks] = useState<UserTask[]>([]);
|
||||
@@ -330,10 +406,51 @@ export function ConciergeShell() {
|
||||
<div className={s.main}>
|
||||
{/* TOPBAR */}
|
||||
<header className={s.topbar}>
|
||||
<div className={s.org}>
|
||||
<div
|
||||
className={s.org}
|
||||
role="button"
|
||||
tabIndex={0}
|
||||
aria-haspopup="menu"
|
||||
aria-expanded={orgMenuOpen}
|
||||
data-testid="topbar-org-switcher"
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
toggleOrgMenu();
|
||||
}}
|
||||
>
|
||||
<div className={s.orgBadge}>{initials(orgName).slice(0, 1)}</div>
|
||||
<span data-testid="topbar-org-name" className={s.orgName}>{orgName}</span>
|
||||
<span className={s.chev}><IcChevDown /></span>
|
||||
{orgMenuOpen && (
|
||||
<div
|
||||
className={s.orgMenu}
|
||||
role="menu"
|
||||
data-testid="topbar-org-menu"
|
||||
onClick={(e) => e.stopPropagation()}
|
||||
>
|
||||
{orgs === null ? (
|
||||
<div className={s.orgMenuEmpty}>Loading…</div>
|
||||
) : orgs.length === 0 ? (
|
||||
<div className={s.orgMenuEmpty}>No other organizations</div>
|
||||
) : (
|
||||
orgs.map((o) => (
|
||||
<button
|
||||
key={o.id || o.slug}
|
||||
type="button"
|
||||
role="menuitem"
|
||||
className={`${s.orgMenuItem} ${o.slug === orgSlug ? s.orgMenuCurrent : ""}`}
|
||||
onClick={() => switchOrg(o.slug)}
|
||||
>
|
||||
<span className={s.orgMenuBadge}>{initials(o.name || o.slug).slice(0, 1)}</span>
|
||||
<span className={s.orgMenuName}>{o.name || o.slug}</span>
|
||||
{o.slug === orgSlug && (
|
||||
<span className={s.orgMenuTick}><IcCheck /></span>
|
||||
)}
|
||||
</button>
|
||||
))
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
<div className={s.topbarRight}>
|
||||
<button className={s.iconPill} title="Search"><IcSearch /></button>
|
||||
@@ -457,24 +574,24 @@ export function ConciergeShell() {
|
||||
delivery-mode handling), pointed at the platform agent. A thin
|
||||
concierge-styled header keeps the Home look; the ChatTab body
|
||||
below is identical to the map path so features can't drift. */}
|
||||
{platformId && platformRoot ? (
|
||||
{chatId && chatNode ? (
|
||||
<section className={s.chat}>
|
||||
<div className={s.chatHead}>
|
||||
<div className={s.chAv}><IcChat /></div>
|
||||
<div className={s.chMeta}>
|
||||
<div className={s.chTitle}>{platformRoot.data.name ?? "Org Concierge"}</div>
|
||||
<div className={s.chTitle}>{chatNode.data.name ?? (chatIsRoot ? "Org Concierge" : "Agent")}</div>
|
||||
<div className={s.chSub}>
|
||||
{(() => {
|
||||
const online =
|
||||
platformRoot.data.status === "online" ||
|
||||
platformRoot.data.status === "degraded";
|
||||
chatNode.data.status === "online" ||
|
||||
chatNode.data.status === "degraded";
|
||||
return (
|
||||
<>
|
||||
<span
|
||||
className={s.sdot}
|
||||
style={{ background: online ? "var(--green)" : "var(--grey)" }}
|
||||
/>
|
||||
{online ? "online" : statusInfo(platformRoot.data.status ?? "").label} · platform agent
|
||||
{online ? "online" : statusInfo(chatNode.data.status ?? "").label} · {chatIsRoot ? "platform agent" : (chatNode.data.role || "agent")}
|
||||
</>
|
||||
);
|
||||
})()}
|
||||
@@ -482,7 +599,9 @@ export function ConciergeShell() {
|
||||
</div>
|
||||
</div>
|
||||
<div className={s.embedChat}>
|
||||
<ChatTab key={platformId} workspaceId={platformId} data={platformRoot.data} />
|
||||
{/* key=chatId remounts ChatTab on selection change so the
|
||||
history/composer state never bleeds between agents. */}
|
||||
<ChatTab key={chatId} workspaceId={chatId} data={chatNode.data} />
|
||||
</div>
|
||||
</section>
|
||||
) : (
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
// Home chat panel target — selecting an agent in the sidebar switches the
|
||||
// chat; the root is only the DEFAULT, not a hard-point (the pre-fix bug).
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { resolveHomeChatTarget } from "../ConciergeShell";
|
||||
|
||||
const root = { id: "root" };
|
||||
const child = { id: "child" };
|
||||
const nodes = [root, child];
|
||||
|
||||
describe("resolveHomeChatTarget", () => {
|
||||
it("returns the selected agent when it exists (the bug: chat stayed on root)", () => {
|
||||
expect(resolveHomeChatTarget(nodes, "child", root)).toBe(child);
|
||||
});
|
||||
it("falls back to the platform root when nothing is selected", () => {
|
||||
expect(resolveHomeChatTarget(nodes, null, root)).toBe(root);
|
||||
});
|
||||
it("degrades to the root when the selection no longer exists (deleted agent)", () => {
|
||||
expect(resolveHomeChatTarget(nodes, "gone", root)).toBe(root);
|
||||
});
|
||||
it("selecting the root itself targets the root", () => {
|
||||
expect(resolveHomeChatTarget(nodes, "root", root)).toBe(root);
|
||||
});
|
||||
it("null when there is neither selection nor root", () => {
|
||||
expect(resolveHomeChatTarget([], null, null)).toBeNull();
|
||||
});
|
||||
});
|
||||
@@ -7,29 +7,44 @@ import { isSaaSTenant } from "@/lib/tenant";
|
||||
import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
|
||||
import type { WorkspaceCompute } from "@/store/socket";
|
||||
|
||||
// Machine sizes keyed by cloud provider — an AWS t3.* is meaningless on Hetzner,
|
||||
// etc. MUST mirror the workspace-server workspaceComputeInstanceAllowlist (which
|
||||
// mirrors the CP provider configs); the PATCH validation rejects a mismatch 400.
|
||||
const INSTANCE_TYPES_BY_PROVIDER: Record<string, string[]> = {
|
||||
aws: ["t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge", "m6i.large", "m6i.xlarge", "c6i.xlarge"],
|
||||
hetzner: ["cpx11", "cpx21", "cpx31", "cpx41", "cpx51", "cax11", "cax21", "cax31", "cax41"],
|
||||
gcp: ["e2-small", "e2-medium", "e2-standard-2", "e2-standard-4", "e2-standard-8"],
|
||||
// Cloud-provider + instance-type metadata (core#2489).
|
||||
//
|
||||
// SSOT lives in the workspace-server (workspace_compute.go's allowlist + defaults)
|
||||
// and is fetched at runtime from GET /workspaces/:id/compute-options, so the UI
|
||||
// can never offer a (provider, instance-type) the PATCH validation then rejects
|
||||
// with a 400. The constants below are ONLY a minimal offline fallback used until
|
||||
// the fetch resolves (or if it fails) — they mirror the server SSOT but are not
|
||||
// the source of truth. When the fetch succeeds, its data replaces them entirely.
|
||||
type ComputeOptions = {
|
||||
providers: string[];
|
||||
instanceTypes: Record<string, string[]>;
|
||||
defaults: Record<string, string>;
|
||||
};
|
||||
const DEFAULT_INSTANCE_BY_PROVIDER: Record<string, string> = {
|
||||
aws: "t3.medium", hetzner: "cpx31", gcp: "e2-standard-2",
|
||||
};
|
||||
const normalizeProvider = (p?: string): string => (p === "gcp" || p === "hetzner" ? p : "aws");
|
||||
const instanceTypesForProvider = (p?: string): string[] =>
|
||||
INSTANCE_TYPES_BY_PROVIDER[normalizeProvider(p)] ?? INSTANCE_TYPES_BY_PROVIDER.aws;
|
||||
const defaultInstanceForProvider = (p?: string): string =>
|
||||
DEFAULT_INSTANCE_BY_PROVIDER[normalizeProvider(p)] ?? "t3.medium";
|
||||
|
||||
// Editable cloud-provider options (multi-provider RFC) — mirrors CreateWorkspaceDialog.
|
||||
const CLOUD_PROVIDER_OPTIONS = [
|
||||
{ value: "aws", label: "AWS (default)" },
|
||||
{ value: "gcp", label: "GCP" },
|
||||
{ value: "hetzner", label: "Hetzner" },
|
||||
];
|
||||
const FALLBACK_COMPUTE_OPTIONS: ComputeOptions = {
|
||||
providers: ["aws", "hetzner", "gcp"],
|
||||
instanceTypes: {
|
||||
aws: ["t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge", "m6i.large", "m6i.xlarge", "c6i.xlarge"],
|
||||
hetzner: ["cpx11", "cpx21", "cpx31", "cpx41", "cpx51", "cax11", "cax21", "cax31", "cax41"],
|
||||
gcp: ["e2-small", "e2-medium", "e2-standard-2", "e2-standard-4", "e2-standard-8"],
|
||||
},
|
||||
defaults: { aws: "t3.medium", hetzner: "cpx31", gcp: "e2-standard-2" },
|
||||
};
|
||||
|
||||
const normalizeProvider = (p?: string): string => (p === "gcp" || p === "hetzner" ? p : "aws");
|
||||
const instanceTypesForProvider = (opts: ComputeOptions, p?: string): string[] =>
|
||||
opts.instanceTypes[normalizeProvider(p)] ?? opts.instanceTypes.aws ?? FALLBACK_COMPUTE_OPTIONS.instanceTypes.aws;
|
||||
const defaultInstanceForProvider = (opts: ComputeOptions, p?: string): string =>
|
||||
opts.defaults[normalizeProvider(p)] ?? "t3.medium";
|
||||
|
||||
// Human labels for the cloud-provider selector. The option VALUES come from the
|
||||
// fetched SSOT (opts.providers); this only supplies display text + the default tag.
|
||||
const CLOUD_PROVIDER_LABELS: Record<string, string> = {
|
||||
aws: "AWS (default)",
|
||||
gcp: "GCP",
|
||||
hetzner: "Hetzner",
|
||||
};
|
||||
const cloudProviderOptionLabel = (v: string): string => CLOUD_PROVIDER_LABELS[v] ?? v;
|
||||
|
||||
const RUNTIME_OPTIONS = ["claude-code", "codex", "hermes", "openclaw", "kimi", "kimi-cli", "external"];
|
||||
const RESOLUTIONS = ["1280x720", "1440x900", "1920x1080", "2560x1440"];
|
||||
@@ -87,6 +102,12 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
|
||||
const [saving, setSaving] = useState(false);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
const [success, setSuccess] = useState(false);
|
||||
// core#2489: provider + instance-type dropdowns are populated from the
|
||||
// workspace-server SSOT (GET /workspaces/:id/compute-options) so they can't
|
||||
// drift from what the PATCH validation accepts. Start from the offline fallback
|
||||
// and replace it once the fetch resolves; on fetch error we keep the fallback
|
||||
// (the dropdowns still work, just from the in-bundle mirror).
|
||||
const [computeOptions, setComputeOptions] = useState<ComputeOptions>(FALLBACK_COMPUTE_OPTIONS);
|
||||
|
||||
useEffect(() => {
|
||||
setForm(initial);
|
||||
@@ -94,6 +115,30 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
|
||||
setSuccess(false);
|
||||
}, [initial]);
|
||||
|
||||
useEffect(() => {
|
||||
let cancelled = false;
|
||||
(async () => {
|
||||
try {
|
||||
const opts = await api.get<Partial<ComputeOptions>>(`/workspaces/${workspaceId}/compute-options`);
|
||||
if (cancelled) return;
|
||||
// Defensive: only adopt a well-formed payload; otherwise keep the fallback.
|
||||
if (opts && Array.isArray(opts.providers) && opts.providers.length > 0 && opts.instanceTypes && opts.defaults) {
|
||||
setComputeOptions({
|
||||
providers: opts.providers,
|
||||
instanceTypes: opts.instanceTypes,
|
||||
defaults: opts.defaults,
|
||||
});
|
||||
}
|
||||
} catch {
|
||||
// Fetch failed (offline / older server) — keep FALLBACK_COMPUTE_OPTIONS.
|
||||
// The dropdowns stay usable; worst case they show the in-bundle mirror.
|
||||
}
|
||||
})();
|
||||
return () => {
|
||||
cancelled = true;
|
||||
};
|
||||
}, [workspaceId]);
|
||||
|
||||
const workspaceAccess = formatAccess(data.workspaceAccess);
|
||||
const maxConcurrentTasks = data.maxConcurrentTasks ? String(data.maxConcurrentTasks) : "platform-managed";
|
||||
const deliveryMode = data.deliveryMode || "push";
|
||||
@@ -208,8 +253,8 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
|
||||
id="cloud-provider"
|
||||
label="Cloud provider"
|
||||
value={normalizeProvider(form.provider)}
|
||||
options={CLOUD_PROVIDER_OPTIONS.map((p) => p.value)}
|
||||
optionLabel={(v) => CLOUD_PROVIDER_OPTIONS.find((p) => p.value === v)?.label ?? v}
|
||||
options={computeOptions.providers}
|
||||
optionLabel={cloudProviderOptionLabel}
|
||||
// Switching cloud resets the instance type to the new provider's
|
||||
// default (an AWS t3.* is invalid on Hetzner, etc.) — also keeps the
|
||||
// instance-type dropdown below in sync with the provider's sizes.
|
||||
@@ -217,9 +262,9 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
|
||||
setForm((s) => ({
|
||||
...s,
|
||||
provider,
|
||||
instanceType: instanceTypesForProvider(provider).includes(s.instanceType)
|
||||
instanceType: instanceTypesForProvider(computeOptions, provider).includes(s.instanceType)
|
||||
? s.instanceType
|
||||
: defaultInstanceForProvider(provider),
|
||||
: defaultInstanceForProvider(computeOptions, provider),
|
||||
}))
|
||||
}
|
||||
/>
|
||||
@@ -228,7 +273,7 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
|
||||
id="instance-type"
|
||||
label="Instance type"
|
||||
value={form.instanceType}
|
||||
options={instanceTypesForProvider(form.provider)}
|
||||
options={instanceTypesForProvider(computeOptions, form.provider)}
|
||||
onChange={(instanceType) => setForm((s) => ({ ...s, instanceType }))}
|
||||
/>
|
||||
<label className="grid gap-1" htmlFor="root-volume-gb">
|
||||
@@ -348,7 +393,10 @@ function formFromData(data: {
|
||||
return {
|
||||
runtime: data.runtime || "claude-code",
|
||||
provider,
|
||||
instanceType: data.instanceType || defaultInstanceForProvider(provider),
|
||||
// Falls back to the offline default only when no instance type is persisted;
|
||||
// the server SSOT default matches FALLBACK_COMPUTE_OPTIONS, and the dropdown
|
||||
// re-syncs to the fetched options once they resolve.
|
||||
instanceType: data.instanceType || defaultInstanceForProvider(FALLBACK_COMPUTE_OPTIONS, provider),
|
||||
rootGB: String(data.rootGB || DEFAULT_HEADLESS_ROOT_GB),
|
||||
displayEnabled: !!data.displayMode && data.displayMode !== "none",
|
||||
displayMode: data.displayMode && data.displayMode !== "none" ? data.displayMode : "desktop-control",
|
||||
|
||||
@@ -3,12 +3,14 @@ import { cleanup, fireEvent, render, screen, waitFor } from "@testing-library/re
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
const apiPatch = vi.fn();
|
||||
const apiGet = vi.fn();
|
||||
const updateNodeData = vi.fn();
|
||||
const restartWorkspace = vi.fn();
|
||||
|
||||
vi.mock("@/lib/api", () => ({
|
||||
api: {
|
||||
patch: (path: string, body: unknown) => apiPatch(path, body),
|
||||
get: (path: string) => apiGet(path),
|
||||
},
|
||||
}));
|
||||
|
||||
@@ -38,6 +40,12 @@ afterEach(() => {
|
||||
|
||||
beforeEach(() => {
|
||||
apiPatch.mockReset();
|
||||
apiGet.mockReset();
|
||||
// Default: compute-options fetch rejects → component keeps its in-bundle
|
||||
// fallback SSOT. Existing assertions (t3.medium / cpx31 / provider list) are
|
||||
// satisfied by the fallback, which mirrors the server. Individual tests that
|
||||
// exercise the fetch path override this with mockResolvedValueOnce.
|
||||
apiGet.mockRejectedValue(new Error("no compute-options in this test"));
|
||||
restartWorkspace.mockReset();
|
||||
updateNodeData.mockReset();
|
||||
});
|
||||
@@ -358,6 +366,76 @@ describe("ContainerConfigTab", () => {
|
||||
confirmSpy.mockRestore();
|
||||
});
|
||||
|
||||
// core#2489: the provider + instance-type dropdowns are populated from the
|
||||
// workspace-server SSOT (GET /workspaces/:id/compute-options), so the UI can't
|
||||
// offer an option the backend then rejects. This proves the fetch drives the
|
||||
// dropdowns: a server-only instance type appears once the fetch resolves.
|
||||
it("populates instance-type options from the compute-options SSOT endpoint", async () => {
|
||||
apiGet.mockResolvedValueOnce({
|
||||
providers: ["aws", "hetzner", "gcp"],
|
||||
instanceTypes: {
|
||||
aws: ["t3.medium", "t3.large", "z9.future"], // z9.future is server-only
|
||||
hetzner: ["cpx31"],
|
||||
gcp: ["e2-standard-2"],
|
||||
},
|
||||
defaults: { aws: "t3.medium", hetzner: "cpx31", gcp: "e2-standard-2" },
|
||||
});
|
||||
|
||||
render(
|
||||
<ContainerConfigTab
|
||||
workspaceId="ws-opts"
|
||||
data={{
|
||||
runtime: "claude-code",
|
||||
status: "online",
|
||||
needsRestart: false,
|
||||
activeTasks: 0,
|
||||
maxConcurrentTasks: null,
|
||||
workspaceAccess: "none",
|
||||
deliveryMode: "push",
|
||||
compute: { instance_type: "t3.large", provider: "aws", volume: { root_gb: 30 } },
|
||||
}}
|
||||
/>,
|
||||
);
|
||||
|
||||
await waitFor(() => expect(apiGet).toHaveBeenCalledWith("/workspaces/ws-opts/compute-options"));
|
||||
// The server-only instance type appears in the dropdown after the fetch.
|
||||
await waitFor(() =>
|
||||
expect(
|
||||
Array.from(screen.getByLabelText("Instance type").querySelectorAll("option")).map((o) => o.getAttribute("value")),
|
||||
).toContain("z9.future"),
|
||||
);
|
||||
});
|
||||
|
||||
// core#2489: if the compute-options fetch fails, the dropdowns must stay usable
|
||||
// via the in-bundle fallback (no crash, no empty selector).
|
||||
it("falls back to the in-bundle option set when the compute-options fetch fails", async () => {
|
||||
apiGet.mockRejectedValueOnce(new Error("network down"));
|
||||
|
||||
render(
|
||||
<ContainerConfigTab
|
||||
workspaceId="ws-opts"
|
||||
data={{
|
||||
runtime: "claude-code",
|
||||
status: "online",
|
||||
needsRestart: false,
|
||||
activeTasks: 0,
|
||||
maxConcurrentTasks: null,
|
||||
workspaceAccess: "none",
|
||||
deliveryMode: "push",
|
||||
compute: { instance_type: "t3.large", provider: "aws", volume: { root_gb: 30 } },
|
||||
}}
|
||||
/>,
|
||||
);
|
||||
|
||||
await waitFor(() => expect(apiGet).toHaveBeenCalled());
|
||||
// Fallback list still renders the known AWS sizes.
|
||||
const values = Array.from(
|
||||
screen.getByLabelText("Instance type").querySelectorAll("option"),
|
||||
).map((o) => o.getAttribute("value"));
|
||||
expect(values).toContain("t3.medium");
|
||||
expect(values).toContain("m6i.xlarge");
|
||||
});
|
||||
|
||||
it("does not treat a non-provider edit as a recreate (no confirm; aws default omitted)", async () => {
|
||||
const confirmSpy = vi.spyOn(window, "confirm").mockReturnValue(true);
|
||||
render(
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { switchOrgUrl } from "../org-switch";
|
||||
|
||||
describe("switchOrgUrl", () => {
|
||||
it("builds the target org's subdomain URL from the current host", () => {
|
||||
expect(
|
||||
switchOrgUrl("agents-team.moleculesai.app", "https:", "agents-team", "reno-stars"),
|
||||
).toBe("https://reno-stars.moleculesai.app");
|
||||
});
|
||||
|
||||
it("returns null for a no-op (switching to the current org)", () => {
|
||||
expect(
|
||||
switchOrgUrl("agents-team.moleculesai.app", "https:", "agents-team", "agents-team"),
|
||||
).toBeNull();
|
||||
});
|
||||
|
||||
it("returns null when the target slug is empty", () => {
|
||||
expect(switchOrgUrl("a.example.com", "https:", "a", "")).toBeNull();
|
||||
});
|
||||
|
||||
it("falls back to dropping the first label when currentSlug doesn't prefix the host", () => {
|
||||
expect(switchOrgUrl("foo.example.com", "https:", "", "bar")).toBe(
|
||||
"https://bar.example.com",
|
||||
);
|
||||
});
|
||||
|
||||
it("returns null when there is no apex to derive (single-label host)", () => {
|
||||
expect(switchOrgUrl("localhost", "http:", "", "bar")).toBeNull();
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,23 @@
|
||||
// Org switching across tenant subdomains.
|
||||
//
|
||||
// Each org is its own tenant at <slug>.<apex> (e.g. agents-team.moleculesai.app),
|
||||
// so switching orgs from the canvas topbar means navigating to the target org's
|
||||
// subdomain. switchOrgUrl derives that URL from the current location, or returns
|
||||
// null when it's a no-op (same org / empty target) or the apex can't be resolved.
|
||||
|
||||
export function switchOrgUrl(
|
||||
hostname: string,
|
||||
protocol: string,
|
||||
currentSlug: string,
|
||||
targetSlug: string,
|
||||
): string | null {
|
||||
if (!targetSlug || targetSlug === currentSlug) return null;
|
||||
// Prefer stripping the known current-org label; otherwise drop the first
|
||||
// label as a best-effort apex (covers hosts we didn't seed a slug for).
|
||||
const apex =
|
||||
currentSlug && hostname.startsWith(`${currentSlug}.`)
|
||||
? hostname.slice(currentSlug.length + 1)
|
||||
: hostname.split(".").slice(1).join(".");
|
||||
if (!apex) return null;
|
||||
return `${protocol}//${targetSlug}.${apex}`;
|
||||
}
|
||||
@@ -76,7 +76,7 @@ fi
|
||||
log "Step 3 — Seed a file inside /workspace and ask agent to reference it"
|
||||
# Relies on /workspace being writable by the platform (we copy as root via
|
||||
# docker exec, mimicking the path a real agent would use through its tools).
|
||||
CONTAINER=$(docker ps --format '{{.Names}}' | grep -E "^ws-${WSID:0:12}" | head -1)
|
||||
CONTAINER=$(docker ps --format '{{.Names}}' | grep -E "^ws-${WSID}" | head -1)
|
||||
[ -n "$CONTAINER" ] || { echo "container not found"; exit 1; }
|
||||
docker exec "$CONTAINER" sh -c 'echo "E2E report body $(date -u +%s)" > /workspace/e2e-report.txt'
|
||||
|
||||
|
||||
@@ -145,7 +145,7 @@ check_runtime() {
|
||||
fails=$((fails + 1)); return
|
||||
fi
|
||||
local container
|
||||
container=$(docker ps --format '{{.Names}}' | grep -E "^ws-${wsid:0:12}" | head -1)
|
||||
container=$(docker ps --format '{{.Names}}' | grep -E "^ws-${wsid}" | head -1)
|
||||
[ -z "$container" ] && { echo "FAIL $label: container not found"; fails=$((fails + 1)); return; }
|
||||
|
||||
has_patch_in_container "$container" || { echo "FAIL $label: platform helpers missing"; fails=$((fails + 1)); return; }
|
||||
|
||||
@@ -94,8 +94,8 @@ check_contains "Upload child prompt" "replaced" "$CHILD_UPLOAD"
|
||||
|
||||
# Verify prompts in containers
|
||||
sleep 2
|
||||
ROOT_CONTAINER=$(docker ps --filter "name=ws-${ROOT:0:12}" -q | head -1)
|
||||
CHILD_CONTAINER=$(docker ps --filter "name=ws-${CHILD:0:12}" -q | head -1)
|
||||
ROOT_CONTAINER=$(docker ps --filter "name=ws-${ROOT}" -q | head -1)
|
||||
CHILD_CONTAINER=$(docker ps --filter "name=ws-${CHILD}" -q | head -1)
|
||||
|
||||
ROOT_HAS_PROMPT=$(docker exec $ROOT_CONTAINER cat /configs/system-prompt.md 2>/dev/null | head -1)
|
||||
check_contains "Root container has prompt" "Root Agent" "$ROOT_HAS_PROMPT"
|
||||
|
||||
@@ -153,19 +153,17 @@ RT_HM_ID=$(echo "$R" | jq_extract "['id']")
|
||||
|
||||
# Wait for containers to start (poll up to 30s for first one to appear)
|
||||
if command -v docker &>/dev/null; then
|
||||
short_cc="${RT_CC_ID:0:12}"
|
||||
for _ in 1 2 3 4 5 6; do
|
||||
sleep 5
|
||||
if docker inspect "ws-${short_cc}" >/dev/null 2>&1; then break; fi
|
||||
if docker inspect "ws-${RT_CC_ID}" >/dev/null 2>&1; then break; fi
|
||||
done
|
||||
|
||||
_check_image() {
|
||||
local ws_id="$1" expected_tag="$2" label="$3"
|
||||
local short_id="${ws_id:0:12}"
|
||||
# Poll up to 30s for image to appear
|
||||
local actual_image="NOT_FOUND"
|
||||
for _ in 1 2 3 4 5 6; do
|
||||
actual_image=$(docker inspect "ws-${short_id}" --format '{{.Config.Image}}' 2>/dev/null || echo "NOT_FOUND")
|
||||
actual_image=$(docker inspect "ws-${ws_id}" --format '{{.Config.Image}}' 2>/dev/null || echo "NOT_FOUND")
|
||||
if echo "$actual_image" | grep -qF "$expected_tag"; then break; fi
|
||||
sleep 5
|
||||
done
|
||||
@@ -216,10 +214,9 @@ if echo "$R" | grep -qF "saved"; then
|
||||
curl -s -X POST "$BASE/workspaces/$RT_CX_ID/restart" > /dev/null 2>&1
|
||||
# Poll up to 30s for the new container image to appear (restart can take a while)
|
||||
if command -v docker &>/dev/null; then
|
||||
short_id="${RT_CX_ID:0:12}"
|
||||
for _ in 1 2 3 4 5 6; do
|
||||
sleep 5
|
||||
actual=$(docker inspect "ws-${short_id}" --format '{{.Config.Image}}' 2>/dev/null || echo "")
|
||||
actual=$(docker inspect "ws-${RT_CX_ID}" --format '{{.Config.Image}}' 2>/dev/null || echo "")
|
||||
if echo "$actual" | grep -qF "openclaw"; then break; fi
|
||||
done
|
||||
_check_image "$RT_CX_ID" "openclaw" "Runtime change codex to openclaw on restart"
|
||||
|
||||
@@ -191,8 +191,29 @@ except Exception:
|
||||
}
|
||||
|
||||
container_running() { # container_running <ws-id> -> echoes name if running
|
||||
local short="${1:0:12}"
|
||||
docker ps --filter "name=ws-${short}" --filter "status=running" --format '{{.Names}}' 2>/dev/null | head -1
|
||||
docker ps --filter "name=ws-${1}" --filter "status=running" --format '{{.Names}}' 2>/dev/null | head -1
|
||||
}
|
||||
|
||||
diagnose_provision() {
|
||||
local wsid="${1:-}"
|
||||
local container
|
||||
container=$(container_running "$wsid")
|
||||
echo "--- DIAGNOSE provisioning for $wsid ---"
|
||||
echo "last_sample_error: ${LAST:-<none>}"
|
||||
echo "container_running: ${container:-<none>}"
|
||||
if [ -n "$container" ]; then
|
||||
echo "--- container logs ($container) ---"
|
||||
docker logs "$container" 2>&1 | tail -n 60 || true
|
||||
echo "--- container env ---"
|
||||
docker inspect "$container" --format '{{json .Config.Env}}' 2>&1 || true
|
||||
echo "--- container reachability test ---"
|
||||
docker exec "$container" sh -c 'echo "platform_url=$PLATFORM_URL"; wget -qO- "$PLATFORM_URL/health" 2>&1 || true' || true
|
||||
fi
|
||||
echo "--- all ws-* containers ---"
|
||||
docker ps --filter "name=ws-" --format '{{.Names}} {{.Status}}' 2>/dev/null || true
|
||||
echo "--- all ws-* volumes ---"
|
||||
docker volume ls -q 2>/dev/null | grep '^ws-' || true
|
||||
echo "--- end diagnose ---"
|
||||
}
|
||||
|
||||
cleanup() {
|
||||
@@ -203,16 +224,11 @@ cleanup() {
|
||||
# SCOPED teardown — only the workspace this test created. Never a blanket
|
||||
# sweep (other dev workspaces may be live on this shared daemon).
|
||||
e2e_delete_workspace "$WSID" "" >/dev/null 2>&1 || true
|
||||
local short="${WSID:0:12}"
|
||||
docker rm -f "ws-${short}" >/dev/null 2>&1 || true
|
||||
# Volume naming is split in the provisioner: configs + claude-sessions use the
|
||||
# 12-char short id (ConfigVolumeName/ClaudeSessionVolumeName), but the
|
||||
# /workspace volume uses the FULL UUID (buildWorkspaceMount: ws-<id>-workspace).
|
||||
# Remove BOTH forms so neither leaks.
|
||||
docker rm -f "ws-${WSID}" >/dev/null 2>&1 || true
|
||||
docker volume rm -f \
|
||||
"ws-${short}-configs" "ws-${short}-claude-sessions" \
|
||||
"ws-${short}-workspace" "ws-${WSID}-workspace" >/dev/null 2>&1 || true
|
||||
echo "cleaned workspace $WSID + ws-${short} container/volumes"
|
||||
"ws-${WSID}-configs" "ws-${WSID}-claude-sessions" \
|
||||
"ws-${WSID}-workspace" >/dev/null 2>&1 || true
|
||||
echo "cleaned workspace $WSID + ws-${WSID} container/volumes"
|
||||
fi
|
||||
# Restore the cache tag to whatever it pointed at before we retagged it, so a
|
||||
# stub run doesn't leave the real claude-code tag aliased to the stub.
|
||||
@@ -331,8 +347,7 @@ if [ -z "$WSID" ]; then
|
||||
exit 1
|
||||
fi
|
||||
pass "workspace created: $WSID"
|
||||
SHORT="${WSID:0:12}"
|
||||
CONFIG_VOL="ws-${SHORT}-configs"
|
||||
CONFIG_VOL="ws-${WSID}-configs"
|
||||
|
||||
# Mint a workspace bearer for the WorkspaceAuth-gated secret + /restart calls.
|
||||
WTOKEN=$(e2e_mint_workspace_token "$WSID" || true)
|
||||
@@ -436,8 +451,9 @@ for _ in $(seq 1 "$ONLINE_TIMEOUT"); do
|
||||
sleep 1
|
||||
done
|
||||
check "workspace reached online (status=$STATUS)" "online" "$STATUS"
|
||||
if [ "$FAIL" -gt 0 ]; then diagnose_provision "$WSID"; echo "=== Results: $PASS passed, $FAIL failed ==="; exit 1; fi
|
||||
RUN=$(container_running "$WSID")
|
||||
if [ -n "$RUN" ]; then pass "container running: $RUN"; else fail "no running ws-${WSID:0:12} container" "docker ps shows none"; fi
|
||||
if [ -n "$RUN" ]; then pass "container running: $RUN"; else fail "no running ws-${WSID} container" "docker ps shows none"; fi
|
||||
echo ""
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
@@ -473,6 +489,7 @@ else
|
||||
sleep 1
|
||||
done
|
||||
check "workspace back online after restart (status=$STATUS)" "online" "$STATUS"
|
||||
if [ "$FAIL" -gt 0 ]; then diagnose_provision "$WSID"; echo "=== Results: $PASS passed, $FAIL failed ==="; exit 1; fi
|
||||
# Explicit negative on the exact bug signature.
|
||||
if echo "$LAST" | grep -qiF "config volume is empty"; then
|
||||
fail "restart hit 'config volume is empty' — restart-survival REGRESSION" "$LAST"
|
||||
|
||||
@@ -33,6 +33,7 @@ import (
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner"
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
@@ -239,7 +240,7 @@ func (h *WorkspaceHandler) applyConciergeProvisionConfig(
|
||||
}
|
||||
}
|
||||
if len(base) == 0 && h.provisioner != nil {
|
||||
if b, err := h.provisioner.ExecRead(ctx, configDirName(workspaceID), "/configs/config.yaml"); err == nil {
|
||||
if b, err := h.provisioner.ExecRead(ctx, provisioner.ContainerName(workspaceID), "/configs/config.yaml"); err == nil {
|
||||
base = b
|
||||
}
|
||||
}
|
||||
@@ -399,7 +400,7 @@ func conciergeIdentityPresent(ctx context.Context, prov localProvisionerIsRunnin
|
||||
// that doesn't expose ExecRead.
|
||||
return true
|
||||
}
|
||||
body, err := reader.ExecRead(ctx, configDirName(id), "/configs/system-prompt.md")
|
||||
body, err := reader.ExecRead(ctx, provisioner.ContainerName(id), "/configs/system-prompt.md")
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -31,28 +31,72 @@ type workspaceDisplayResponse struct {
|
||||
Status string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
// workspaceComputeInstanceAllowlist is keyed by cloud provider (multi-provider /
|
||||
// in-place switch): each provider's box accepts only that provider's machine
|
||||
// sizes (an AWS t3.* is meaningless on Hetzner, and vice-versa). Mirrors the CP
|
||||
// provider SSOT — keep in lock-step with the controlplane provider configs
|
||||
// (Hetzner ServerType cpx*/cax*, GCP MachineType e2-*, AWS EC2 t3*/m6i*/c6i*).
|
||||
// TestValidateWorkspaceCompute_Provider / _InstanceTypePerProvider pin the sets.
|
||||
// "" provider = AWS default.
|
||||
var workspaceComputeInstanceAllowlist = map[string]map[string]struct{}{
|
||||
// SSOT for cloud-provider + instance-type metadata (core#2489).
|
||||
//
|
||||
// This file is the SINGLE source of truth the workspace-server validates
|
||||
// against AND the canvas Container-Config tab renders its dropdowns from (via
|
||||
// GET /workspaces/:id/compute-options, see ComputeOptions below). Previously the
|
||||
// canvas hardcoded a parallel copy of these lists in ContainerConfigTab.tsx; the
|
||||
// two could drift so the UI offered a (provider, instance-type) the backend
|
||||
// allowlist then rejected with a 400. The canvas now derives its options from
|
||||
// this endpoint, so drift is impossible by construction.
|
||||
//
|
||||
// The ordered slices below are the canonical form. workspaceComputeInstanceAllowlist
|
||||
// (the O(1) validation set) is DERIVED from them in init(), so the ordered list
|
||||
// the canvas renders and the set the backend validates can never disagree.
|
||||
//
|
||||
// Mirrors the CP provider SSOT — keep in lock-step with the controlplane provider
|
||||
// configs (Hetzner ServerType cpx*/cax*, GCP MachineType e2-*, AWS EC2
|
||||
// t3*/m6i*/c6i*). TestValidateWorkspaceCompute_Provider / _InstanceTypePerProvider
|
||||
// pin the sets. "" provider = AWS default.
|
||||
|
||||
// workspaceComputeProvidersOrdered is the canonical provider order (AWS first =
|
||||
// default). The canvas renders the provider dropdown in this order.
|
||||
var workspaceComputeProvidersOrdered = []string{"aws", "hetzner", "gcp"}
|
||||
|
||||
// workspaceComputeInstanceTypesOrdered lists each provider's machine sizes in the
|
||||
// order the canvas should render them. An AWS t3.* is meaningless on Hetzner, and
|
||||
// vice-versa, so the set is provider-scoped.
|
||||
var workspaceComputeInstanceTypesOrdered = map[string][]string{
|
||||
"aws": {
|
||||
"t3.medium": {}, "t3.large": {}, "t3.xlarge": {}, "t3.2xlarge": {},
|
||||
"m6i.large": {}, "m6i.xlarge": {}, "c6i.xlarge": {},
|
||||
"t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge",
|
||||
"m6i.large", "m6i.xlarge", "c6i.xlarge",
|
||||
},
|
||||
"hetzner": {
|
||||
"cpx11": {}, "cpx21": {}, "cpx31": {}, "cpx41": {}, "cpx51": {},
|
||||
"cax11": {}, "cax21": {}, "cax31": {}, "cax41": {},
|
||||
"cpx11", "cpx21", "cpx31", "cpx41", "cpx51",
|
||||
"cax11", "cax21", "cax31", "cax41",
|
||||
},
|
||||
"gcp": {
|
||||
"e2-small": {}, "e2-medium": {},
|
||||
"e2-standard-2": {}, "e2-standard-4": {}, "e2-standard-8": {},
|
||||
"e2-small", "e2-medium",
|
||||
"e2-standard-2", "e2-standard-4", "e2-standard-8",
|
||||
},
|
||||
}
|
||||
|
||||
// workspaceComputeDefaultInstanceByProvider is the per-provider default machine
|
||||
// size the canvas pre-selects when switching providers (an AWS t3.* is invalid on
|
||||
// Hetzner, so the switch resets to the new provider's default).
|
||||
var workspaceComputeDefaultInstanceByProvider = map[string]string{
|
||||
"aws": "t3.medium",
|
||||
"hetzner": "cpx31",
|
||||
"gcp": "e2-standard-2",
|
||||
}
|
||||
|
||||
// workspaceComputeInstanceAllowlist is the O(1) validation set, keyed by cloud
|
||||
// provider. DERIVED from workspaceComputeInstanceTypesOrdered in init() so the
|
||||
// ordered list (what the canvas renders) and the set (what the backend validates)
|
||||
// stay in lock-step — you cannot add an instance type to one without the other.
|
||||
var workspaceComputeInstanceAllowlist = map[string]map[string]struct{}{}
|
||||
|
||||
func init() {
|
||||
for provider, types := range workspaceComputeInstanceTypesOrdered {
|
||||
set := make(map[string]struct{}, len(types))
|
||||
for _, t := range types {
|
||||
set[t] = struct{}{}
|
||||
}
|
||||
workspaceComputeInstanceAllowlist[provider] = set
|
||||
}
|
||||
}
|
||||
|
||||
// normalizeCloudProvider maps "" → "aws" so the in-place switch comparison
|
||||
// treats the default and an explicit "aws" as the same cloud (no spurious switch).
|
||||
func normalizeCloudProvider(p string) string {
|
||||
@@ -88,10 +132,15 @@ func instanceTypeAllowedForProvider(provider, instanceType string) bool {
|
||||
// change here (and the CP itself fail-closes an unwired provider with a 422).
|
||||
// "" = default (AWS) and is always accepted. This is the gate the switch-provider
|
||||
// flow reuses to reject a bad provider with a clean 400 before any CP round-trip.
|
||||
var workspaceComputeProviderAllowlist = map[string]struct{}{
|
||||
"aws": {},
|
||||
"gcp": {},
|
||||
"hetzner": {},
|
||||
// DERIVED from workspaceComputeProvidersOrdered (the SSOT, core#2489) in init() so
|
||||
// the set the backend validates and the ordered list the canvas renders cannot
|
||||
// drift.
|
||||
var workspaceComputeProviderAllowlist = map[string]struct{}{}
|
||||
|
||||
func init() {
|
||||
for _, p := range workspaceComputeProvidersOrdered {
|
||||
workspaceComputeProviderAllowlist[p] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
func validateWorkspaceCompute(compute models.WorkspaceCompute) error {
|
||||
@@ -262,6 +311,55 @@ func withStoredCompute(ctx context.Context, workspaceID string, payload models.C
|
||||
return payload
|
||||
}
|
||||
|
||||
// workspaceComputeOptionsResponse is the SSOT payload the canvas Container-Config
|
||||
// tab consumes to populate its provider + instance-type dropdowns (core#2489).
|
||||
// It is derived entirely from the allowlist + defaults in this file, so the UI
|
||||
// can never offer a (provider, instance-type) the backend then rejects.
|
||||
type workspaceComputeOptionsResponse struct {
|
||||
// Providers in canonical render order (AWS first = default).
|
||||
Providers []string `json:"providers"`
|
||||
// InstanceTypes per provider, in canonical render order.
|
||||
InstanceTypes map[string][]string `json:"instanceTypes"`
|
||||
// Defaults maps each provider → its default instance type (the canvas
|
||||
// pre-selects this when switching providers).
|
||||
Defaults map[string]string `json:"defaults"`
|
||||
}
|
||||
|
||||
// buildComputeOptions assembles the SSOT response from the allowlist + defaults.
|
||||
// Pure (no DB / no gin) so it can be unit-tested directly and reused.
|
||||
func buildComputeOptions() workspaceComputeOptionsResponse {
|
||||
providers := make([]string, len(workspaceComputeProvidersOrdered))
|
||||
copy(providers, workspaceComputeProvidersOrdered)
|
||||
|
||||
instanceTypes := make(map[string][]string, len(workspaceComputeInstanceTypesOrdered))
|
||||
for _, p := range providers {
|
||||
src := workspaceComputeInstanceTypesOrdered[p]
|
||||
dst := make([]string, len(src))
|
||||
copy(dst, src)
|
||||
instanceTypes[p] = dst
|
||||
}
|
||||
|
||||
defaults := make(map[string]string, len(workspaceComputeDefaultInstanceByProvider))
|
||||
for k, v := range workspaceComputeDefaultInstanceByProvider {
|
||||
defaults[k] = v
|
||||
}
|
||||
|
||||
return workspaceComputeOptionsResponse{
|
||||
Providers: providers,
|
||||
InstanceTypes: instanceTypes,
|
||||
Defaults: defaults,
|
||||
}
|
||||
}
|
||||
|
||||
// ComputeOptions handles GET /workspaces/:id/compute-options. It returns the
|
||||
// cloud-provider + instance-type metadata the canvas Container-Config tab renders
|
||||
// its dropdowns from — the SAME data validateWorkspaceCompute enforces (core#2489).
|
||||
// Static (derived from the in-binary allowlist), so it needs no DB round-trip; the
|
||||
// :id is scoped only by the WorkspaceAuth middleware on the route group.
|
||||
func (h *WorkspaceHandler) ComputeOptions(c *gin.Context) {
|
||||
c.JSON(200, buildComputeOptions())
|
||||
}
|
||||
|
||||
// Display handles GET /workspaces/:id/display.
|
||||
func (h *WorkspaceHandler) Display(c *gin.Context) {
|
||||
workspaceID := c.Param("id")
|
||||
|
||||
@@ -375,6 +375,103 @@ func TestWithStoredCompute_LoadsComputeForRestartPayloads(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// core#2489: the allowlist (validation set) MUST be derived from the ordered
|
||||
// lists the canvas renders, so the UI and the backend can never disagree about
|
||||
// which (provider, instance-type) pairs are valid. This pins that the derived
|
||||
// set exactly matches the ordered source — adding to one without the other fails.
|
||||
func TestComputeOptions_AllowlistDerivedFromOrderedSSOT(t *testing.T) {
|
||||
// Every ordered instance type is in the validation set (and vice-versa).
|
||||
for provider, types := range workspaceComputeInstanceTypesOrdered {
|
||||
set, ok := workspaceComputeInstanceAllowlist[provider]
|
||||
if !ok {
|
||||
t.Fatalf("allowlist missing provider %q present in ordered SSOT", provider)
|
||||
}
|
||||
if len(set) != len(types) {
|
||||
t.Fatalf("provider %q: ordered list (%d) and allowlist set (%d) drifted", provider, len(types), len(set))
|
||||
}
|
||||
for _, it := range types {
|
||||
if _, ok := set[it]; !ok {
|
||||
t.Fatalf("provider %q: ordered instance %q missing from validation allowlist", provider, it)
|
||||
}
|
||||
}
|
||||
}
|
||||
// No extra providers in the set that aren't in the ordered list.
|
||||
if len(workspaceComputeInstanceAllowlist) != len(workspaceComputeInstanceTypesOrdered) {
|
||||
t.Fatalf("allowlist has providers not present in the ordered SSOT")
|
||||
}
|
||||
// Provider allowlist derived from the ordered providers.
|
||||
if len(workspaceComputeProviderAllowlist) != len(workspaceComputeProvidersOrdered) {
|
||||
t.Fatalf("provider allowlist (%d) drifted from ordered providers (%d)", len(workspaceComputeProviderAllowlist), len(workspaceComputeProvidersOrdered))
|
||||
}
|
||||
for _, p := range workspaceComputeProvidersOrdered {
|
||||
if _, ok := workspaceComputeProviderAllowlist[p]; !ok {
|
||||
t.Fatalf("provider allowlist missing ordered provider %q", p)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// core#2489: the per-provider defaults the canvas pre-selects on a provider switch
|
||||
// MUST themselves be valid instance types for that provider — otherwise the switch
|
||||
// produces a PATCH the backend immediately rejects.
|
||||
func TestComputeOptions_DefaultsAreValidForTheirProvider(t *testing.T) {
|
||||
for provider, def := range workspaceComputeDefaultInstanceByProvider {
|
||||
if !instanceTypeAllowedForProvider(provider, def) {
|
||||
t.Errorf("default instance %q for provider %q is not in that provider's allowlist", def, provider)
|
||||
}
|
||||
}
|
||||
// Every provider must have a default (so the switch never lands on "").
|
||||
for _, p := range workspaceComputeProvidersOrdered {
|
||||
if workspaceComputeDefaultInstanceByProvider[p] == "" {
|
||||
t.Errorf("provider %q has no default instance type", p)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// core#2489: the GET /compute-options endpoint returns exactly the SSOT data the
|
||||
// canvas renders dropdowns from. Every (provider, instance-type) it advertises
|
||||
// MUST pass validateWorkspaceCompute — the whole point of the consolidation.
|
||||
func TestWorkspaceComputeOptions_ReturnsSSOTAndEveryOptionValidates(t *testing.T) {
|
||||
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-opts"}}
|
||||
c.Request = httptest.NewRequest("GET", "/workspaces/ws-opts/compute-options", nil)
|
||||
|
||||
handler.ComputeOptions(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected status 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp workspaceComputeOptionsResponse
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("failed to parse compute-options response: %v", err)
|
||||
}
|
||||
|
||||
// AWS first (default) in the provider order.
|
||||
if len(resp.Providers) == 0 || resp.Providers[0] != "aws" {
|
||||
t.Fatalf("providers = %v, want aws first", resp.Providers)
|
||||
}
|
||||
// Every advertised (provider, instance-type) must pass backend validation.
|
||||
for _, provider := range resp.Providers {
|
||||
types, ok := resp.InstanceTypes[provider]
|
||||
if !ok || len(types) == 0 {
|
||||
t.Fatalf("compute-options advertised provider %q with no instance types", provider)
|
||||
}
|
||||
for _, it := range types {
|
||||
if !instanceTypeAllowedForProvider(provider, it) {
|
||||
t.Errorf("compute-options advertised %q/%q which the backend rejects (DRIFT)", provider, it)
|
||||
}
|
||||
}
|
||||
def := resp.Defaults[provider]
|
||||
if def == "" {
|
||||
t.Errorf("compute-options missing default for provider %q", provider)
|
||||
} else if !instanceTypeAllowedForProvider(provider, def) {
|
||||
t.Errorf("compute-options default %q for %q fails backend validation", def, provider)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestWorkspaceDisplay_NonDisplayWorkspaceReturnsUnavailable(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/events"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provlog"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
@@ -393,7 +394,7 @@ func (h *WorkspaceHandler) restartRuntimeFromConfig(ctx context.Context, id, wsN
|
||||
return dbRuntime
|
||||
}
|
||||
containerRuntime := dbRuntime
|
||||
containerName := configDirName(id) // ws-{id[:12]}
|
||||
containerName := provisioner.ContainerName(id) // ws-{id} (KI-013 full UUID)
|
||||
if cfgBytes, readErr := h.provisioner.ExecRead(ctx, containerName, "/configs/config.yaml"); readErr == nil {
|
||||
for _, line := range strings.Split(string(cfgBytes), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
|
||||
@@ -227,3 +227,58 @@ func TestCacheKey_SlugSeparator(t *testing.T) {
|
||||
t.Errorf("cacheKey collides on ambiguous splits")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTenantSlug(t *testing.T) {
|
||||
t.Setenv("MOLECULE_ORG_SLUG", "acme-corp")
|
||||
if got := tenantSlug(); got != "acme-corp" {
|
||||
t.Errorf("tenantSlug() = %q, want %q", got, "acme-corp")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTenantSlug_TrimSpace(t *testing.T) {
|
||||
t.Setenv("MOLECULE_ORG_SLUG", " spaced-slug ")
|
||||
if got := tenantSlug(); got != "spaced-slug" {
|
||||
t.Errorf("tenantSlug() = %q, want %q", got, "spaced-slug")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTenantSlug_Empty(t *testing.T) {
|
||||
t.Setenv("MOLECULE_ORG_SLUG", "")
|
||||
if got := tenantSlug(); got != "" {
|
||||
t.Errorf("tenantSlug() = %q, want empty", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCPSessionVerifyURL(t *testing.T) {
|
||||
t.Setenv("CP_UPSTREAM_URL", "https://cp.test")
|
||||
got := cpSessionVerifyURL("acme")
|
||||
want := "https://cp.test/cp/auth/tenant-member?slug=acme"
|
||||
if got != want {
|
||||
t.Errorf("cpSessionVerifyURL() = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCPSessionVerifyURL_TrailingSlash(t *testing.T) {
|
||||
t.Setenv("CP_UPSTREAM_URL", "https://cp.test/")
|
||||
got := cpSessionVerifyURL("acme")
|
||||
want := "https://cp.test/cp/auth/tenant-member?slug=acme"
|
||||
if got != want {
|
||||
t.Errorf("cpSessionVerifyURL() = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCPSessionVerifyURL_EscapeSlug(t *testing.T) {
|
||||
t.Setenv("CP_UPSTREAM_URL", "https://cp.test")
|
||||
got := cpSessionVerifyURL("acme corp")
|
||||
want := "https://cp.test/cp/auth/tenant-member?slug=acme+corp"
|
||||
if got != want {
|
||||
t.Errorf("cpSessionVerifyURL() = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCPSessionVerifyURL_NoCPConfigured(t *testing.T) {
|
||||
t.Setenv("CP_UPSTREAM_URL", "")
|
||||
if got := cpSessionVerifyURL("acme"); got != "" {
|
||||
t.Errorf("cpSessionVerifyURL() = %q, want empty when CP_UPSTREAM_URL unset", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -168,8 +168,17 @@ type cpProvisionRequest struct {
|
||||
// DataPersistence is the per-workspace durable-data choice (internal#734);
|
||||
// CP validates the enum at its provision edge and resolves the data volume
|
||||
// from it. Empty = auto (omitted on the wire).
|
||||
DataPersistence string `json:"data_persistence,omitempty"`
|
||||
Display WorkspaceDisplayConfig `json:"display,omitempty"`
|
||||
DataPersistence string `json:"data_persistence,omitempty"`
|
||||
// Kind forwards the workspace kind ("" / "workspace" ordinary, "platform"
|
||||
// = the org concierge) so the CP can select the platform-agent image
|
||||
// variant — the SaaS mirror of the local Docker provisioner's kind-driven
|
||||
// image preference (RFC docs/design/rfc-platform-agent.md; core#2495 SSOT:
|
||||
// the concierge is a normal workspace provisioned through this same path,
|
||||
// differing ONLY in image + config overlay). Omitted when empty so the
|
||||
// wire shape is unchanged for ordinary workspaces; an older CP simply
|
||||
// ignores the field.
|
||||
Kind string `json:"kind,omitempty"`
|
||||
Display WorkspaceDisplayConfig `json:"display,omitempty"`
|
||||
PlatformURL string `json:"platform_url"`
|
||||
Env map[string]string `json:"env"`
|
||||
// ConfigFiles are template + generated config files to write into the
|
||||
@@ -262,6 +271,7 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string,
|
||||
DiskGB: cfg.DiskGB,
|
||||
DataPersistence: cfg.DataPersistence,
|
||||
Provider: cfg.Provider,
|
||||
Kind: cfg.Kind,
|
||||
Display: cfg.Display,
|
||||
PlatformURL: cfg.PlatformURL,
|
||||
Env: env,
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
package provisioner
|
||||
|
||||
// cp_provisioner_kind_test.go — pins the kind passthrough on the CP provision
|
||||
// wire (core#2495 SSOT): a kind='platform' workspace (the org concierge) is
|
||||
// provisioned through this SAME path as every ordinary workspace, differing
|
||||
// only in the image the CP selects — which requires the CP to KNOW the kind.
|
||||
// Before this field, the CP picked the plain runtime image, the platform MCP
|
||||
// binary was absent, and the concierge hard-failed its MCP readiness gate.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// startCaptureCP spins a fake CP that captures the provision request body and
|
||||
// returns a minimal 201. Returns the provisioner wired at it + the body ptr.
|
||||
func startCaptureCP(t *testing.T) (*CPProvisioner, *[]byte) {
|
||||
t.Helper()
|
||||
var body []byte
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
b, _ := io.ReadAll(r.Body)
|
||||
body = b
|
||||
w.WriteHeader(http.StatusCreated)
|
||||
_, _ = w.Write([]byte(`{"instance_id":"i-test","private_ip":"10.0.0.9","state":"pending"}`))
|
||||
}))
|
||||
t.Cleanup(srv.Close)
|
||||
return &CPProvisioner{
|
||||
baseURL: srv.URL,
|
||||
orgID: "org-1",
|
||||
sharedSecret: "s3cret",
|
||||
adminToken: "tok-xyz",
|
||||
httpClient: srv.Client(),
|
||||
}, &body
|
||||
}
|
||||
|
||||
// The concierge: kind='platform' must reach the CP verbatim.
|
||||
func TestStart_ForwardsPlatformKind(t *testing.T) {
|
||||
p, body := startCaptureCP(t)
|
||||
_, err := p.Start(context.Background(), WorkspaceConfig{
|
||||
WorkspaceID: "ws-concierge",
|
||||
Runtime: "claude-code",
|
||||
Kind: WorkspaceKindPlatform,
|
||||
PlatformURL: "https://acme.example.com",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Start: %v", err)
|
||||
}
|
||||
var req map[string]any
|
||||
if err := json.Unmarshal(*body, &req); err != nil {
|
||||
t.Fatalf("unmarshal captured body: %v", err)
|
||||
}
|
||||
if got := req["kind"]; got != "platform" {
|
||||
t.Errorf("kind on the CP wire = %v, want \"platform\" — without it the CP picks the plain runtime image and the concierge loses its platform MCP (core#2495)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// Ordinary workspaces: the wire shape must be UNCHANGED (omitempty) so older
|
||||
// CPs see byte-identical requests.
|
||||
func TestStart_OmitsKindForOrdinaryWorkspace(t *testing.T) {
|
||||
p, body := startCaptureCP(t)
|
||||
_, err := p.Start(context.Background(), WorkspaceConfig{
|
||||
WorkspaceID: "ws-ordinary",
|
||||
Runtime: "claude-code",
|
||||
PlatformURL: "https://acme.example.com",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Start: %v", err)
|
||||
}
|
||||
if strings.Contains(string(*body), `"kind"`) {
|
||||
t.Errorf("ordinary workspace provision body must omit the kind field (omitempty contract), got: %s", string(*body))
|
||||
}
|
||||
}
|
||||
@@ -253,7 +253,7 @@ func TestStart_SendsTemplateAndGeneratedConfigFiles(t *testing.T) {
|
||||
if err := os.WriteFile(filepath.Join(tmpl, "config.yaml"), []byte("name: template\n"), 0o600); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmpl, "adapter.py"), bytes.Repeat([]byte("x"), cpConfigFilesMaxBytes), 0o600); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(tmpl, "adapter.py"), bytes.Repeat([]byte("x"), cpConfigFilesMaxBytes-100), 0o600); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.Mkdir(filepath.Join(tmpl, "prompts"), 0o700); err != nil {
|
||||
@@ -378,7 +378,7 @@ func TestStart_CollectsConfigFiles(t *testing.T) {
|
||||
}
|
||||
// adapter.py is within the size limit but is NOT config.yaml or prompts/,
|
||||
// so isCPTemplateConfigFile must exclude it from the transport.
|
||||
if err := os.WriteFile(filepath.Join(tmpl, "adapter.py"), bytes.Repeat([]byte("x"), cpConfigFilesMaxBytes), 0o600); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(tmpl, "adapter.py"), bytes.Repeat([]byte("x"), cpConfigFilesMaxBytes-100), 0o600); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/api/types"
|
||||
"github.com/docker/docker/api/types/container"
|
||||
"github.com/docker/docker/api/types/filters"
|
||||
dockerimage "github.com/docker/docker/api/types/image"
|
||||
@@ -198,6 +199,11 @@ const (
|
||||
|
||||
// ConfigVolumeName returns the Docker named volume for a workspace's configs.
|
||||
func ConfigVolumeName(workspaceID string) string {
|
||||
return fmt.Sprintf("ws-%s-configs", workspaceID)
|
||||
}
|
||||
|
||||
// legacyConfigVolumeName returns the pre-KI-013 truncated config volume name.
|
||||
func legacyConfigVolumeName(workspaceID string) string {
|
||||
id := workspaceID
|
||||
if len(id) > 12 {
|
||||
id = id[:12]
|
||||
@@ -210,6 +216,11 @@ func ConfigVolumeName(workspaceID string) string {
|
||||
// config volume so it can be discarded independently (via WORKSPACE_RESET_SESSION
|
||||
// or ?reset=true) without wiping the user's config. Issue #12.
|
||||
func ClaudeSessionVolumeName(workspaceID string) string {
|
||||
return fmt.Sprintf("ws-%s-claude-sessions", workspaceID)
|
||||
}
|
||||
|
||||
// legacyClaudeSessionVolumeName returns the pre-KI-013 truncated session volume name.
|
||||
func legacyClaudeSessionVolumeName(workspaceID string) string {
|
||||
id := workspaceID
|
||||
if len(id) > 12 {
|
||||
id = id[:12]
|
||||
@@ -218,8 +229,33 @@ func ClaudeSessionVolumeName(workspaceID string) string {
|
||||
}
|
||||
|
||||
// Provisioner manages Docker containers for workspace agents.
|
||||
// dockerClient is the subset of client.Client methods used by Provisioner.
|
||||
// Declared as an interface so tests can inject fakes without a real daemon.
|
||||
type dockerClient interface {
|
||||
Close() error
|
||||
ContainerCreate(ctx context.Context, config *container.Config, hostConfig *container.HostConfig, networkingConfig *network.NetworkingConfig, platform *ocispec.Platform, containerName string) (container.CreateResponse, error)
|
||||
ContainerExecAttach(ctx context.Context, execID string, config container.ExecAttachOptions) (types.HijackedResponse, error)
|
||||
ContainerExecCreate(ctx context.Context, container string, config container.ExecOptions) (container.ExecCreateResponse, error)
|
||||
ContainerInspect(ctx context.Context, container string) (container.InspectResponse, error)
|
||||
ContainerList(ctx context.Context, options container.ListOptions) ([]container.Summary, error)
|
||||
ContainerLogs(ctx context.Context, container string, options container.LogsOptions) (io.ReadCloser, error)
|
||||
ContainerRemove(ctx context.Context, container string, options container.RemoveOptions) error
|
||||
ContainerStart(ctx context.Context, container string, options container.StartOptions) error
|
||||
ContainerWait(ctx context.Context, container string, condition container.WaitCondition) (<-chan container.WaitResponse, <-chan error)
|
||||
CopyToContainer(ctx context.Context, container, path string, content io.Reader, options container.CopyToContainerOptions) error
|
||||
ImageInspect(ctx context.Context, image string, opts ...client.ImageInspectOption) (dockerimage.InspectResponse, error)
|
||||
ImagePull(ctx context.Context, ref string, opts dockerimage.PullOptions) (io.ReadCloser, error)
|
||||
VolumeCreate(ctx context.Context, options volume.CreateOptions) (volume.Volume, error)
|
||||
VolumeInspect(ctx context.Context, volumeID string) (volume.Volume, error)
|
||||
VolumeRemove(ctx context.Context, volumeID string, force bool) error
|
||||
}
|
||||
|
||||
// Compile-time assertion that *client.Client satisfies dockerClient.
|
||||
var _ dockerClient = (*client.Client)(nil)
|
||||
|
||||
type Provisioner struct {
|
||||
cli *client.Client
|
||||
cli dockerClient
|
||||
dockerCli *client.Client
|
||||
}
|
||||
|
||||
// New creates a new Provisioner connected to the local Docker daemon.
|
||||
@@ -228,11 +264,17 @@ func New() (*Provisioner, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to connect to Docker: %w", err)
|
||||
}
|
||||
return &Provisioner{cli: cli}, nil
|
||||
return &Provisioner{cli: cli, dockerCli: cli}, nil
|
||||
}
|
||||
|
||||
// ContainerName returns the Docker container name for a workspace.
|
||||
func ContainerName(workspaceID string) string {
|
||||
return fmt.Sprintf("ws-%s", workspaceID)
|
||||
}
|
||||
|
||||
// legacyContainerName returns the pre-KI-013 truncated container name.
|
||||
// Used only for backward-compatible lookups during the deploy transition.
|
||||
func legacyContainerName(workspaceID string) string {
|
||||
id := workspaceID
|
||||
if len(id) > 12 {
|
||||
id = id[:12]
|
||||
@@ -474,7 +516,9 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
|
||||
return "", ErrNoBackend
|
||||
}
|
||||
name := ContainerName(cfg.WorkspaceID)
|
||||
configVolume := ConfigVolumeName(cfg.WorkspaceID)
|
||||
// KI-013 deploy safety: prefer legacy truncated config volume if it
|
||||
// already exists, so pre-deploy workspace data is not orphaned.
|
||||
configVolume := p.resolveConfigVolumeName(ctx, cfg.WorkspaceID)
|
||||
|
||||
// Create named volume for configs (idempotent — no-op if already exists)
|
||||
_, err := p.cli.VolumeCreate(ctx, volume.CreateOptions{
|
||||
@@ -569,7 +613,9 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
|
||||
// remove the existing volume before recreating it, so the agent
|
||||
// boots with a clean session dir.
|
||||
if cfg.Runtime == "claude-code" {
|
||||
claudeSessionsVolume := ClaudeSessionVolumeName(cfg.WorkspaceID)
|
||||
// KI-013 deploy safety: prefer legacy truncated session volume if it
|
||||
// already exists, so pre-deploy session data is not orphaned.
|
||||
claudeSessionsVolume := p.resolveClaudeSessionVolumeName(ctx, cfg.WorkspaceID)
|
||||
resetEnv, _ := strconv.ParseBool(cfg.EnvVars["WORKSPACE_RESET_SESSION"])
|
||||
if cfg.ResetClaudeSession || resetEnv {
|
||||
if rmErr := p.cli.VolumeRemove(ctx, claudeSessionsVolume, true); rmErr != nil {
|
||||
@@ -1288,7 +1334,7 @@ func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, t
|
||||
if p == nil || p.cli == nil {
|
||||
return ErrNoBackend
|
||||
}
|
||||
volName := ConfigVolumeName(workspaceID)
|
||||
volName := p.resolveConfigVolumeName(ctx, workspaceID)
|
||||
resp, err := p.cli.ContainerCreate(ctx, &container.Config{
|
||||
Image: "alpine",
|
||||
Cmd: []string{"sh", "-c", writeAuthTokenVolumeCmd()},
|
||||
@@ -1315,6 +1361,33 @@ func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, t
|
||||
return nil
|
||||
}
|
||||
|
||||
// resolveConfigVolumeName returns the effective config volume name for a
|
||||
// workspace, preferring the legacy truncated name if that volume already
|
||||
// exists (KI-013 deploy safety: pre-deploy volumes must not be orphaned).
|
||||
func (p *Provisioner) resolveConfigVolumeName(ctx context.Context, workspaceID string) string {
|
||||
if p == nil || p.cli == nil {
|
||||
return ConfigVolumeName(workspaceID)
|
||||
}
|
||||
legacy := legacyConfigVolumeName(workspaceID)
|
||||
if _, err := p.cli.VolumeInspect(ctx, legacy); err == nil {
|
||||
return legacy
|
||||
}
|
||||
return ConfigVolumeName(workspaceID)
|
||||
}
|
||||
|
||||
// resolveClaudeSessionVolumeName returns the effective claude-sessions volume
|
||||
// name, preferring the legacy truncated name if that volume already exists.
|
||||
func (p *Provisioner) resolveClaudeSessionVolumeName(ctx context.Context, workspaceID string) string {
|
||||
if p == nil || p.cli == nil {
|
||||
return ClaudeSessionVolumeName(workspaceID)
|
||||
}
|
||||
legacy := legacyClaudeSessionVolumeName(workspaceID)
|
||||
if _, err := p.cli.VolumeInspect(ctx, legacy); err == nil {
|
||||
return legacy
|
||||
}
|
||||
return ClaudeSessionVolumeName(workspaceID)
|
||||
}
|
||||
|
||||
// RemoveVolume removes the config volume for a workspace.
|
||||
// Also removes the claude-sessions volume (best-effort, may not exist
|
||||
// for non claude-code runtimes). Issue #12.
|
||||
@@ -1322,16 +1395,22 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro
|
||||
if p == nil || p.cli == nil {
|
||||
return ErrNoBackend
|
||||
}
|
||||
volName := ConfigVolumeName(workspaceID)
|
||||
if err := p.cli.VolumeRemove(ctx, volName, true); err != nil {
|
||||
return fmt.Errorf("failed to remove volume %s: %w", volName, err)
|
||||
// KI-013 deploy safety: remove both new full-ID name and legacy
|
||||
// truncated name if present, so pre-deploy volumes are not orphaned.
|
||||
removed := false
|
||||
for _, volName := range []string{ConfigVolumeName(workspaceID), legacyConfigVolumeName(workspaceID)} {
|
||||
if err := p.cli.VolumeRemove(ctx, volName, true); err == nil {
|
||||
log.Printf("Provisioner: removed config volume %s", volName)
|
||||
removed = true
|
||||
}
|
||||
}
|
||||
log.Printf("Provisioner: removed config volume %s", volName)
|
||||
csName := ClaudeSessionVolumeName(workspaceID)
|
||||
if rmErr := p.cli.VolumeRemove(ctx, csName, true); rmErr != nil {
|
||||
log.Printf("Provisioner: claude-sessions volume cleanup warning for %s: %v", csName, rmErr)
|
||||
} else {
|
||||
log.Printf("Provisioner: removed claude-sessions volume %s", csName)
|
||||
if !removed {
|
||||
return fmt.Errorf("failed to remove config volume for %s", workspaceID)
|
||||
}
|
||||
for _, csName := range []string{ClaudeSessionVolumeName(workspaceID), legacyClaudeSessionVolumeName(workspaceID)} {
|
||||
if rmErr := p.cli.VolumeRemove(ctx, csName, true); rmErr == nil {
|
||||
log.Printf("Provisioner: removed claude-sessions volume %s", csName)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1354,37 +1433,34 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
|
||||
if p == nil || p.cli == nil {
|
||||
return ErrNoBackend
|
||||
}
|
||||
name := ContainerName(workspaceID)
|
||||
|
||||
// Force-remove kills and removes in one atomic operation, bypassing
|
||||
// the restart policy entirely.
|
||||
err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true})
|
||||
if err == nil {
|
||||
log.Printf("Provisioner: stopped and removed container %s", name)
|
||||
return nil
|
||||
// KI-013 deploy safety: try new full-ID name first, then fall back to
|
||||
// the old truncated name so pre-deploy containers are still stoppable.
|
||||
names := []string{ContainerName(workspaceID), legacyContainerName(workspaceID)}
|
||||
for _, name := range names {
|
||||
// Force-remove kills and removes in one atomic operation, bypassing
|
||||
// the restart policy entirely.
|
||||
err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true})
|
||||
if err == nil {
|
||||
log.Printf("Provisioner: stopped and removed container %s", name)
|
||||
return nil
|
||||
}
|
||||
if isContainerNotFound(err) {
|
||||
// Try the next name (legacy fallback). If both miss, the
|
||||
// container is genuinely gone — post-condition satisfied.
|
||||
continue
|
||||
}
|
||||
if isRemovalInProgress(err) {
|
||||
// Another concurrent caller is already removing this container.
|
||||
log.Printf("Provisioner: container %s removal already in progress (no-op)", name)
|
||||
return nil
|
||||
}
|
||||
// Real failure: daemon timeout, socket EOF, ctx cancellation, etc.
|
||||
log.Printf("Provisioner: force-remove failed for %s: %v", name, err)
|
||||
return fmt.Errorf("force-remove %s: %w", name, err)
|
||||
}
|
||||
if isContainerNotFound(err) {
|
||||
// Container was already gone — the post-condition we want is
|
||||
// satisfied. Don't surface as an error.
|
||||
log.Printf("Provisioner: container %s already gone (no-op)", name)
|
||||
return nil
|
||||
}
|
||||
if isRemovalInProgress(err) {
|
||||
// Another concurrent caller (orphan sweeper, sibling cascade
|
||||
// delete, manual `docker rm -f`) is already removing this
|
||||
// container. The post-condition is the same as success: the
|
||||
// container WILL be gone shortly. Surfacing this as a 500 on
|
||||
// cascade-delete causes UI confusion ("workspace marked
|
||||
// removed, but stop call(s) failed — please retry") even
|
||||
// though retrying would just race the same in-flight removal.
|
||||
log.Printf("Provisioner: container %s removal already in progress (no-op)", name)
|
||||
return nil
|
||||
}
|
||||
// Real failure: daemon timeout, socket EOF, ctx cancellation, etc.
|
||||
// Caller (workspace_crud.stopAndRemove, orphan_sweeper.sweepOnce)
|
||||
// must propagate this so they can skip the follow-up RemoveVolume.
|
||||
log.Printf("Provisioner: force-remove failed for %s: %v", name, err)
|
||||
return fmt.Errorf("force-remove %s: %w", name, err)
|
||||
// Both names missed — container was already gone.
|
||||
log.Printf("Provisioner: container %s already gone (no-op)", ContainerName(workspaceID))
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsRunning checks if a workspace container is currently running.
|
||||
@@ -1440,20 +1516,39 @@ func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool,
|
||||
// transient errors into the same "" return as a genuinely-stopped container.
|
||||
// That hid daemon flakes as misleading 503 "container not running" responses
|
||||
// AND let the two impls drift on edge-case behavior. This is the SSOT.
|
||||
func RunningContainerName(ctx context.Context, cli *client.Client, workspaceID string) (string, error) {
|
||||
// isNilDockerClient reports whether cli is nil or a typed nil pointer
|
||||
// (e.g. (*client.Client)(nil) passed as a dockerClient interface value).
|
||||
// Required because a non-nil interface holding a nil pointer does not == nil.
|
||||
func isNilDockerClient(cli dockerClient) bool {
|
||||
if cli == nil {
|
||||
return true
|
||||
}
|
||||
switch c := cli.(type) {
|
||||
case *client.Client:
|
||||
return c == nil
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func RunningContainerName(ctx context.Context, cli dockerClient, workspaceID string) (string, error) {
|
||||
if isNilDockerClient(cli) {
|
||||
return "", ErrNoBackend
|
||||
}
|
||||
name := ContainerName(workspaceID)
|
||||
info, err := cli.ContainerInspect(ctx, name)
|
||||
if err != nil {
|
||||
if isContainerNotFound(err) {
|
||||
return "", nil
|
||||
// KI-013 deploy safety: new full-ID name first, then fall back to the
|
||||
// old truncated name so pre-deploy containers are still discoverable.
|
||||
names := []string{ContainerName(workspaceID), legacyContainerName(workspaceID)}
|
||||
for _, name := range names {
|
||||
info, err := cli.ContainerInspect(ctx, name)
|
||||
if err != nil {
|
||||
if isContainerNotFound(err) {
|
||||
continue
|
||||
}
|
||||
return "", err
|
||||
}
|
||||
if info.State.Running {
|
||||
return name, nil
|
||||
}
|
||||
return "", err
|
||||
}
|
||||
if info.State.Running {
|
||||
return name, nil
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
@@ -1505,7 +1600,7 @@ func isRemovalInProgress(err error) bool {
|
||||
|
||||
// DockerClient returns the underlying Docker client for sharing with other handlers.
|
||||
func (p *Provisioner) DockerClient() *client.Client {
|
||||
return p.cli
|
||||
return p.dockerCli
|
||||
}
|
||||
|
||||
// Close cleans up the Docker client.
|
||||
|
||||
@@ -0,0 +1,360 @@
|
||||
package provisioner
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"testing"
|
||||
|
||||
"github.com/docker/docker/api/types"
|
||||
"github.com/docker/docker/api/types/container"
|
||||
dockerimage "github.com/docker/docker/api/types/image"
|
||||
"github.com/docker/docker/api/types/network"
|
||||
"github.com/docker/docker/api/types/volume"
|
||||
"github.com/docker/docker/client"
|
||||
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
|
||||
)
|
||||
|
||||
// fakeDockerClient is a test double that implements dockerClient.
|
||||
// It records every call and holds lightweight in-memory state for volumes
|
||||
// and containers so tests can assert behaviour without a real daemon.
|
||||
type fakeDockerClient struct {
|
||||
volumes map[string]volume.Volume
|
||||
containers map[string]container.InspectResponse
|
||||
removeErr map[string]error
|
||||
inspectErr map[string]error
|
||||
calls []string
|
||||
}
|
||||
|
||||
func newFakeDockerClient() *fakeDockerClient {
|
||||
return &fakeDockerClient{
|
||||
volumes: make(map[string]volume.Volume),
|
||||
containers: make(map[string]container.InspectResponse),
|
||||
removeErr: make(map[string]error),
|
||||
inspectErr: make(map[string]error),
|
||||
}
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) record(format string, args ...interface{}) {
|
||||
f.calls = append(f.calls, fmt.Sprintf(format, args...))
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) Close() error { return nil }
|
||||
|
||||
func (f *fakeDockerClient) ContainerCreate(ctx context.Context, config *container.Config, hostConfig *container.HostConfig, networkingConfig *network.NetworkingConfig, platform *ocispec.Platform, containerName string) (container.CreateResponse, error) {
|
||||
f.record("ContainerCreate:%s", containerName)
|
||||
return container.CreateResponse{}, nil
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) ContainerExecAttach(ctx context.Context, execID string, config container.ExecAttachOptions) (types.HijackedResponse, error) {
|
||||
panic("ContainerExecAttach not expected")
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) ContainerExecCreate(ctx context.Context, container string, config container.ExecOptions) (container.ExecCreateResponse, error) {
|
||||
panic("ContainerExecCreate not expected")
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) ContainerInspect(ctx context.Context, name string) (container.InspectResponse, error) {
|
||||
f.record("ContainerInspect:%s", name)
|
||||
if err, ok := f.inspectErr[name]; ok {
|
||||
return container.InspectResponse{}, err
|
||||
}
|
||||
c, ok := f.containers[name]
|
||||
if !ok {
|
||||
return container.InspectResponse{}, fmt.Errorf("No such container: %s", name)
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) ContainerList(ctx context.Context, options container.ListOptions) ([]container.Summary, error) {
|
||||
panic("ContainerList not expected")
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) ContainerLogs(ctx context.Context, container string, options container.LogsOptions) (io.ReadCloser, error) {
|
||||
panic("ContainerLogs not expected")
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) ContainerRemove(ctx context.Context, name string, options container.RemoveOptions) error {
|
||||
f.record("ContainerRemove:%s", name)
|
||||
if err, ok := f.removeErr[name]; ok {
|
||||
return err
|
||||
}
|
||||
if _, ok := f.containers[name]; !ok {
|
||||
return fmt.Errorf("No such container: %s", name)
|
||||
}
|
||||
delete(f.containers, name)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) ContainerStart(ctx context.Context, name string, options container.StartOptions) error {
|
||||
f.record("ContainerStart:%s", name)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) ContainerWait(ctx context.Context, name string, condition container.WaitCondition) (
|
||||
<-chan container.WaitResponse, <-chan error) {
|
||||
f.record("ContainerWait:%s", name)
|
||||
done := make(chan container.WaitResponse, 1)
|
||||
errCh := make(chan error, 1)
|
||||
done <- container.WaitResponse{StatusCode: 0}
|
||||
close(done)
|
||||
close(errCh)
|
||||
return done, errCh
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) CopyToContainer(ctx context.Context, container, path string, content io.Reader, options container.CopyToContainerOptions) error {
|
||||
panic("CopyToContainer not expected")
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) ImageInspect(ctx context.Context, image string, opts ...client.ImageInspectOption) (dockerimage.InspectResponse, error) {
|
||||
panic("ImageInspect not expected")
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) ImagePull(ctx context.Context, ref string, opts dockerimage.PullOptions) (io.ReadCloser, error) {
|
||||
panic("ImagePull not expected")
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) VolumeCreate(ctx context.Context, options volume.CreateOptions) (volume.Volume, error) {
|
||||
f.record("VolumeCreate:%s", options.Name)
|
||||
f.volumes[options.Name] = volume.Volume{Name: options.Name}
|
||||
return f.volumes[options.Name], nil
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) VolumeInspect(ctx context.Context, name string) (volume.Volume, error) {
|
||||
f.record("VolumeInspect:%s", name)
|
||||
v, ok := f.volumes[name]
|
||||
if !ok {
|
||||
return volume.Volume{}, fmt.Errorf("No such volume: %s", name)
|
||||
}
|
||||
return v, nil
|
||||
}
|
||||
|
||||
func (f *fakeDockerClient) VolumeRemove(ctx context.Context, name string, force bool) error {
|
||||
f.record("VolumeRemove:%s", name)
|
||||
if _, ok := f.volumes[name]; !ok {
|
||||
return fmt.Errorf("No such volume: %s", name)
|
||||
}
|
||||
delete(f.volumes, name)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Helper: return true if a call matching prefix was recorded.
|
||||
func (f *fakeDockerClient) called(prefix string) bool {
|
||||
for _, c := range f.calls {
|
||||
if c == prefix {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// (a) resolveConfigVolumeName prefers existing legacy volume.
|
||||
func TestResolveConfigVolumeName_LegacyExists(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
f := newFakeDockerClient()
|
||||
f.volumes[legacyConfigVolumeName("ws-abc123")] = volume.Volume{Name: legacyConfigVolumeName("ws-abc123")}
|
||||
|
||||
p := &Provisioner{cli: f}
|
||||
got := p.resolveConfigVolumeName(ctx, "ws-abc123")
|
||||
want := legacyConfigVolumeName("ws-abc123")
|
||||
if got != want {
|
||||
t.Errorf("resolveConfigVolumeName legacy exists: got %q, want %q", got, want)
|
||||
}
|
||||
if !f.called("VolumeInspect:" + legacyConfigVolumeName("ws-abc123")) {
|
||||
t.Errorf("expected VolumeInspect on legacy volume")
|
||||
}
|
||||
}
|
||||
|
||||
// (a) resolveConfigVolumeName returns full-UUID when legacy is absent.
|
||||
func TestResolveConfigVolumeName_LegacyAbsent(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
f := newFakeDockerClient()
|
||||
|
||||
p := &Provisioner{cli: f}
|
||||
got := p.resolveConfigVolumeName(ctx, "ws-abc123")
|
||||
want := ConfigVolumeName("ws-abc123")
|
||||
if got != want {
|
||||
t.Errorf("resolveConfigVolumeName legacy absent: got %q, want %q", got, want)
|
||||
}
|
||||
if !f.called("VolumeInspect:" + legacyConfigVolumeName("ws-abc123")) {
|
||||
t.Errorf("expected VolumeInspect on legacy volume")
|
||||
}
|
||||
}
|
||||
|
||||
// (a) resolveClaudeSessionVolumeName prefers existing legacy volume.
|
||||
func TestResolveClaudeSessionVolumeName_LegacyExists(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
f := newFakeDockerClient()
|
||||
f.volumes[legacyClaudeSessionVolumeName("ws-abc123")] = volume.Volume{Name: legacyClaudeSessionVolumeName("ws-abc123")}
|
||||
|
||||
p := &Provisioner{cli: f}
|
||||
got := p.resolveClaudeSessionVolumeName(ctx, "ws-abc123")
|
||||
want := legacyClaudeSessionVolumeName("ws-abc123")
|
||||
if got != want {
|
||||
t.Errorf("resolveClaudeSessionVolumeName legacy exists: got %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// (a) resolveClaudeSessionVolumeName returns full-UUID when legacy is absent.
|
||||
func TestResolveClaudeSessionVolumeName_LegacyAbsent(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
f := newFakeDockerClient()
|
||||
|
||||
p := &Provisioner{cli: f}
|
||||
got := p.resolveClaudeSessionVolumeName(ctx, "ws-abc123")
|
||||
want := ClaudeSessionVolumeName("ws-abc123")
|
||||
if got != want {
|
||||
t.Errorf("resolveClaudeSessionVolumeName legacy absent: got %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// (c) Stop falls back to legacy container when full-ID is absent.
|
||||
func TestStop_FallbackToLegacy(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
f := newFakeDockerClient()
|
||||
f.containers[legacyContainerName("ws-abc123")] = container.InspectResponse{}
|
||||
|
||||
p := &Provisioner{cli: f}
|
||||
if err := p.Stop(ctx, "ws-abc123"); err != nil {
|
||||
t.Fatalf("Stop fallback: unexpected error: %v", err)
|
||||
}
|
||||
if !f.called("ContainerRemove:" + ContainerName("ws-abc123")) {
|
||||
t.Errorf("expected ContainerRemove on full-ID name first")
|
||||
}
|
||||
if !f.called("ContainerRemove:" + legacyContainerName("ws-abc123")) {
|
||||
t.Errorf("expected ContainerRemove on legacy name as fallback")
|
||||
}
|
||||
}
|
||||
|
||||
// (c) Stop returns nil when new name removal is already in progress.
|
||||
func TestStop_RemovalInProgress(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
f := newFakeDockerClient()
|
||||
f.containers[ContainerName("abcdefghijklmnopqrstuvwxyz")] = container.InspectResponse{}
|
||||
f.removeErr[ContainerName("abcdefghijklmnopqrstuvwxyz")] = fmt.Errorf("removal of container %s is already in progress", ContainerName("abcdefghijklmnopqrstuvwxyz"))
|
||||
|
||||
p := &Provisioner{cli: f}
|
||||
if err := p.Stop(ctx, "abcdefghijklmnopqrstuvwxyz"); err != nil {
|
||||
t.Fatalf("Stop removal-in-progress: unexpected error: %v", err)
|
||||
}
|
||||
if f.called("ContainerRemove:" + legacyContainerName("abcdefghijklmnopqrstuvwxyz")) {
|
||||
t.Errorf("did not expect fallback ContainerRemove when removal is in progress")
|
||||
}
|
||||
}
|
||||
|
||||
// (c) Stop surfaces real daemon errors (not not-found, not in-progress).
|
||||
func TestStop_RealError(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
f := newFakeDockerClient()
|
||||
f.containers[ContainerName("ws-abc123")] = container.InspectResponse{}
|
||||
f.removeErr[ContainerName("ws-abc123")] = fmt.Errorf("daemon timeout")
|
||||
|
||||
p := &Provisioner{cli: f}
|
||||
err := p.Stop(ctx, "ws-abc123")
|
||||
if err == nil {
|
||||
t.Fatalf("Stop real error: expected error, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
// (c) RunningContainerName falls back to legacy when full-ID is absent.
|
||||
func TestRunningContainerName_FallbackToLegacy(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
f := newFakeDockerClient()
|
||||
f.containers[legacyContainerName("ws-abc123")] = container.InspectResponse{
|
||||
ContainerJSONBase: &container.ContainerJSONBase{State: &container.State{Running: true}},
|
||||
}
|
||||
|
||||
got, err := RunningContainerName(ctx, f, "ws-abc123")
|
||||
if err != nil {
|
||||
t.Fatalf("RunningContainerName fallback: unexpected error: %v", err)
|
||||
}
|
||||
want := legacyContainerName("ws-abc123")
|
||||
if got != want {
|
||||
t.Errorf("RunningContainerName fallback: got %q, want %q", got, want)
|
||||
}
|
||||
if !f.called("ContainerInspect:" + ContainerName("ws-abc123")) {
|
||||
t.Errorf("expected ContainerInspect on full-ID name")
|
||||
}
|
||||
if !f.called("ContainerInspect:" + legacyContainerName("ws-abc123")) {
|
||||
t.Errorf("expected ContainerInspect on legacy name")
|
||||
}
|
||||
}
|
||||
|
||||
// (c) RunningContainerName returns full-ID when it is running.
|
||||
func TestRunningContainerName_FullIDRunning(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
f := newFakeDockerClient()
|
||||
f.containers[ContainerName("abcdefghijklmnopqrstuvwxyz")] = container.InspectResponse{
|
||||
ContainerJSONBase: &container.ContainerJSONBase{State: &container.State{Running: true}},
|
||||
}
|
||||
|
||||
got, err := RunningContainerName(ctx, f, "abcdefghijklmnopqrstuvwxyz")
|
||||
if err != nil {
|
||||
t.Fatalf("RunningContainerName full-id: unexpected error: %v", err)
|
||||
}
|
||||
want := ContainerName("abcdefghijklmnopqrstuvwxyz")
|
||||
if got != want {
|
||||
t.Errorf("RunningContainerName full-id: got %q, want %q", got, want)
|
||||
}
|
||||
if f.called("ContainerInspect:" + legacyContainerName("abcdefghijklmnopqrstuvwxyz")) {
|
||||
t.Errorf("did not expect legacy inspect when full-id is running")
|
||||
}
|
||||
}
|
||||
|
||||
// (c) RunningContainerName surfaces transient errors (not not-found).
|
||||
func TestRunningContainerName_TransientError(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
f := newFakeDockerClient()
|
||||
f.inspectErr[ContainerName("ws-abc123")] = fmt.Errorf("daemon socket EOF")
|
||||
|
||||
_, err := RunningContainerName(ctx, f, "ws-abc123")
|
||||
if err == nil {
|
||||
t.Fatalf("RunningContainerName transient error: expected error, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
// (d) RemoveVolume removes only the target workspace's volumes.
|
||||
func TestRemoveVolume_WorkspaceScoped(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
f := newFakeDockerClient()
|
||||
wsA := "ws-abc123"
|
||||
wsB := "ws-def456"
|
||||
f.volumes[ConfigVolumeName(wsA)] = volume.Volume{Name: ConfigVolumeName(wsA)}
|
||||
f.volumes[legacyConfigVolumeName(wsA)] = volume.Volume{Name: legacyConfigVolumeName(wsA)}
|
||||
f.volumes[ConfigVolumeName(wsB)] = volume.Volume{Name: ConfigVolumeName(wsB)}
|
||||
f.volumes[legacyConfigVolumeName(wsB)] = volume.Volume{Name: legacyConfigVolumeName(wsB)}
|
||||
f.volumes[ClaudeSessionVolumeName(wsA)] = volume.Volume{Name: ClaudeSessionVolumeName(wsA)}
|
||||
f.volumes[legacyClaudeSessionVolumeName(wsA)] = volume.Volume{Name: legacyClaudeSessionVolumeName(wsA)}
|
||||
f.volumes[ClaudeSessionVolumeName(wsB)] = volume.Volume{Name: ClaudeSessionVolumeName(wsB)}
|
||||
f.volumes[legacyClaudeSessionVolumeName(wsB)] = volume.Volume{Name: legacyClaudeSessionVolumeName(wsB)}
|
||||
|
||||
p := &Provisioner{cli: f}
|
||||
if err := p.RemoveVolume(ctx, wsA); err != nil {
|
||||
t.Fatalf("RemoveVolume scoped: unexpected error: %v", err)
|
||||
}
|
||||
|
||||
// wsA volumes must be gone.
|
||||
for _, v := range []string{ConfigVolumeName(wsA), legacyConfigVolumeName(wsA), ClaudeSessionVolumeName(wsA), legacyClaudeSessionVolumeName(wsA)} {
|
||||
if _, ok := f.volumes[v]; ok {
|
||||
t.Errorf("RemoveVolume scoped: expected %s to be removed", v)
|
||||
}
|
||||
}
|
||||
// wsB volumes must remain.
|
||||
for _, v := range []string{ConfigVolumeName(wsB), legacyConfigVolumeName(wsB), ClaudeSessionVolumeName(wsB), legacyClaudeSessionVolumeName(wsB)} {
|
||||
if _, ok := f.volumes[v]; !ok {
|
||||
t.Errorf("RemoveVolume scoped: expected %s to remain", v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// (d) RemoveVolume returns error when neither new nor legacy config volume exists.
|
||||
func TestRemoveVolume_BothMissing(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
f := newFakeDockerClient()
|
||||
|
||||
p := &Provisioner{cli: f}
|
||||
err := p.RemoveVolume(ctx, "abcdefghijklmnopqrstuvwxyz")
|
||||
if err == nil {
|
||||
t.Fatalf("RemoveVolume both missing: expected error, got nil")
|
||||
}
|
||||
}
|
||||
@@ -425,7 +425,7 @@ func TestContainerName(t *testing.T) {
|
||||
}{
|
||||
{"short", "ws-short"},
|
||||
{"exactly12ch", "ws-exactly12ch"},
|
||||
{"longer-than-twelve-characters", "ws-longer-than-"},
|
||||
{"longer-than-twelve-characters", "ws-longer-than-twelve-characters"},
|
||||
{"abc", "ws-abc"},
|
||||
}
|
||||
|
||||
@@ -437,6 +437,17 @@ func TestContainerName(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestContainerName_DistinctSamePrefix12 is a regression guard for KI-013:
|
||||
// two UUIDs sharing the same first 12 characters must produce distinct
|
||||
// container names (the old 12-char truncation caused collisions).
|
||||
func TestContainerName_DistinctSamePrefix12(t *testing.T) {
|
||||
id1 := "123456789abc-4def-1234-567890abcdef"
|
||||
id2 := "123456789abc-4def-1234-567890abcdf0"
|
||||
if ContainerName(id1) == ContainerName(id2) {
|
||||
t.Fatalf("ContainerName must differ for same-first-12 UUIDs: both = %q", ContainerName(id1))
|
||||
}
|
||||
}
|
||||
|
||||
// TestConfigVolumeName verifies config volume naming.
|
||||
func TestConfigVolumeName(t *testing.T) {
|
||||
tests := []struct {
|
||||
@@ -445,7 +456,7 @@ func TestConfigVolumeName(t *testing.T) {
|
||||
}{
|
||||
{"short", "ws-short-configs"},
|
||||
{"exactly12ch", "ws-exactly12ch-configs"},
|
||||
{"longer-than-twelve-characters", "ws-longer-than--configs"},
|
||||
{"longer-than-twelve-characters", "ws-longer-than-twelve-characters-configs"},
|
||||
{"abc", "ws-abc-configs"},
|
||||
}
|
||||
|
||||
@@ -457,10 +468,19 @@ func TestConfigVolumeName(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestConfigVolumeName_DistinctSamePrefix12 is a regression guard for KI-013.
|
||||
func TestConfigVolumeName_DistinctSamePrefix12(t *testing.T) {
|
||||
id1 := "123456789abc-4def-1234-567890abcdef"
|
||||
id2 := "123456789abc-4def-1234-567890abcdf0"
|
||||
if ConfigVolumeName(id1) == ConfigVolumeName(id2) {
|
||||
t.Fatalf("ConfigVolumeName must differ for same-first-12 UUIDs: both = %q", ConfigVolumeName(id1))
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- #12 — claude-sessions volume naming ----------
|
||||
|
||||
// TestClaudeSessionVolumeName_Deterministic: same ID → same volume name, and
|
||||
// the name follows the ws-<id[:12]>-claude-sessions shape used everywhere
|
||||
// the name follows the ws-<id>-claude-sessions shape used everywhere
|
||||
// else in the provisioner.
|
||||
func TestClaudeSessionVolumeName_Deterministic(t *testing.T) {
|
||||
tests := []struct {
|
||||
@@ -469,7 +489,7 @@ func TestClaudeSessionVolumeName_Deterministic(t *testing.T) {
|
||||
}{
|
||||
{"short", "ws-short-claude-sessions"},
|
||||
{"exactly12ch", "ws-exactly12ch-claude-sessions"},
|
||||
{"longer-than-twelve-characters", "ws-longer-than--claude-sessions"},
|
||||
{"longer-than-twelve-characters", "ws-longer-than-twelve-characters-claude-sessions"},
|
||||
{"abc", "ws-abc-claude-sessions"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
@@ -484,6 +504,15 @@ func TestClaudeSessionVolumeName_Deterministic(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestClaudeSessionVolumeName_DistinctSamePrefix12 is a regression guard for KI-013.
|
||||
func TestClaudeSessionVolumeName_DistinctSamePrefix12(t *testing.T) {
|
||||
id1 := "123456789abc-4def-1234-567890abcdef"
|
||||
id2 := "123456789abc-4def-1234-567890abcdf0"
|
||||
if ClaudeSessionVolumeName(id1) == ClaudeSessionVolumeName(id2) {
|
||||
t.Fatalf("ClaudeSessionVolumeName must differ for same-first-12 UUIDs: both = %q", ClaudeSessionVolumeName(id1))
|
||||
}
|
||||
}
|
||||
|
||||
// TestClaudeSessionVolumeName_DistinctFromConfig ensures we never alias the
|
||||
// claude-sessions volume onto the config volume (deleting one must not wipe
|
||||
// the other in RemoveVolume's cleanup path).
|
||||
|
||||
@@ -234,6 +234,13 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
|
||||
// this specific workspace, or a control-plane-verified tenant session.
|
||||
wsAuth.PATCH("", wh.Update)
|
||||
|
||||
// Compute options — SSOT for the canvas Container-Config tab's cloud-
|
||||
// provider + instance-type dropdowns (core#2489). Returns the same
|
||||
// provider/instance metadata validateWorkspaceCompute enforces, so the UI
|
||||
// can never offer a (provider, instance-type) the PATCH then rejects with
|
||||
// a 400. Static (derived from the in-binary allowlist) — no DB round-trip.
|
||||
wsAuth.GET("/compute-options", wh.ComputeOptions)
|
||||
|
||||
// Lifecycle
|
||||
wsAuth.GET("/state", wh.State)
|
||||
wsAuth.POST("/restart", wh.Restart)
|
||||
|
||||
@@ -1163,3 +1163,109 @@ func TestSanitizeUTF8(t *testing.T) {
|
||||
t.Errorf("sanitizeUTF8 did not produce valid UTF-8: %x", []byte(out))
|
||||
}
|
||||
}
|
||||
|
||||
// ── TestClassifyTaskState ───────────────────────────────────────────────────
|
||||
|
||||
func TestClassifyTaskState_NoStatus(t *testing.T) {
|
||||
result := map[string]json.RawMessage{"other": json.RawMessage(`"x"`)}
|
||||
if got := classifyTaskState(result); got != "" {
|
||||
t.Errorf("classifyTaskState(no status) = %q, want empty", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyTaskState_OKStates(t *testing.T) {
|
||||
for _, state := range []string{"", "submitted", "working", "completed"} {
|
||||
result := map[string]json.RawMessage{
|
||||
"status": json.RawMessage(`{"state":"` + state + `"}`),
|
||||
}
|
||||
if got := classifyTaskState(result); got != "" {
|
||||
t.Errorf("classifyTaskState(%q) = %q, want empty (OK state)", state, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyTaskState_FailureState(t *testing.T) {
|
||||
result := map[string]json.RawMessage{
|
||||
"status": json.RawMessage(`{"state":"failed"}`),
|
||||
}
|
||||
if got := classifyTaskState(result); got != "failed" {
|
||||
t.Errorf("classifyTaskState(failed) = %q, want failed", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyTaskState_MalformedStatus(t *testing.T) {
|
||||
result := map[string]json.RawMessage{
|
||||
"status": json.RawMessage(`{broken`),
|
||||
}
|
||||
if got := classifyTaskState(result); got != "" {
|
||||
t.Errorf("classifyTaskState(malformed) = %q, want empty", got)
|
||||
}
|
||||
}
|
||||
|
||||
// ── TestIsEmptyResponse ─────────────────────────────────────────────────────
|
||||
|
||||
func TestIsEmptyResponse_EmptyBody(t *testing.T) {
|
||||
if !isEmptyResponse([]byte{}) {
|
||||
t.Error("isEmptyResponse(empty) should be true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsEmptyResponse_NoResponseGenerated(t *testing.T) {
|
||||
if !isEmptyResponse([]byte(`(no response generated)`)) {
|
||||
t.Error("isEmptyResponse(no-response-generated) should be true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsEmptyResponse_TextFieldEmpty(t *testing.T) {
|
||||
if !isEmptyResponse([]byte(`{"result":{"parts":[{"text":""}]}}`)) {
|
||||
t.Error("isEmptyResponse(empty text field) should be true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsEmptyResponse_TextFieldNoResponse(t *testing.T) {
|
||||
if !isEmptyResponse([]byte(`{"result":{"parts":[{"text":"(no response generated)"}]}}`)) {
|
||||
t.Error("isEmptyResponse(text=no-response-generated) should be true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsEmptyResponse_HasContent(t *testing.T) {
|
||||
if isEmptyResponse([]byte(`{"result":{"parts":[{"text":"hello"}]}}`)) {
|
||||
t.Error("isEmptyResponse(with content) should be false")
|
||||
}
|
||||
}
|
||||
|
||||
// ── TestA2AErrorFromBody ────────────────────────────────────────────────────
|
||||
|
||||
func TestA2AErrorFromBody_Empty(t *testing.T) {
|
||||
if got := a2aErrorFromBody([]byte{}); got != "" {
|
||||
t.Errorf("a2aErrorFromBody(empty) = %q, want empty", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestA2AErrorFromBody_JSONRPCMessage(t *testing.T) {
|
||||
body := []byte(`{"error":{"code":-32603,"message":"internal error"}}`)
|
||||
if got := a2aErrorFromBody(body); got != "internal error" {
|
||||
t.Errorf("a2aErrorFromBody(JSON-RPC) = %q, want internal error", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestA2AErrorFromBody_PlainString(t *testing.T) {
|
||||
body := []byte(`{"error":"something went wrong"}`)
|
||||
if got := a2aErrorFromBody(body); got != "something went wrong" {
|
||||
t.Errorf("a2aErrorFromBody(plain) = %q, want something went wrong", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestA2AErrorFromBody_NoError(t *testing.T) {
|
||||
body := []byte(`{"result":"ok"}`)
|
||||
if got := a2aErrorFromBody(body); got != "" {
|
||||
t.Errorf("a2aErrorFromBody(no error) = %q, want empty", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestA2AErrorFromBody_InvalidJSON(t *testing.T) {
|
||||
body := []byte(`{broken`)
|
||||
if got := a2aErrorFromBody(body); got != "" {
|
||||
t.Errorf("a2aErrorFromBody(invalid) = %q, want empty", got)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user