Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 130f48ed69 | |||
| b0cac02702 |
@@ -66,14 +66,6 @@ def build_plan(env: dict[str, str]) -> dict:
|
||||
"target_tag": target_tag,
|
||||
"soak_seconds": _int_env(env, "PROD_AUTO_DEPLOY_SOAK_SECONDS", 60, minimum=0),
|
||||
"batch_size": _int_env(env, "PROD_AUTO_DEPLOY_BATCH_SIZE", 3),
|
||||
# Tolerate a small minority of individually-stuck tenants (e.g. a wedged
|
||||
# data volume that won't recreate). They are QUARANTINED — shipped past
|
||||
# so the healthy majority still lands the build — and reported for
|
||||
# separate recovery, instead of one stuck tenant blocking the whole
|
||||
# fleet deploy. The canary still must pass, the CP halts a batch the
|
||||
# moment failures exceed this, and the cross-batch coverage gate below
|
||||
# enforces the same tolerance globally. Default 1.
|
||||
"max_stragglers": _int_env(env, "PROD_AUTO_DEPLOY_MAX_STRAGGLERS", 1, minimum=0),
|
||||
"dry_run": truthy_flag(env.get("PROD_AUTO_DEPLOY_DRY_RUN", "")),
|
||||
# confirm:true ack required by CP /cp/admin/tenants/redeploy-fleet
|
||||
# contract (cp#228 / task #308) for fleet-wide intent. Empty body
|
||||
@@ -259,41 +251,26 @@ def rollout_stragglers(enumerated: list[str], results: list[dict]) -> list[str]:
|
||||
return sorted(s for s in dict.fromkeys(enumerated) if s not in verified)
|
||||
|
||||
|
||||
def assert_full_coverage(
|
||||
enumerated: list[str], aggregate: dict, dry_run: bool, max_stragglers: int = 0
|
||||
) -> None:
|
||||
"""Gate the rollout on coverage, tolerating a quarantined straggler minority.
|
||||
def assert_full_coverage(enumerated: list[str], aggregate: dict, dry_run: bool) -> None:
|
||||
"""Fail the rollout if any enumerated tenant is not on the target build.
|
||||
|
||||
This is the no-silent-skip gate (internal#724) made resilient: every
|
||||
enumerated tenant must be PROVEN on the target build, EXCEPT up to
|
||||
``max_stragglers`` individually-stuck tenants which are quarantined (shipped
|
||||
past) and reported for separate recovery instead of blocking the whole
|
||||
fleet deploy. Exceeding the tolerance is a systemic failure → RolloutFailed.
|
||||
A dry run proves nothing landed, so coverage is not asserted for it.
|
||||
This is the no-silent-skip gate (internal#724). A dry run proves
|
||||
nothing landed, so coverage is not asserted for it.
|
||||
"""
|
||||
|
||||
if dry_run:
|
||||
return
|
||||
stragglers = rollout_stragglers(enumerated, aggregate.get("results") or [])
|
||||
if not stragglers:
|
||||
return
|
||||
# Surface the stragglers (for the step summary + recovery), gate or not.
|
||||
aggregate["stragglers"] = stragglers
|
||||
if len(stragglers) > max_stragglers:
|
||||
if stragglers:
|
||||
msg = (
|
||||
f"incomplete rollout: {len(stragglers)} tenant(s) not verified on target "
|
||||
f"after redeploy-fleet (max tolerated {max_stragglers}): {', '.join(stragglers)} "
|
||||
f"after redeploy-fleet: {', '.join(stragglers)} "
|
||||
f"(enumerated {len(set(enumerated))})"
|
||||
)
|
||||
aggregate["ok"] = False
|
||||
aggregate["error"] = msg
|
||||
aggregate["stragglers"] = stragglers
|
||||
raise RolloutFailed(msg, aggregate)
|
||||
# Within tolerance: shipped to the healthy majority; quarantine is loud,
|
||||
# not fatal. The deploy succeeds; the stragglers need individual recovery.
|
||||
print(
|
||||
f"::warning::quarantined {len(stragglers)} straggler(s) (<= max {max_stragglers}); "
|
||||
f"shipped to the rest of the fleet — these need recovery: {', '.join(stragglers)}"
|
||||
)
|
||||
|
||||
|
||||
def execute_scoped_rollout(
|
||||
@@ -348,8 +325,7 @@ def execute_scoped_rollout(
|
||||
# or one enumerated but never batched, is a straggler. Surfacing it as
|
||||
# a RolloutFailed makes the deploy step exit non-zero instead of
|
||||
# silently reporting success (the exact agents-team failure mode).
|
||||
max_stragglers = int(base_body.get("max_stragglers") or 0)
|
||||
assert_full_coverage(all_slugs, aggregate, dry_run, max_stragglers)
|
||||
assert_full_coverage(all_slugs, aggregate, dry_run)
|
||||
|
||||
return aggregate
|
||||
|
||||
|
||||
@@ -351,8 +351,7 @@ def compute_ack_state(
|
||||
latest_directive[(user, slug)] = kind
|
||||
|
||||
# Step 2: build candidate ackers per slug.
|
||||
# Filter out self-acks and unknown slugs. Author self-ack is forbidden
|
||||
# per .gitea/sop-checklist-config.yaml — a non-author peer must ack.
|
||||
# Filter out self-acks and unknown slugs.
|
||||
ackers_per_slug: dict[str, list[str]] = {s: [] for s in items_by_slug}
|
||||
rejected_self: dict[str, list[str]] = {s: [] for s in items_by_slug}
|
||||
pending_team_check: dict[str, list[str]] = {s: [] for s in items_by_slug}
|
||||
|
||||
@@ -35,9 +35,6 @@ def test_build_plan_defaults_to_staging_sha_target_and_prod_cp():
|
||||
"canary_slug": "hongming",
|
||||
"soak_seconds": 60,
|
||||
"batch_size": 3,
|
||||
# quarantine up to 1 individually-stuck tenant rather than blocking the
|
||||
# whole fleet deploy (default).
|
||||
"max_stragglers": 1,
|
||||
"dry_run": False,
|
||||
# cp#228 / task #308: fleet-wide intent must carry confirm:true.
|
||||
"confirm": True,
|
||||
@@ -473,72 +470,6 @@ def test_scoped_rollout_passes_when_all_tenants_verified_on_target():
|
||||
assert "stragglers" not in aggregate
|
||||
|
||||
|
||||
def test_scoped_rollout_quarantines_straggler_within_tolerance():
|
||||
# reno-stars never verifies on target; max_stragglers=1 tolerates it — the
|
||||
# rollout still succeeds (ships to the healthy majority) and reports the
|
||||
# quarantined straggler instead of failing the whole deploy.
|
||||
def fake_redeploy(_cp_url, _token, body):
|
||||
return 200, {
|
||||
"ok": True,
|
||||
"results": [
|
||||
{"slug": s, "verified_on_target": (s != "reno-stars")}
|
||||
for s in body["only_slugs"]
|
||||
],
|
||||
}
|
||||
|
||||
aggregate = prod.execute_scoped_rollout(
|
||||
{
|
||||
"cp_url": "https://api.moleculesai.app",
|
||||
"body": {
|
||||
"target_tag": "staging-new",
|
||||
"batch_size": 5,
|
||||
"dry_run": False,
|
||||
"confirm": True,
|
||||
"max_stragglers": 1,
|
||||
},
|
||||
},
|
||||
token="secret",
|
||||
list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
|
||||
redeploy=fake_redeploy,
|
||||
sleep=lambda _s: None,
|
||||
)
|
||||
assert aggregate["ok"] is True
|
||||
assert aggregate["stragglers"] == ["reno-stars"]
|
||||
|
||||
|
||||
def test_scoped_rollout_fails_when_stragglers_exceed_tolerance():
|
||||
# Two tenants never verify; with max_stragglers=1 that is systemic → fail.
|
||||
def fake_redeploy(_cp_url, _token, body):
|
||||
return 200, {
|
||||
"ok": True,
|
||||
"results": [
|
||||
{"slug": s, "verified_on_target": (s == "hongming")}
|
||||
for s in body["only_slugs"]
|
||||
],
|
||||
}
|
||||
|
||||
try:
|
||||
prod.execute_scoped_rollout(
|
||||
{
|
||||
"cp_url": "https://api.moleculesai.app",
|
||||
"body": {
|
||||
"target_tag": "staging-new",
|
||||
"batch_size": 5,
|
||||
"dry_run": False,
|
||||
"confirm": True,
|
||||
"max_stragglers": 1,
|
||||
},
|
||||
},
|
||||
token="secret",
|
||||
list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
|
||||
redeploy=fake_redeploy,
|
||||
sleep=lambda _s: None,
|
||||
)
|
||||
raise AssertionError("expected RolloutFailed when stragglers exceed tolerance")
|
||||
except prod.RolloutFailed as exc:
|
||||
assert "max tolerated 1" in str(exc)
|
||||
|
||||
|
||||
def test_scoped_rollout_dry_run_does_not_assert_coverage():
|
||||
# A dry run proves nothing landed; coverage must NOT be asserted or
|
||||
# every plan would fail.
|
||||
|
||||
@@ -291,8 +291,7 @@ class TestComputeAckState(unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(state["comprehensive-testing"]["ackers"], ["bob"])
|
||||
|
||||
def test_self_ack_rejected_when_author_in_team(self):
|
||||
# Author self-acks are forbidden — a non-author peer must ack.
|
||||
def test_self_ack_rejected(self):
|
||||
comments = [_comment("alice", "/sop-ack comprehensive-testing")]
|
||||
state = sop.compute_ack_state(
|
||||
comments, "alice", self.items, self.aliases, self._approve_all
|
||||
@@ -723,16 +722,16 @@ class TestRootCauseAckEligibilityWidened(unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(state["root-cause"]["ackers"], ["hongming"])
|
||||
|
||||
def test_self_ack_rejected_with_widened_eligibility(self):
|
||||
# Author self-acks are forbidden even when the author is in the
|
||||
# required team — a non-author peer must ack.
|
||||
def test_self_ack_still_forbidden_even_with_widened_eligibility(self):
|
||||
# Author cannot self-ack — widening teams must NOT weaken
|
||||
# the non-author rule.
|
||||
comments = [_comment("alice", "/sop-ack root-cause")]
|
||||
probe = self._approve_only({"alice"})
|
||||
state = sop.compute_ack_state(
|
||||
comments, "alice", self.items, self.aliases, probe, high_risk=False
|
||||
)
|
||||
self.assertEqual(state["root-cause"]["ackers"], [])
|
||||
self.assertEqual(state["root-cause"]["rejected"]["self_ack"], ["alice"])
|
||||
self.assertIn("alice", state["root-cause"]["rejected"]["self_ack"])
|
||||
|
||||
|
||||
class TestHighRiskClassUsesElevatedListInConfig(unittest.TestCase):
|
||||
|
||||
@@ -78,12 +78,6 @@ jobs:
|
||||
# even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
|
||||
MOLECULE_ENV: development
|
||||
SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
|
||||
# act_runner runs the job inside a Docker container, so /.dockerenv exists
|
||||
# and the platform auto-detects platformInDocker=true. But the job container
|
||||
# is NOT on molecule-core-net, so it cannot resolve workspace container
|
||||
# hostnames (ws-<id>:8000). Force false so the proxy keeps using the
|
||||
# host-mapped 127.0.0.1:<ephemeral_port> URL, which IS reachable.
|
||||
MOLECULE_IN_DOCKER: false
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
||||
@@ -138,29 +132,7 @@ jobs:
|
||||
# jobs or stale processes from prior cancelled runs (see #2450).
|
||||
PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
|
||||
echo "PORT=${PORT}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://127.0.0.1:${PORT}" >> "$GITHUB_ENV"
|
||||
# Discover an IP that Docker containers can use to reach the host platform.
|
||||
# host.docker.internal is not reliably available on Linux (act_runner), so
|
||||
# workspace containers cannot resolve it and fail to register/heartbeat.
|
||||
# Workspace containers join molecule-core-net; the host is reachable via that
|
||||
# network's gateway. Ensure the network exists first (the provisioner creates
|
||||
# it lazily, but we need the gateway BEFORE starting the platform).
|
||||
docker network inspect molecule-core-net >/dev/null 2>&1 || docker network create molecule-core-net >/dev/null
|
||||
# Parse Gateway from raw JSON because --format '{{.IPAM.Config}}' is
|
||||
# inconsistent across Docker versions (sometimes omits Gateway field).
|
||||
PLATFORM_HOST_IP=$(docker network inspect molecule-core-net 2>/dev/null | sed -n 's/.*"Gateway": "\([^"]*\)".*/\1/p' | head -1)
|
||||
if [ -z "$PLATFORM_HOST_IP" ]; then
|
||||
PLATFORM_HOST_IP=$(docker network inspect bridge 2>/dev/null | sed -n 's/.*"Gateway": "\([^"]*\)".*/\1/p' | head -1)
|
||||
fi
|
||||
if [ -z "$PLATFORM_HOST_IP" ]; then
|
||||
PLATFORM_HOST_IP=$(ip route | awk '/default/ {print $3}' | head -1 || true)
|
||||
fi
|
||||
if [ -z "$PLATFORM_HOST_IP" ]; then
|
||||
echo "::error::Could not determine PLATFORM_HOST_IP for Docker containers to reach the platform"
|
||||
exit 1
|
||||
fi
|
||||
echo "PLATFORM_HOST_IP=${PLATFORM_HOST_IP}"
|
||||
echo "PLATFORM_URL=http://${PLATFORM_HOST_IP}:${PORT}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
|
||||
# Deterministic admin token: the script sends MOLECULE_ADMIN_TOKEN as the
|
||||
# bearer; the platform checks ADMIN_TOKEN. Set both to the same value.
|
||||
T="lpe2e-admin-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
@@ -201,10 +173,8 @@ jobs:
|
||||
run: |
|
||||
# Bind to the dynamically allocated port (see #2450).
|
||||
# DATABASE_URL/REDIS_URL/ADMIN_TOKEN/MOLECULE_ENV are inherited from
|
||||
# $GITHUB_ENV. PLATFORM_URL is also passed explicitly because
|
||||
# $GITHUB_ENV propagation can be flaky on act_runner (#2468 RCA).
|
||||
echo "starting platform with PLATFORM_URL=${PLATFORM_URL:-<fallback>} PORT=$PORT BIND_ADDR=0.0.0.0"
|
||||
PORT=$PORT BIND_ADDR=0.0.0.0 PLATFORM_URL="${PLATFORM_URL:-http://host.docker.internal:$PORT}" ./platform-server > platform.log 2>&1 &
|
||||
# $GITHUB_ENV.
|
||||
PORT=$PORT ./platform-server > platform.log 2>&1 &
|
||||
echo $! > platform.pid
|
||||
|
||||
- name: Wait for /health (+ migrations applied)
|
||||
@@ -228,11 +198,6 @@ jobs:
|
||||
sleep 1
|
||||
done
|
||||
|
||||
- name: Verify platform reachable from molecule-core-net
|
||||
run: |
|
||||
echo "Testing platform reachability from molecule-core-net container..."
|
||||
docker run --rm --network molecule-core-net alpine:latest sh -c "wget -qO- http://${PLATFORM_URL#http://}/health" || echo "WARN: platform not reachable from molecule-core-net"
|
||||
|
||||
- name: Run local-provision lifecycle E2E (stub — REQUIRED)
|
||||
run: bash tests/e2e/test_local_provision_lifecycle_e2e.sh
|
||||
|
||||
@@ -240,15 +205,6 @@ jobs:
|
||||
if: failure()
|
||||
run: cat workspace-server/platform.log || true
|
||||
|
||||
- name: Dump workspace container logs on failure
|
||||
if: failure()
|
||||
run: |
|
||||
WS_NAME=$(docker ps --filter "name=ws-" --format '{{.Names}}' | head -1 || true)
|
||||
if [ -n "$WS_NAME" ]; then
|
||||
echo "=== Workspace container logs for $WS_NAME ==="
|
||||
docker logs "$WS_NAME" 2>&1 | tail -n 80 || true
|
||||
fi
|
||||
|
||||
- name: Stop platform
|
||||
if: always()
|
||||
run: |
|
||||
@@ -292,12 +248,6 @@ jobs:
|
||||
# even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
|
||||
MOLECULE_ENV: development
|
||||
SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
|
||||
# act_runner runs the job inside a Docker container, so /.dockerenv exists
|
||||
# and the platform auto-detects platformInDocker=true. But the job container
|
||||
# is NOT on molecule-core-net, so it cannot resolve workspace container
|
||||
# hostnames (ws-<id>:8000). Force false so the proxy keeps using the
|
||||
# host-mapped 127.0.0.1:<ephemeral_port> URL, which IS reachable.
|
||||
MOLECULE_IN_DOCKER: false
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
||||
@@ -347,29 +297,7 @@ jobs:
|
||||
# jobs or stale processes from prior cancelled runs (see #2450).
|
||||
PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
|
||||
echo "PORT=${PORT}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://127.0.0.1:${PORT}" >> "$GITHUB_ENV"
|
||||
# Discover an IP that Docker containers can use to reach the host platform.
|
||||
# host.docker.internal is not reliably available on Linux (act_runner), so
|
||||
# workspace containers cannot resolve it and fail to register/heartbeat.
|
||||
# Workspace containers join molecule-core-net; the host is reachable via that
|
||||
# network's gateway. Ensure the network exists first (the provisioner creates
|
||||
# it lazily, but we need the gateway BEFORE starting the platform).
|
||||
docker network inspect molecule-core-net >/dev/null 2>&1 || docker network create molecule-core-net >/dev/null
|
||||
# Parse Gateway from raw JSON because --format '{{.IPAM.Config}}' is
|
||||
# inconsistent across Docker versions (sometimes omits Gateway field).
|
||||
PLATFORM_HOST_IP=$(docker network inspect molecule-core-net 2>/dev/null | sed -n 's/.*"Gateway": "\([^"]*\)".*/\1/p' | head -1)
|
||||
if [ -z "$PLATFORM_HOST_IP" ]; then
|
||||
PLATFORM_HOST_IP=$(docker network inspect bridge 2>/dev/null | sed -n 's/.*"Gateway": "\([^"]*\)".*/\1/p' | head -1)
|
||||
fi
|
||||
if [ -z "$PLATFORM_HOST_IP" ]; then
|
||||
PLATFORM_HOST_IP=$(ip route | awk '/default/ {print $3}' | head -1 || true)
|
||||
fi
|
||||
if [ -z "$PLATFORM_HOST_IP" ]; then
|
||||
echo "::error::Could not determine PLATFORM_HOST_IP for Docker containers to reach the platform"
|
||||
exit 1
|
||||
fi
|
||||
echo "PLATFORM_HOST_IP=${PLATFORM_HOST_IP}"
|
||||
echo "PLATFORM_URL=http://${PLATFORM_HOST_IP}:${PORT}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
|
||||
T="lpe2e-real-admin-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
|
||||
echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
|
||||
@@ -401,8 +329,7 @@ jobs:
|
||||
- name: Start platform (background)
|
||||
working-directory: workspace-server
|
||||
run: |
|
||||
echo "starting platform with PLATFORM_URL=${PLATFORM_URL:-<fallback>} PORT=$PORT BIND_ADDR=0.0.0.0"
|
||||
PORT=$PORT BIND_ADDR=0.0.0.0 PLATFORM_URL="${PLATFORM_URL:-http://host.docker.internal:$PORT}" ./platform-server > platform.log 2>&1 &
|
||||
PORT=$PORT ./platform-server > platform.log 2>&1 &
|
||||
echo $! > platform.pid
|
||||
|
||||
- name: Wait for /health (+ migrations applied)
|
||||
@@ -424,11 +351,6 @@ jobs:
|
||||
sleep 1
|
||||
done
|
||||
|
||||
- name: Verify platform reachable from molecule-core-net
|
||||
run: |
|
||||
echo "Testing platform reachability from molecule-core-net container..."
|
||||
docker run --rm --network molecule-core-net alpine:latest sh -c "wget -qO- http://${PLATFORM_URL#http://}/health" || echo "WARN: platform not reachable from molecule-core-net"
|
||||
|
||||
- name: Run local-provision lifecycle E2E (real image + MiniMax LLM — ADVISORY)
|
||||
env:
|
||||
# LIFECYCLE_LLM=minimax: provision the REAL claude-code template image
|
||||
@@ -453,15 +375,6 @@ jobs:
|
||||
if: failure()
|
||||
run: cat workspace-server/platform.log || true
|
||||
|
||||
- name: Dump workspace container logs on failure
|
||||
if: failure()
|
||||
run: |
|
||||
WS_NAME=$(docker ps --filter "name=ws-" --format '{{.Names}}' | head -1 || true)
|
||||
if [ -n "$WS_NAME" ]; then
|
||||
echo "=== Workspace container logs for $WS_NAME ==="
|
||||
docker logs "$WS_NAME" 2>&1 | tail -n 80 || true
|
||||
fi
|
||||
|
||||
- name: Stop platform
|
||||
if: always()
|
||||
run: |
|
||||
|
||||
@@ -530,20 +530,7 @@ jobs:
|
||||
STALE_COUNT=0
|
||||
UNREACHABLE_COUNT=0
|
||||
UNHEALTHY_COUNT=0
|
||||
QUARANTINED_COUNT=0
|
||||
# Quarantined stragglers: the CP shipped the build to the healthy
|
||||
# majority and quarantined a small minority within tolerance
|
||||
# (max_stragglers). They are reported + recovered SEPARATELY, so they
|
||||
# must not red the strict per-tenant verify — otherwise one stuck
|
||||
# tenant blocks the whole deploy, the all-or-nothing trap this fixes.
|
||||
STRAGGLERS_LIST="$(jq -r '(.stragglers // [])[]' "$RESP" 2>/dev/null || true)"
|
||||
is_straggler() { printf '%s\n' "$STRAGGLERS_LIST" | grep -qxF "$1"; }
|
||||
for slug in "${SLUGS[@]}"; do
|
||||
if is_straggler "$slug"; then
|
||||
echo "::warning::$slug is a QUARANTINED straggler — build shipped to the rest of the fleet; this tenant needs individual recovery. Skipping strict verify."
|
||||
QUARANTINED_COUNT=$((QUARANTINED_COUNT + 1))
|
||||
continue
|
||||
fi
|
||||
healthz_ok="$(jq -r --arg slug "$slug" '.results[]? | select(.slug == $slug) | .healthz_ok' "$RESP" | tail -1)"
|
||||
if [ "$healthz_ok" != "true" ]; then
|
||||
echo "::error::$slug did not report healthz_ok=true in redeploy-fleet response."
|
||||
@@ -593,7 +580,6 @@ jobs:
|
||||
echo "Stale tenants: $STALE_COUNT"
|
||||
echo "Unhealthy tenants: $UNHEALTHY_COUNT"
|
||||
echo "Unreachable tenants: $UNREACHABLE_COUNT"
|
||||
echo "Quarantined stragglers (shipped past; need recovery): $QUARANTINED_COUNT"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
if [ "$STALE_COUNT" -gt 0 ] || [ "$UNHEALTHY_COUNT" -gt 0 ] || [ "$UNREACHABLE_COUNT" -gt 0 ]; then
|
||||
|
||||
@@ -40,24 +40,14 @@ export function FlightEnvelope({
|
||||
if (!el || typeof el.animate !== "function") return;
|
||||
const dx = to.x - from.x;
|
||||
const dy = to.y - from.y;
|
||||
// Launch small from the source dot, GROW BIG as it crosses the gap (peak
|
||||
// mid-flight), then SHRINK small as it lands on the target dot — reads as an
|
||||
// envelope flung from one agent and received by the other. translate tracks
|
||||
// the straight path (fraction == keyframe offset); scale arcs independently.
|
||||
const at = (frac: number, scale: number, opacity: number, offset?: number) => ({
|
||||
transform: `translate(-50%,-50%) translate(${dx * frac}px,${dy * frac}px) scale(${scale})`,
|
||||
opacity,
|
||||
...(offset === undefined ? {} : { offset }),
|
||||
});
|
||||
const anim = el.animate(
|
||||
[
|
||||
at(0, 0.5, 0),
|
||||
at(0.2, 1.25, 1, 0.2), // faded in + grown
|
||||
at(0.5, 1.7, 1, 0.5), // BIG at mid-flight
|
||||
at(0.82, 1.05, 1, 0.82), // shrinking on approach
|
||||
at(1, 0.5, 0), // small + faded out, arrived on the target dot
|
||||
{ transform: "translate(-50%,-50%) translate(0px,0px) scale(0.45)", opacity: 0 },
|
||||
{ opacity: 1, offset: 0.16 },
|
||||
{ opacity: 1, offset: 0.8 },
|
||||
{ transform: `translate(-50%,-50%) translate(${dx}px,${dy}px) scale(1)`, opacity: 0 },
|
||||
],
|
||||
{ duration: FLIGHT_DURATION_MS, easing: "ease-in-out", fill: "forwards" },
|
||||
{ duration: FLIGHT_DURATION_MS, easing: "cubic-bezier(0.45, 0, 0.25, 1)", fill: "forwards" },
|
||||
);
|
||||
return () => anim.cancel();
|
||||
}, [from.x, from.y, to.x, to.y]);
|
||||
|
||||
@@ -4,25 +4,17 @@
|
||||
* Mounted INSIDE <ReactFlow> so its ViewportPortal places the envelope in flow
|
||||
* coordinates; it therefore pans and zooms with the canvas for free. The
|
||||
* flight lifecycle (which events become envelopes, reduced-motion opt-out,
|
||||
* expiry) lives in useA2AFlights — this component only resolves endpoints and
|
||||
* renders.
|
||||
*
|
||||
* Endpoints anchor on each workspace's STATUS DOT (the green/glowing presence
|
||||
* indicator), not the card's geometric centre — so an envelope visibly leaves
|
||||
* the source agent's dot and lands on the target agent's dot. The dot carries
|
||||
* `data-flight-anchor`; we read its rendered rect and convert screen→flow via
|
||||
* React Flow, falling back to the card centre only when the dot isn't in the
|
||||
* DOM yet (node just mounted / scrolled out). */
|
||||
import { useRef } from "react";
|
||||
import { ViewportPortal, useReactFlow, type Node } from "@xyflow/react";
|
||||
* expiry) lives in useA2AFlights — this component only resolves node centres
|
||||
* and renders. */
|
||||
import { ViewportPortal, type Node } from "@xyflow/react";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import { useA2AFlights, type A2AFlight } from "@/hooks/useA2AFlights";
|
||||
import { useA2AFlights } from "@/hooks/useA2AFlights";
|
||||
import { FlightEnvelope, type Point } from "./FlightEnvelope";
|
||||
import type { WorkspaceNodeData } from "@/store/canvas";
|
||||
|
||||
// Fallback node footprint when React Flow has not measured a node yet. Matches
|
||||
// WorkspaceNode's leaf size (w-[300px] min-h-[176px]); a slightly-off centre for
|
||||
// the first frame after mount is invisible at flight scale.
|
||||
// WorkspaceNode's leaf size (w-[300px] min-h-[176px]); a slightly-off centre
|
||||
// for the first frame after mount is invisible at flight scale.
|
||||
const DEFAULT_W = 300;
|
||||
const DEFAULT_H = 176;
|
||||
|
||||
@@ -32,76 +24,23 @@ function nodeCenter(n: Node<WorkspaceNodeData>): Point {
|
||||
return { x: n.position.x + w / 2, y: n.position.y + h / 2 };
|
||||
}
|
||||
|
||||
/** Resolve a node's status-dot centre in FLOW coordinates. Reads the dot's
|
||||
* rendered screen rect (it carries data-flight-anchor) and converts it back to
|
||||
* flow space, so the anchor is exact regardless of pan/zoom and survives any
|
||||
* header-layout change. Falls back to the card centre when the dot isn't
|
||||
* rendered. */
|
||||
function dotAnchor(
|
||||
n: Node<WorkspaceNodeData>,
|
||||
screenToFlowPosition: (p: Point) => Point,
|
||||
): Point {
|
||||
if (typeof document !== "undefined") {
|
||||
const id =
|
||||
typeof CSS !== "undefined" && typeof CSS.escape === "function" ? CSS.escape(n.id) : n.id;
|
||||
const el = document.querySelector<HTMLElement>(
|
||||
`.react-flow__node[data-id="${id}"] [data-flight-anchor]`,
|
||||
);
|
||||
if (el) {
|
||||
const r = el.getBoundingClientRect();
|
||||
if (r.width > 0 && r.height > 0) {
|
||||
return screenToFlowPosition({ x: r.left + r.width / 2, y: r.top + r.height / 2 });
|
||||
}
|
||||
}
|
||||
}
|
||||
return nodeCenter(n);
|
||||
}
|
||||
|
||||
/** One flight. Captures the source/target dot anchors ONCE on mount (a ref, not
|
||||
* per-render) so a pan/zoom or re-render mid-flight doesn't restart the
|
||||
* animation — mirrors HomeFlight's capture-once contract. */
|
||||
function CanvasFlight({
|
||||
flight,
|
||||
nodes,
|
||||
screenToFlowPosition,
|
||||
}: {
|
||||
flight: A2AFlight;
|
||||
nodes: Node<WorkspaceNodeData>[];
|
||||
screenToFlowPosition: (p: Point) => Point;
|
||||
}) {
|
||||
const pos = useRef<{ from: Point; to: Point } | null>(null);
|
||||
if (pos.current === null) {
|
||||
const src = nodes.find((n) => n.id === flight.sourceId);
|
||||
const dst = nodes.find((n) => n.id === flight.targetId);
|
||||
// Both endpoints must be on-canvas to draw a path between them.
|
||||
if (src && dst) {
|
||||
pos.current = {
|
||||
from: dotAnchor(src, screenToFlowPosition),
|
||||
to: dotAnchor(dst, screenToFlowPosition),
|
||||
};
|
||||
}
|
||||
}
|
||||
if (!pos.current) return null;
|
||||
return <FlightEnvelope from={pos.current.from} to={pos.current.to} kind={flight.kind} />;
|
||||
}
|
||||
|
||||
export function MessageFlightLayer() {
|
||||
const flights = useA2AFlights();
|
||||
const nodes = useCanvasStore((s) => s.nodes) as Node<WorkspaceNodeData>[];
|
||||
const { screenToFlowPosition } = useReactFlow();
|
||||
const nodes = useCanvasStore((s) => s.nodes);
|
||||
|
||||
if (flights.length === 0) return null;
|
||||
|
||||
return (
|
||||
<ViewportPortal>
|
||||
{flights.map((f) => (
|
||||
<CanvasFlight
|
||||
key={f.key}
|
||||
flight={f}
|
||||
nodes={nodes}
|
||||
screenToFlowPosition={screenToFlowPosition}
|
||||
/>
|
||||
))}
|
||||
{flights.map((f) => {
|
||||
const src = nodes.find((n) => n.id === f.sourceId);
|
||||
const dst = nodes.find((n) => n.id === f.targetId);
|
||||
// Both endpoints must be on-canvas to draw a path between them.
|
||||
if (!src || !dst) return null;
|
||||
return (
|
||||
<FlightEnvelope key={f.key} from={nodeCenter(src)} to={nodeCenter(dst)} kind={f.kind} />
|
||||
);
|
||||
})}
|
||||
</ViewportPortal>
|
||||
);
|
||||
}
|
||||
|
||||
@@ -215,7 +215,7 @@ export function WorkspaceNode({ id, data }: NodeProps<Node<WorkspaceNodeData>>)
|
||||
{/* Header row */}
|
||||
<div className="flex items-center justify-between gap-2 mb-2.5">
|
||||
<div className="flex items-center gap-2.5 min-w-0">
|
||||
<div data-flight-anchor className={`w-2.5 h-2.5 rounded-full shrink-0 ${statusCfg.dot} ${statusCfg.glow} shadow-sm`} />
|
||||
<div className={`w-2.5 h-2.5 rounded-full shrink-0 ${statusCfg.dot} ${statusCfg.glow} shadow-sm`} />
|
||||
<span className="text-[15px] font-semibold text-ink truncate leading-tight">
|
||||
{data.name}
|
||||
</span>
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
// @vitest-environment jsdom
|
||||
/**
|
||||
* Tests for FlightEnvelope — the envelope that animates from `from` to `to`.
|
||||
*
|
||||
* Locks the render contract the canvas + concierge-home both depend on:
|
||||
* - the envelope is positioned at the `from` point (its launch anchor),
|
||||
* - it is coloured by activity kind,
|
||||
* - it degrades gracefully when Element.animate is unavailable (jsdom / SSR).
|
||||
*
|
||||
* The grow→shrink scale arc itself uses the Web Animations API, which jsdom
|
||||
* does not implement, so we assert the static render + graceful degradation
|
||||
* rather than keyframe values.
|
||||
*/
|
||||
import React from "react";
|
||||
import { render, cleanup } from "@testing-library/react";
|
||||
import { afterEach, describe, expect, it } from "vitest";
|
||||
import { FlightEnvelope } from "../FlightEnvelope";
|
||||
|
||||
afterEach(cleanup);
|
||||
|
||||
describe("FlightEnvelope", () => {
|
||||
it("positions the envelope at the `from` launch point", () => {
|
||||
const { getByTestId } = render(
|
||||
<FlightEnvelope from={{ x: 120, y: 240 }} to={{ x: 400, y: 60 }} kind="send" />,
|
||||
);
|
||||
const el = getByTestId("flight-envelope");
|
||||
expect(el.style.left).toBe("120px");
|
||||
expect(el.style.top).toBe("240px");
|
||||
expect(el.querySelector("svg")).toBeTruthy();
|
||||
});
|
||||
|
||||
it("colours the envelope by activity kind", () => {
|
||||
const stroke = (kind: "send" | "receive" | "task") => {
|
||||
const { container } = render(
|
||||
<FlightEnvelope from={{ x: 0, y: 0 }} to={{ x: 10, y: 10 }} kind={kind} />,
|
||||
);
|
||||
const s = container.querySelector("rect")?.getAttribute("stroke");
|
||||
cleanup();
|
||||
return s;
|
||||
};
|
||||
expect(stroke("send")).toBe("#22d3ee");
|
||||
expect(stroke("receive")).toBe("#8b5cf6");
|
||||
expect(stroke("task")).toBe("#f5a623");
|
||||
});
|
||||
|
||||
it("degrades to a static render (no throw) when Element.animate is unavailable", () => {
|
||||
// jsdom does not implement Element.animate — the component must still render.
|
||||
expect(typeof document.createElement("div").animate).not.toBe("function");
|
||||
const { getByTestId } = render(
|
||||
<FlightEnvelope from={{ x: 0, y: 0 }} to={{ x: 1, y: 1 }} kind="task" />,
|
||||
);
|
||||
expect(getByTestId("flight-envelope")).toBeTruthy();
|
||||
});
|
||||
});
|
||||
@@ -272,20 +272,6 @@ func MarkQueueItemFailed(ctx context.Context, id, errMsg string) {
|
||||
}
|
||||
}
|
||||
|
||||
// QueueDepth returns the number of currently-queued (not dispatched/completed)
|
||||
// items for a workspace. Used by the busy-return response body so callers
|
||||
// can see how many ahead of them.
|
||||
func QueueDepth(ctx context.Context, workspaceID string) int {
|
||||
var n int
|
||||
if err := db.DB.QueryRowContext(ctx,
|
||||
`SELECT COUNT(*) FROM a2a_queue WHERE workspace_id = $1 AND status = 'queued'`,
|
||||
workspaceID,
|
||||
).Scan(&n); err != nil {
|
||||
log.Printf("A2AQueue: QueueDepth query failed for workspace %s: %v", workspaceID, err)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// DropStaleQueueItems marks queued items older than maxAge as 'dropped' with a
|
||||
// system-generated reason so PM agents stop processing stale post-incident noise.
|
||||
// Called with a workspaceID to scope cleanup to one workspace, or empty to sweep
|
||||
|
||||
Reference in New Issue
Block a user