feat(prod-deploy): tolerate a quarantined straggler minority in the fleet rollout #2484
@@ -66,6 +66,14 @@ def build_plan(env: dict[str, str]) -> dict:
|
||||
"target_tag": target_tag,
|
||||
"soak_seconds": _int_env(env, "PROD_AUTO_DEPLOY_SOAK_SECONDS", 60, minimum=0),
|
||||
"batch_size": _int_env(env, "PROD_AUTO_DEPLOY_BATCH_SIZE", 3),
|
||||
# Tolerate a small minority of individually-stuck tenants (e.g. a wedged
|
||||
# data volume that won't recreate). They are QUARANTINED — shipped past
|
||||
# so the healthy majority still lands the build — and reported for
|
||||
# separate recovery, instead of one stuck tenant blocking the whole
|
||||
# fleet deploy. The canary still must pass, the CP halts a batch the
|
||||
# moment failures exceed this, and the cross-batch coverage gate below
|
||||
# enforces the same tolerance globally. Default 1.
|
||||
"max_stragglers": _int_env(env, "PROD_AUTO_DEPLOY_MAX_STRAGGLERS", 1, minimum=0),
|
||||
"dry_run": truthy_flag(env.get("PROD_AUTO_DEPLOY_DRY_RUN", "")),
|
||||
# confirm:true ack required by CP /cp/admin/tenants/redeploy-fleet
|
||||
# contract (cp#228 / task #308) for fleet-wide intent. Empty body
|
||||
@@ -251,26 +259,41 @@ def rollout_stragglers(enumerated: list[str], results: list[dict]) -> list[str]:
|
||||
return sorted(s for s in dict.fromkeys(enumerated) if s not in verified)
|
||||
|
||||
|
||||
def assert_full_coverage(enumerated: list[str], aggregate: dict, dry_run: bool) -> None:
|
||||
"""Fail the rollout if any enumerated tenant is not on the target build.
|
||||
def assert_full_coverage(
|
||||
enumerated: list[str], aggregate: dict, dry_run: bool, max_stragglers: int = 0
|
||||
) -> None:
|
||||
"""Gate the rollout on coverage, tolerating a quarantined straggler minority.
|
||||
|
||||
This is the no-silent-skip gate (internal#724). A dry run proves
|
||||
nothing landed, so coverage is not asserted for it.
|
||||
This is the no-silent-skip gate (internal#724) made resilient: every
|
||||
enumerated tenant must be PROVEN on the target build, EXCEPT up to
|
||||
``max_stragglers`` individually-stuck tenants which are quarantined (shipped
|
||||
past) and reported for separate recovery instead of blocking the whole
|
||||
fleet deploy. Exceeding the tolerance is a systemic failure → RolloutFailed.
|
||||
A dry run proves nothing landed, so coverage is not asserted for it.
|
||||
"""
|
||||
|
||||
if dry_run:
|
||||
return
|
||||
stragglers = rollout_stragglers(enumerated, aggregate.get("results") or [])
|
||||
if stragglers:
|
||||
if not stragglers:
|
||||
return
|
||||
# Surface the stragglers (for the step summary + recovery), gate or not.
|
||||
aggregate["stragglers"] = stragglers
|
||||
if len(stragglers) > max_stragglers:
|
||||
msg = (
|
||||
f"incomplete rollout: {len(stragglers)} tenant(s) not verified on target "
|
||||
f"after redeploy-fleet: {', '.join(stragglers)} "
|
||||
f"after redeploy-fleet (max tolerated {max_stragglers}): {', '.join(stragglers)} "
|
||||
f"(enumerated {len(set(enumerated))})"
|
||||
)
|
||||
aggregate["ok"] = False
|
||||
aggregate["error"] = msg
|
||||
aggregate["stragglers"] = stragglers
|
||||
raise RolloutFailed(msg, aggregate)
|
||||
# Within tolerance: shipped to the healthy majority; quarantine is loud,
|
||||
# not fatal. The deploy succeeds; the stragglers need individual recovery.
|
||||
print(
|
||||
f"::warning::quarantined {len(stragglers)} straggler(s) (<= max {max_stragglers}); "
|
||||
f"shipped to the rest of the fleet — these need recovery: {', '.join(stragglers)}"
|
||||
)
|
||||
|
||||
|
||||
def execute_scoped_rollout(
|
||||
@@ -325,7 +348,8 @@ def execute_scoped_rollout(
|
||||
# or one enumerated but never batched, is a straggler. Surfacing it as
|
||||
# a RolloutFailed makes the deploy step exit non-zero instead of
|
||||
# silently reporting success (the exact agents-team failure mode). Up to
# max_stragglers stragglers are tolerated: reported with a warning, not raised.
|
||||
assert_full_coverage(all_slugs, aggregate, dry_run)
|
||||
max_stragglers = int(base_body.get("max_stragglers") or 0)
|
||||
assert_full_coverage(all_slugs, aggregate, dry_run, max_stragglers)
|
||||
|
||||
return aggregate
|
||||
|
||||
|
||||
@@ -35,6 +35,9 @@ def test_build_plan_defaults_to_staging_sha_target_and_prod_cp():
|
||||
"canary_slug": "hongming",
|
||||
"soak_seconds": 60,
|
||||
"batch_size": 3,
|
||||
# quarantine up to 1 individually-stuck tenant rather than blocking the
|
||||
# whole fleet deploy (default).
|
||||
"max_stragglers": 1,
|
||||
"dry_run": False,
|
||||
# cp#228 / task #308: fleet-wide intent must carry confirm:true.
|
||||
"confirm": True,
|
||||
@@ -470,6 +473,72 @@ def test_scoped_rollout_passes_when_all_tenants_verified_on_target():
|
||||
assert "stragglers" not in aggregate
|
||||
|
||||
|
||||
def test_scoped_rollout_quarantines_straggler_within_tolerance():
|
||||
# reno-stars never verifies on target; max_stragglers=1 tolerates it — the
|
||||
# rollout still succeeds (ships to the healthy majority) and reports the
|
||||
# quarantined straggler instead of failing the whole deploy.
|
||||
def fake_redeploy(_cp_url, _token, body):
|
||||
return 200, {
|
||||
"ok": True,
|
||||
"results": [
|
||||
{"slug": s, "verified_on_target": (s != "reno-stars")}
|
||||
for s in body["only_slugs"]
|
||||
],
|
||||
}
|
||||
|
||||
aggregate = prod.execute_scoped_rollout(
|
||||
{
|
||||
"cp_url": "https://api.moleculesai.app",
|
||||
"body": {
|
||||
"target_tag": "staging-new",
|
||||
"batch_size": 5,
|
||||
"dry_run": False,
|
||||
"confirm": True,
|
||||
"max_stragglers": 1,
|
||||
},
|
||||
},
|
||||
token="secret",
|
||||
list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
|
||||
redeploy=fake_redeploy,
|
||||
sleep=lambda _s: None,
|
||||
)
|
||||
assert aggregate["ok"] is True
|
||||
assert aggregate["stragglers"] == ["reno-stars"]
|
||||
|
||||
|
||||
def test_scoped_rollout_fails_when_stragglers_exceed_tolerance():
|
||||
# Two tenants never verify; with max_stragglers=1 that is systemic → fail.
|
||||
def fake_redeploy(_cp_url, _token, body):
|
||||
return 200, {
|
||||
"ok": True,
|
||||
"results": [
|
||||
{"slug": s, "verified_on_target": (s == "hongming")}
|
||||
for s in body["only_slugs"]
|
||||
],
|
||||
}
|
||||
|
||||
try:
|
||||
prod.execute_scoped_rollout(
|
||||
{
|
||||
"cp_url": "https://api.moleculesai.app",
|
||||
"body": {
|
||||
"target_tag": "staging-new",
|
||||
"batch_size": 5,
|
||||
"dry_run": False,
|
||||
"confirm": True,
|
||||
"max_stragglers": 1,
|
||||
},
|
||||
},
|
||||
token="secret",
|
||||
list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
|
||||
redeploy=fake_redeploy,
|
||||
sleep=lambda _s: None,
|
||||
)
|
||||
raise AssertionError("expected RolloutFailed when stragglers exceed tolerance")
|
||||
except prod.RolloutFailed as exc:
|
||||
assert "max tolerated 1" in str(exc)
|
||||
|
||||
|
||||
def test_scoped_rollout_dry_run_does_not_assert_coverage():
|
||||
# A dry run proves nothing landed; coverage must NOT be asserted or
|
||||
# every plan would fail.
|
||||
|
||||
@@ -530,7 +530,20 @@ jobs:
|
||||
STALE_COUNT=0
|
||||
UNREACHABLE_COUNT=0
|
||||
UNHEALTHY_COUNT=0
|
||||
QUARANTINED_COUNT=0
|
||||
# Quarantined stragglers: the CP shipped the build to the healthy
|
||||
# majority and quarantined a small minority within tolerance
|
||||
# (max_stragglers). They are reported + recovered SEPARATELY, so they
|
||||
# must not fail the strict per-tenant verify — otherwise one stuck
|
||||
# tenant blocks the whole deploy, the all-or-nothing trap this fixes.
|
||||
STRAGGLERS_LIST="$(jq -r '(.stragglers // [])[]' "$RESP" 2>/dev/null || true)"
|
||||
is_straggler() { printf '%s\n' "$STRAGGLERS_LIST" | grep -qxF "$1"; }
|
||||
for slug in "${SLUGS[@]}"; do
|
||||
if is_straggler "$slug"; then
|
||||
echo "::warning::$slug is a QUARANTINED straggler — build shipped to the rest of the fleet; this tenant needs individual recovery. Skipping strict verify."
|
||||
QUARANTINED_COUNT=$((QUARANTINED_COUNT + 1))
|
||||
continue
|
||||
fi
|
||||
healthz_ok="$(jq -r --arg slug "$slug" '.results[]? | select(.slug == $slug) | .healthz_ok' "$RESP" | tail -1)"
|
||||
if [ "$healthz_ok" != "true" ]; then
|
||||
echo "::error::$slug did not report healthz_ok=true in redeploy-fleet response."
|
||||
@@ -580,6 +593,7 @@ jobs:
|
||||
echo "Stale tenants: $STALE_COUNT"
|
||||
echo "Unhealthy tenants: $UNHEALTHY_COUNT"
|
||||
echo "Unreachable tenants: $UNREACHABLE_COUNT"
|
||||
echo "Quarantined stragglers (shipped past; need recovery): $QUARANTINED_COUNT"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
if [ "$STALE_COUNT" -gt 0 ] || [ "$UNHEALTHY_COUNT" -gt 0 ] || [ "$UNREACHABLE_COUNT" -gt 0 ]; then
|
||||
|
||||
Reference in New Issue
Block a user