feat(prod-deploy): tolerate a quarantined straggler minority in the fleet rollout #2484

Merged
molecule-code-reviewer merged 1 commit from fix/deploy-straggler-tolerance into main 2026-06-09 17:23:16 +00:00
3 changed files with 115 additions and 8 deletions
+32 -8
View File
@@ -66,6 +66,14 @@ def build_plan(env: dict[str, str]) -> dict:
"target_tag": target_tag,
"soak_seconds": _int_env(env, "PROD_AUTO_DEPLOY_SOAK_SECONDS", 60, minimum=0),
"batch_size": _int_env(env, "PROD_AUTO_DEPLOY_BATCH_SIZE", 3),
# Tolerate a small minority of individually-stuck tenants (e.g. a wedged
# data volume that won't recreate). They are QUARANTINED — shipped past
# so the healthy majority still lands the build — and reported for
# separate recovery, instead of one stuck tenant blocking the whole
# fleet deploy. The canary still must pass, the CP halts a batch the
# moment failures exceed this, and the cross-batch coverage gate below
# enforces the same tolerance globally. Default 1.
"max_stragglers": _int_env(env, "PROD_AUTO_DEPLOY_MAX_STRAGGLERS", 1, minimum=0),
"dry_run": truthy_flag(env.get("PROD_AUTO_DEPLOY_DRY_RUN", "")),
# confirm:true ack required by CP /cp/admin/tenants/redeploy-fleet
# contract (cp#228 / task #308) for fleet-wide intent. Empty body
@@ -251,26 +259,41 @@ def rollout_stragglers(enumerated: list[str], results: list[dict]) -> list[str]:
return sorted(s for s in dict.fromkeys(enumerated) if s not in verified)
def assert_full_coverage(
    enumerated: list[str], aggregate: dict, dry_run: bool, max_stragglers: int = 0
) -> None:
    """Gate the rollout on coverage, tolerating a quarantined straggler minority.

    This is the no-silent-skip gate (internal#724) made resilient: every
    enumerated tenant must be PROVEN on the target build, EXCEPT up to
    ``max_stragglers`` individually-stuck tenants which are quarantined (shipped
    past) and reported for separate recovery instead of blocking the whole
    fleet deploy. Exceeding the tolerance is a systemic failure → RolloutFailed.

    The tolerance never covers the entire fleet: if no enumerated tenant at
    all is proven on the target build, the rollout fails even when the
    straggler count is within ``max_stragglers`` (e.g. a one-tenant fleet with
    a tolerance of 1). Otherwise the gate would report success for a deploy
    that landed nowhere.

    A dry run proves nothing landed, so coverage is not asserted for it.

    Args:
        enumerated: Slugs the rollout was expected to cover. Duplicates are
            counted once.
        aggregate: Rollout result dict. Its ``results`` entry is read;
            ``stragglers`` is written whenever any straggler exists, and
            ``ok`` / ``error`` are written on failure. Mutated in place.
        dry_run: When true, the gate is skipped entirely.
        max_stragglers: Maximum number of unverified tenants to quarantine
            instead of failing. The default of 0 keeps the strict
            all-or-nothing behaviour.

    Raises:
        RolloutFailed: When the stragglers exceed ``max_stragglers``, or when
            every enumerated tenant is a straggler.
    """
    if dry_run:
        return
    stragglers = rollout_stragglers(enumerated, aggregate.get("results") or [])
    if not stragglers:
        return
    # Surface the stragglers (for the step summary + recovery), gate or not.
    aggregate["stragglers"] = stragglers
    fleet_size = len(set(enumerated))
    over_tolerance = len(stragglers) > max_stragglers
    # Quarantine is only meaningful if the build landed somewhere: a rollout
    # where every enumerated tenant is a straggler shipped nothing.
    nothing_landed = len(stragglers) >= fleet_size
    if over_tolerance or nothing_landed:
        msg = (
            f"incomplete rollout: {len(stragglers)} tenant(s) not verified on target "
            f"after redeploy-fleet (max tolerated {max_stragglers}): {', '.join(stragglers)} "
            f"(enumerated {fleet_size})"
        )
        if not over_tolerance:
            msg += "; no enumerated tenant was verified, tolerance cannot cover the whole fleet"
        aggregate["ok"] = False
        aggregate["error"] = msg
        raise RolloutFailed(msg, aggregate)
    # Within tolerance: shipped to the healthy majority; quarantine is loud,
    # not fatal. The deploy succeeds; the stragglers need individual recovery.
    print(
        f"::warning::quarantined {len(stragglers)} straggler(s) (<= max {max_stragglers}); "
        f"shipped to the rest of the fleet — these need recovery: {', '.join(stragglers)}"
    )
def execute_scoped_rollout(
@@ -325,7 +348,8 @@ def execute_scoped_rollout(
# or one enumerated but never batched, is a straggler. Surfacing it as
# a RolloutFailed makes the deploy step exit non-zero instead of
# silently reporting success (the exact agents-team failure mode).
assert_full_coverage(all_slugs, aggregate, dry_run)
max_stragglers = int(base_body.get("max_stragglers") or 0)
assert_full_coverage(all_slugs, aggregate, dry_run, max_stragglers)
return aggregate
@@ -35,6 +35,9 @@ def test_build_plan_defaults_to_staging_sha_target_and_prod_cp():
"canary_slug": "hongming",
"soak_seconds": 60,
"batch_size": 3,
# quarantine up to 1 individually-stuck tenant rather than blocking the
# whole fleet deploy (default).
"max_stragglers": 1,
"dry_run": False,
# cp#228 / task #308: fleet-wide intent must carry confirm:true.
"confirm": True,
@@ -470,6 +473,72 @@ def test_scoped_rollout_passes_when_all_tenants_verified_on_target():
assert "stragglers" not in aggregate
def test_scoped_rollout_quarantines_straggler_within_tolerance():
    """A single unverified tenant within ``max_stragglers=1`` is quarantined.

    The rollout must still report success (the build shipped to the healthy
    majority) while naming the stuck tenant in ``aggregate["stragglers"]``.
    """
    stuck_slug = "reno-stars"

    def redeploy_stub(_cp_url, _token, body):
        # Every requested slug reports back; only the stuck one never verifies.
        per_tenant = []
        for slug in body["only_slugs"]:
            per_tenant.append({"slug": slug, "verified_on_target": slug != stuck_slug})
        return 200, {"ok": True, "results": per_tenant}

    rollout_body = {
        "target_tag": "staging-new",
        "batch_size": 5,
        "dry_run": False,
        "confirm": True,
        "max_stragglers": 1,
    }
    plan = {"cp_url": "https://api.moleculesai.app", "body": rollout_body}

    aggregate = prod.execute_scoped_rollout(
        plan,
        token="secret",
        list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
        redeploy=redeploy_stub,
        sleep=lambda _s: None,
    )

    assert aggregate["ok"] is True
    assert aggregate["stragglers"] == ["reno-stars"]
def test_scoped_rollout_fails_when_stragglers_exceed_tolerance():
    """Two unverified tenants against ``max_stragglers=1`` is systemic → fail.

    The raised ``RolloutFailed`` message must state the tolerance that was
    exceeded.
    """
    only_healthy_slug = "hongming"

    def redeploy_stub(_cp_url, _token, body):
        # Only one tenant ever verifies; the other two become stragglers.
        per_tenant = []
        for slug in body["only_slugs"]:
            per_tenant.append({"slug": slug, "verified_on_target": slug == only_healthy_slug})
        return 200, {"ok": True, "results": per_tenant}

    rollout_body = {
        "target_tag": "staging-new",
        "batch_size": 5,
        "dry_run": False,
        "confirm": True,
        "max_stragglers": 1,
    }
    plan = {"cp_url": "https://api.moleculesai.app", "body": rollout_body}

    try:
        prod.execute_scoped_rollout(
            plan,
            token="secret",
            list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
            redeploy=redeploy_stub,
            sleep=lambda _s: None,
        )
    except prod.RolloutFailed as exc:
        assert "max tolerated 1" in str(exc)
    else:
        # Reaching here means the gate let an over-tolerance rollout through.
        raise AssertionError("expected RolloutFailed when stragglers exceed tolerance")
def test_scoped_rollout_dry_run_does_not_assert_coverage():
# A dry run proves nothing landed; coverage must NOT be asserted or
# every plan would fail.
@@ -530,7 +530,20 @@ jobs:
STALE_COUNT=0
UNREACHABLE_COUNT=0
UNHEALTHY_COUNT=0
QUARANTINED_COUNT=0
# Quarantined stragglers: the CP shipped the build to the healthy
# majority and quarantined a small minority within tolerance
# (max_stragglers). They are reported + recovered SEPARATELY, so they
# must not turn the strict per-tenant verify red — otherwise one stuck
# tenant blocks the whole deploy, the all-or-nothing trap this fixes.
STRAGGLERS_LIST="$(jq -r '(.stragglers // [])[]' "$RESP" 2>/dev/null || true)"
is_straggler() { printf '%s\n' "$STRAGGLERS_LIST" | grep -qxF "$1"; }
for slug in "${SLUGS[@]}"; do
if is_straggler "$slug"; then
echo "::warning::$slug is a QUARANTINED straggler — build shipped to the rest of the fleet; this tenant needs individual recovery. Skipping strict verify."
QUARANTINED_COUNT=$((QUARANTINED_COUNT + 1))
continue
fi
healthz_ok="$(jq -r --arg slug "$slug" '.results[]? | select(.slug == $slug) | .healthz_ok' "$RESP" | tail -1)"
if [ "$healthz_ok" != "true" ]; then
echo "::error::$slug did not report healthz_ok=true in redeploy-fleet response."
@@ -580,6 +593,7 @@ jobs:
echo "Stale tenants: $STALE_COUNT"
echo "Unhealthy tenants: $UNHEALTHY_COUNT"
echo "Unreachable tenants: $UNREACHABLE_COUNT"
echo "Quarantined stragglers (shipped past; need recovery): $QUARANTINED_COUNT"
} >> "$GITHUB_STEP_SUMMARY"
if [ "$STALE_COUNT" -gt 0 ] || [ "$UNHEALTHY_COUNT" -gt 0 ] || [ "$UNREACHABLE_COUNT" -gt 0 ]; then