fix(prod-auto-deploy): fail on tenants not verified on target build (internal#724) #1998
@@ -208,6 +208,61 @@ def _raise_for_redeploy_result(status: int, body: dict, slugs: list[str]) -> Non
|
||||
)
|
||||
|
||||
|
||||
def rollout_stragglers(enumerated: list[str], results: list[dict]) -> list[str]:
|
||||
"""Return every enumerated tenant NOT proven on the target build.
|
||||
|
||||
A straggler is any tenant the rollout was supposed to cover that the
|
||||
CP could not verify is running the target image tag — whether it
|
||||
errored, was skipped, or SSM-succeeded onto the wrong image
|
||||
(internal#724). CP marks each per-tenant result row with
|
||||
``verified_on_target`` (the REDEPLOY_RUNNING_IMAGE docker-inspect
|
||||
proof). A tenant enumerated for the rollout but absent from the
|
||||
result set (no batch ever ran it) is also a straggler — that is the
|
||||
exact agents-team silent-skip class.
|
||||
|
||||
Backward-compat: an OLDER CP that doesn't emit ``verified_on_target``
|
||||
yet returns rows without the key. Treat a missing key as verified so
|
||||
this surfacing degrades to the previous (ok-based) behavior against an
|
||||
un-upgraded CP, rather than failing every deploy spuriously. Once the
|
||||
CP fix is deployed the key is always present and real stragglers are
|
||||
caught.
|
||||
"""
|
||||
|
||||
verified: set[str] = set()
|
||||
for row in results:
|
||||
if str(row.get("ssm_status") or "") == "DryRun":
|
||||
continue
|
||||
slug = str(row.get("slug") or "").strip()
|
||||
if not slug:
|
||||
continue
|
||||
# Missing key (old CP) => assume verified; present key is authoritative.
|
||||
if "verified_on_target" not in row or row.get("verified_on_target"):
|
||||
verified.add(slug)
|
||||
return sorted(s for s in dict.fromkeys(enumerated) if s not in verified)
|
||||
|
||||
|
||||
def assert_full_coverage(enumerated: list[str], aggregate: dict, dry_run: bool) -> None:
|
||||
"""Fail the rollout if any enumerated tenant is not on the target build.
|
||||
|
||||
This is the no-silent-skip gate (internal#724). A dry run proves
|
||||
nothing landed, so coverage is not asserted for it.
|
||||
"""
|
||||
|
||||
if dry_run:
|
||||
return
|
||||
stragglers = rollout_stragglers(enumerated, aggregate.get("results") or [])
|
||||
if stragglers:
|
||||
msg = (
|
||||
f"incomplete rollout: {len(stragglers)} tenant(s) not verified on target "
|
||||
f"after redeploy-fleet: {', '.join(stragglers)} "
|
||||
f"(enumerated {len(set(enumerated))})"
|
||||
)
|
||||
aggregate["ok"] = False
|
||||
aggregate["error"] = msg
|
||||
aggregate["stragglers"] = stragglers
|
||||
raise RolloutFailed(msg, aggregate)
|
||||
|
||||
|
||||
def execute_scoped_rollout(
|
||||
plan: dict,
|
||||
token: str,
|
||||
@@ -254,6 +309,14 @@ def execute_scoped_rollout(
|
||||
aggregate["error"] = str(exc)
|
||||
raise RolloutFailed(str(exc), aggregate) from exc
|
||||
|
||||
# No-silent-skip coverage gate (internal#724): every enumerated tenant
|
||||
# must be PROVEN on the target build. A per-tenant HTTP-200/ok response
|
||||
# is not proof — a tenant that SSM-succeeded but stayed on the old tag,
|
||||
# or one enumerated but never batched, is a straggler. Surfacing it as
|
||||
# a RolloutFailed makes the deploy step exit non-zero instead of
|
||||
# silently reporting success (the exact agents-team failure mode).
|
||||
assert_full_coverage(all_slugs, aggregate, dry_run)
|
||||
|
||||
return aggregate
|
||||
|
||||
|
||||
|
||||
@@ -355,3 +355,134 @@ def test_rollout_from_plan_file_writes_partial_response_on_failure(tmp_path):
|
||||
assert response_path.read_text(encoding="utf-8").strip()
|
||||
assert '"ok": false' in response_path.read_text(encoding="utf-8")
|
||||
assert '"slug": "hongming"' in response_path.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# No-silent-skip coverage gate (internal#724)
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_rollout_stragglers_flags_tenant_not_on_target():
|
||||
# b SSM-succeeded but its container is on the old tag → straggler.
|
||||
stragglers = prod.rollout_stragglers(
|
||||
["a", "b", "c"],
|
||||
[
|
||||
{"slug": "a", "verified_on_target": True},
|
||||
{"slug": "b", "verified_on_target": False, "running_image": "platform-tenant:staging-old"},
|
||||
{"slug": "c", "verified_on_target": True},
|
||||
],
|
||||
)
|
||||
assert stragglers == ["b"]
|
||||
|
||||
|
||||
def test_rollout_stragglers_flags_enumerated_tenant_with_no_result():
|
||||
# agents-team class: enumerated but no batch ever produced a row for it.
|
||||
stragglers = prod.rollout_stragglers(
|
||||
["a", "agents-team"],
|
||||
[{"slug": "a", "verified_on_target": True}],
|
||||
)
|
||||
assert stragglers == ["agents-team"]
|
||||
|
||||
|
||||
def test_rollout_stragglers_missing_key_is_backward_compatible():
|
||||
# Older CP without verified_on_target → treat as verified (no spurious fail).
|
||||
stragglers = prod.rollout_stragglers(
|
||||
["a", "b"],
|
||||
[{"slug": "a", "healthz_ok": True}, {"slug": "b", "healthz_ok": True}],
|
||||
)
|
||||
assert stragglers == []
|
||||
|
||||
|
||||
def test_rollout_stragglers_ignores_dry_run_rows():
|
||||
stragglers = prod.rollout_stragglers(
|
||||
["a"], [{"slug": "a", "ssm_status": "DryRun"}]
|
||||
)
|
||||
# dry-run row is skipped, so "a" has no verifying row → straggler.
|
||||
assert stragglers == ["a"]
|
||||
|
||||
|
||||
def test_scoped_rollout_fails_when_a_tenant_stays_on_old_tag():
|
||||
# Every per-tenant call returns ok=True, but agents-team is NOT
|
||||
# verified_on_target. The rollout must still fail loudly — this is
|
||||
# the exact "reported success, one tenant silently skipped" bug.
|
||||
def fake_redeploy(_cp_url, _token, body):
|
||||
rows = []
|
||||
for slug in body["only_slugs"]:
|
||||
rows.append({"slug": slug, "verified_on_target": slug != "agents-team"})
|
||||
return 200, {"ok": True, "results": rows}
|
||||
|
||||
try:
|
||||
prod.execute_scoped_rollout(
|
||||
{
|
||||
"cp_url": "https://api.moleculesai.app",
|
||||
"body": {
|
||||
"target_tag": "staging-new",
|
||||
"batch_size": 5,
|
||||
"dry_run": False,
|
||||
"confirm": True,
|
||||
},
|
||||
},
|
||||
token="secret",
|
||||
list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
|
||||
redeploy=fake_redeploy,
|
||||
sleep=lambda _s: None,
|
||||
)
|
||||
except prod.RolloutFailed as exc:
|
||||
assert "incomplete rollout" in str(exc)
|
||||
assert exc.response["stragglers"] == ["agents-team"]
|
||||
assert exc.response["ok"] is False
|
||||
else:
|
||||
raise AssertionError("expected an incomplete rollout to fail loudly")
|
||||
|
||||
|
||||
def test_scoped_rollout_passes_when_all_tenants_verified_on_target():
|
||||
def fake_redeploy(_cp_url, _token, body):
|
||||
return 200, {
|
||||
"ok": True,
|
||||
"results": [{"slug": s, "verified_on_target": True} for s in body["only_slugs"]],
|
||||
}
|
||||
|
||||
aggregate = prod.execute_scoped_rollout(
|
||||
{
|
||||
"cp_url": "https://api.moleculesai.app",
|
||||
"body": {
|
||||
"target_tag": "staging-new",
|
||||
"batch_size": 5,
|
||||
"dry_run": False,
|
||||
"confirm": True,
|
||||
},
|
||||
},
|
||||
token="secret",
|
||||
list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
|
||||
redeploy=fake_redeploy,
|
||||
sleep=lambda _s: None,
|
||||
)
|
||||
assert aggregate["ok"] is True
|
||||
assert "stragglers" not in aggregate
|
||||
|
||||
|
||||
def test_scoped_rollout_dry_run_does_not_assert_coverage():
|
||||
# A dry run proves nothing landed; coverage must NOT be asserted or
|
||||
# every plan would fail.
|
||||
def fake_redeploy(_cp_url, _token, body):
|
||||
return 200, {
|
||||
"ok": True,
|
||||
"results": [{"slug": s, "ssm_status": "DryRun"} for s in body["only_slugs"]],
|
||||
}
|
||||
|
||||
aggregate = prod.execute_scoped_rollout(
|
||||
{
|
||||
"cp_url": "https://api.moleculesai.app",
|
||||
"body": {
|
||||
"target_tag": "staging-new",
|
||||
"batch_size": 5,
|
||||
"dry_run": True,
|
||||
"confirm": True,
|
||||
},
|
||||
},
|
||||
token="secret",
|
||||
list_slugs=lambda _u, _t, _b: ["a", "b"],
|
||||
redeploy=fake_redeploy,
|
||||
sleep=lambda _s: None,
|
||||
)
|
||||
assert aggregate["ok"] is True
|
||||
|
||||
@@ -327,13 +327,27 @@ jobs:
|
||||
echo ""
|
||||
echo "### Per-tenant result"
|
||||
echo ""
|
||||
echo "| Slug | Phase | SSM Status | Exit | Healthz | Error present |"
|
||||
echo "|------|-------|------------|------|---------|---------------|"
|
||||
jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \((.error // "") != "") |"' "$HTTP_RESPONSE" || true
|
||||
echo "| Slug | Phase | SSM Status | Exit | Healthz | On target | Error present |"
|
||||
echo "|------|-------|------------|------|---------|-----------|---------------|"
|
||||
jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.verified_on_target) | \((.error // "") != "") |"' "$HTTP_RESPONSE" || true
|
||||
# internal#724: stragglers are tenants enumerated but not proven
|
||||
# on the target build. Surface them loudly — a non-empty list
|
||||
# means the rollout did NOT fully land.
|
||||
STRAGGLERS="$(jq -r '(.stragglers // []) | join(", ")' "$HTTP_RESPONSE")"
|
||||
if [ -n "$STRAGGLERS" ]; then
|
||||
echo ""
|
||||
echo "### ⚠ Stragglers (NOT on target tag \`$TARGET_TAG\`)"
|
||||
echo ""
|
||||
echo "\`$STRAGGLERS\`"
|
||||
fi
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
OK="$(jq -r '.ok' "$HTTP_RESPONSE")"
|
||||
if [ "$OK" != "true" ]; then
|
||||
STRAGGLERS="$(jq -r '(.stragglers // []) | join(", ")' "$HTTP_RESPONSE")"
|
||||
if [ -n "$STRAGGLERS" ]; then
|
||||
echo "::error::incomplete rollout — tenants not on target tag $TARGET_TAG: $STRAGGLERS"
|
||||
fi
|
||||
echo "::error::redeploy-fleet reported ok=false; production rollout halted."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user