def assert_full_coverage(
    enumerated: list[str], aggregate: dict, dry_run: bool, max_stragglers: int = 0
) -> None:
    """Gate the rollout on coverage, tolerating a quarantined straggler minority.

    This is the no-silent-skip gate (internal#724) made resilient: every
    enumerated tenant must be PROVEN on the target build, EXCEPT up to
    ``max_stragglers`` individually-stuck tenants, which are quarantined
    (shipped past) and reported for separate recovery instead of blocking the
    whole fleet deploy.

    The tolerance is only honoured while the stragglers are a strict minority
    of the unique enumerated tenants. Without that bound, a one-tenant fleet
    whose only tenant never verified (or a two-tenant fleet with one failure)
    would fit inside ``max_stragglers=1`` and be reported as a successful
    deploy even though no healthy majority received the build.

    Args:
        enumerated: Every tenant slug the rollout was expected to cover.
            Duplicates are counted once.
        aggregate: Mutable rollout summary. Its ``"results"`` entry is read;
            ``"stragglers"`` is written whenever any straggler exists, and
            ``"ok"`` / ``"error"`` are written when the gate fails.
        dry_run: When true nothing was deployed, so coverage is not asserted
            and ``aggregate`` is left untouched.
        max_stragglers: Largest number of unverified tenants that may be
            quarantined instead of failing the rollout. The default of 0
            keeps the original all-or-nothing behaviour.

    Raises:
        RolloutFailed: If the stragglers exceed ``max_stragglers`` or are not
            a strict minority of the enumerated fleet.
    """
    if dry_run:
        return
    stragglers = rollout_stragglers(enumerated, aggregate.get("results") or [])
    if not stragglers:
        return
    # Surface the stragglers (for the step summary + recovery), gate or not.
    aggregate["stragglers"] = stragglers

    fleet_size = len(set(enumerated))
    over_budget = len(stragglers) > max_stragglers
    # "Shipped to the healthy majority" must actually hold: an absolute count
    # alone would let a tiny fleet pass with half or all of it unverified.
    majority_missing = 2 * len(stragglers) >= fleet_size
    if over_budget or majority_missing:
        msg = (
            f"incomplete rollout: {len(stragglers)} tenant(s) not verified on target "
            f"after redeploy-fleet (max tolerated {max_stragglers}): {', '.join(stragglers)} "
            f"(enumerated {fleet_size})"
        )
        if not over_budget:
            # Within the numeric budget, so say why the tolerance was refused.
            msg += "; tolerance not applied: stragglers are not a strict minority of the fleet"
        aggregate["ok"] = False
        aggregate["error"] = msg
        raise RolloutFailed(msg, aggregate)

    # Within tolerance: shipped to the healthy majority; quarantine is loud,
    # not fatal. The deploy succeeds; the stragglers need individual recovery.
    print(
        f"::warning::quarantined {len(stragglers)} straggler(s) (<= max {max_stragglers}); "
        f"shipped to the rest of the fleet — these need recovery: {', '.join(stragglers)}"
    )
def test_scoped_rollout_quarantines_straggler_within_tolerance():
    """One stuck tenant within max_stragglers=1 is quarantined, not fatal."""
    stuck = "reno-stars"

    def fake_redeploy(_cp_url, _token, body):
        # Every batched slug verifies on target except the stuck one.
        outcomes = []
        for slug in body["only_slugs"]:
            outcomes.append({"slug": slug, "verified_on_target": slug != stuck})
        return 200, {"ok": True, "results": outcomes}

    plan = {
        "cp_url": "https://api.moleculesai.app",
        "body": {
            "target_tag": "staging-new",
            "batch_size": 5,
            "dry_run": False,
            "confirm": True,
            "max_stragglers": 1,
        },
    }
    aggregate = prod.execute_scoped_rollout(
        plan,
        token="secret",
        list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
        redeploy=fake_redeploy,
        sleep=lambda _s: None,
    )

    # The rollout ships to the healthy majority and reports the straggler.
    assert aggregate["ok"] is True
    assert aggregate["stragglers"] == ["reno-stars"]


def test_scoped_rollout_fails_when_stragglers_exceed_tolerance():
    """Two unverified tenants with max_stragglers=1 is systemic and must fail."""
    healthy = "hongming"

    def fake_redeploy(_cp_url, _token, body):
        # Only the healthy tenant ever verifies on target.
        outcomes = []
        for slug in body["only_slugs"]:
            outcomes.append({"slug": slug, "verified_on_target": slug == healthy})
        return 200, {"ok": True, "results": outcomes}

    plan = {
        "cp_url": "https://api.moleculesai.app",
        "body": {
            "target_tag": "staging-new",
            "batch_size": 5,
            "dry_run": False,
            "confirm": True,
            "max_stragglers": 1,
        },
    }
    try:
        prod.execute_scoped_rollout(
            plan,
            token="secret",
            list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"],
            redeploy=fake_redeploy,
            sleep=lambda _s: None,
        )
    except prod.RolloutFailed as exc:
        assert "max tolerated 1" in str(exc)
    else:
        raise AssertionError("expected RolloutFailed when stragglers exceed tolerance")
# is_straggler SLUG
# Succeeds (exit 0) when SLUG appears as a whole line in STRAGGLERS_LIST
# (newline-separated slugs), and fails otherwise.
#   - An empty SLUG never matches; without the guard it would match the blank
#     line produced by an empty list.
#   - `--` stops option parsing so a slug starting with `-` is treated as a
#     pattern, not as grep flags.
#   - A here-string replaces `printf | grep -q`, so there is no pipeline whose
#     writer can take SIGPIPE and turn a match into a failure under pipefail.
is_straggler() {
  [ -n "${1:-}" ] || return 1
  grep -qxF -- "$1" <<<"${STRAGGLERS_LIST:-}"
}