From 367bc1f7fcf7f842d9e8ccec5c9a05515e32bea8 Mon Sep 17 00:00:00 2001 From: hongming Date: Thu, 28 May 2026 14:41:12 -0700 Subject: [PATCH] fix(prod-auto-deploy): fail on tenants not verified on target build (internal#724) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The production auto-deploy aggregated per-tenant redeploy-fleet results but never asserted fleet COVERAGE: a tenant that was enumerated but silently skipped, or that SSM-succeeded onto the old image, passed as a clean deploy. That is how agents-team stayed 46h behind the fleet with no straggler reported. Pairs with the controlplane fix that adds per-tenant verified_on_target (docker-inspect proof the container is on the target tag). This change: - rollout_stragglers(): every enumerated tenant NOT proven on the target build is a straggler — errored, skipped (no result row, the agents-team class), or verified_on_target=false. Backward-compatible: a missing key (pre-fix CP) is treated as verified so the gate degrades to the old ok-based behavior against an un-upgraded CP rather than failing spuriously. - assert_full_coverage(): raises RolloutFailed (→ non-zero exit, response JSON written with ok=false + stragglers) when any straggler remains after a non-dry-run rollout. A dry run asserts nothing (it proves nothing landed). - publish-workspace-server-image.yml: per-tenant summary gains an "On target" column and a loud ⚠ Stragglers section; the step emits a ::error:: naming the off-target tenants before failing. Tests: straggler detection (off-target, no-result, dry-run-skip, backward-compat missing key) + end-to-end execute_scoped_rollout fail/pass — mutation-verified RED with the coverage gate removed. All existing prod-auto-deploy tests still pass; ruff + py_compile clean; workflow YAML validates. Refs: internal#724 Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitea/scripts/prod-auto-deploy.py | 63 +++++++++ .gitea/scripts/tests/test_prod_auto_deploy.py | 131 ++++++++++++++++++ .../publish-workspace-server-image.yml | 20 ++- 3 files changed, 211 insertions(+), 3 deletions(-) diff --git a/.gitea/scripts/prod-auto-deploy.py b/.gitea/scripts/prod-auto-deploy.py index 2dddd864b..31f905ddf 100644 --- a/.gitea/scripts/prod-auto-deploy.py +++ b/.gitea/scripts/prod-auto-deploy.py @@ -208,6 +208,61 @@ def _raise_for_redeploy_result(status: int, body: dict, slugs: list[str]) -> Non ) +def rollout_stragglers(enumerated: list[str], results: list[dict]) -> list[str]: + """Return every enumerated tenant NOT proven on the target build. + + A straggler is any tenant the rollout was supposed to cover that the + CP could not verify is running the target image tag — whether it + errored, was skipped, or SSM-succeeded onto the wrong image + (internal#724). CP marks each per-tenant result row with + ``verified_on_target`` (the REDEPLOY_RUNNING_IMAGE docker-inspect + proof). A tenant enumerated for the rollout but absent from the + result set (no batch ever ran it) is also a straggler — that is the + exact agents-team silent-skip class. + + Backward-compat: an OLDER CP that doesn't emit ``verified_on_target`` + yet returns rows without the key. Treat a missing key as verified so + this surfacing degrades to the previous (ok-based) behavior against an + un-upgraded CP, rather than failing every deploy spuriously. Once the + CP fix is deployed the key is always present and real stragglers are + caught. + """ + + verified: set[str] = set() + for row in results: + if str(row.get("ssm_status") or "") == "DryRun": + continue + slug = str(row.get("slug") or "").strip() + if not slug: + continue + # Missing key (old CP) => assume verified; present key is authoritative. + if "verified_on_target" not in row or row.get("verified_on_target"): + verified.add(slug) + return sorted(s for s in dict.fromkeys(enumerated) if s not in verified) + + +def assert_full_coverage(enumerated: list[str], aggregate: dict, dry_run: bool) -> None: + """Fail the rollout if any enumerated tenant is not on the target build. + + This is the no-silent-skip gate (internal#724). A dry run proves + nothing landed, so coverage is not asserted for it. + """ + + if dry_run: + return + stragglers = rollout_stragglers(enumerated, aggregate.get("results") or []) + if stragglers: + msg = ( + f"incomplete rollout: {len(stragglers)} tenant(s) not verified on target " + f"after redeploy-fleet: {', '.join(stragglers)} " + f"(enumerated {len(set(enumerated))})" + ) + aggregate["ok"] = False + aggregate["error"] = msg + aggregate["stragglers"] = stragglers + raise RolloutFailed(msg, aggregate) + + def execute_scoped_rollout( plan: dict, token: str, @@ -254,6 +309,14 @@ def execute_scoped_rollout( aggregate["error"] = str(exc) raise RolloutFailed(str(exc), aggregate) from exc + # No-silent-skip coverage gate (internal#724): every enumerated tenant + # must be PROVEN on the target build. A per-tenant HTTP-200/ok response + # is not proof — a tenant that SSM-succeeded but stayed on the old tag, + # or one enumerated but never batched, is a straggler. Surfacing it as + # a RolloutFailed makes the deploy step exit non-zero instead of + # silently reporting success (the exact agents-team failure mode). + assert_full_coverage(all_slugs, aggregate, dry_run) + return aggregate diff --git a/.gitea/scripts/tests/test_prod_auto_deploy.py b/.gitea/scripts/tests/test_prod_auto_deploy.py index f3e92548a..9808ffb33 100644 --- a/.gitea/scripts/tests/test_prod_auto_deploy.py +++ b/.gitea/scripts/tests/test_prod_auto_deploy.py @@ -355,3 +355,134 @@ def test_rollout_from_plan_file_writes_partial_response_on_failure(tmp_path): assert response_path.read_text(encoding="utf-8").strip() assert '"ok": false' in response_path.read_text(encoding="utf-8") assert '"slug": "hongming"' in response_path.read_text(encoding="utf-8") + + +# ────────────────────────────────────────────────────────────────────── +# No-silent-skip coverage gate (internal#724) +# ────────────────────────────────────────────────────────────────────── + + +def test_rollout_stragglers_flags_tenant_not_on_target(): + # b SSM-succeeded but its container is on the old tag → straggler. + stragglers = prod.rollout_stragglers( + ["a", "b", "c"], + [ + {"slug": "a", "verified_on_target": True}, + {"slug": "b", "verified_on_target": False, "running_image": "platform-tenant:staging-old"}, + {"slug": "c", "verified_on_target": True}, + ], + ) + assert stragglers == ["b"] + + +def test_rollout_stragglers_flags_enumerated_tenant_with_no_result(): + # agents-team class: enumerated but no batch ever produced a row for it. + stragglers = prod.rollout_stragglers( + ["a", "agents-team"], + [{"slug": "a", "verified_on_target": True}], + ) + assert stragglers == ["agents-team"] + + +def test_rollout_stragglers_missing_key_is_backward_compatible(): + # Older CP without verified_on_target → treat as verified (no spurious fail). + stragglers = prod.rollout_stragglers( + ["a", "b"], + [{"slug": "a", "healthz_ok": True}, {"slug": "b", "healthz_ok": True}], + ) + assert stragglers == [] + + +def test_rollout_stragglers_ignores_dry_run_rows(): + stragglers = prod.rollout_stragglers( + ["a"], [{"slug": "a", "ssm_status": "DryRun"}] + ) + # dry-run row is skipped, so "a" has no verifying row → straggler. + assert stragglers == ["a"] + + +def test_scoped_rollout_fails_when_a_tenant_stays_on_old_tag(): + # Every per-tenant call returns ok=True, but agents-team is NOT + # verified_on_target. The rollout must still fail loudly — this is + # the exact "reported success, one tenant silently skipped" bug. + def fake_redeploy(_cp_url, _token, body): + rows = [] + for slug in body["only_slugs"]: + rows.append({"slug": slug, "verified_on_target": slug != "agents-team"}) + return 200, {"ok": True, "results": rows} + + try: + prod.execute_scoped_rollout( + { + "cp_url": "https://api.moleculesai.app", + "body": { + "target_tag": "staging-new", + "batch_size": 5, + "dry_run": False, + "confirm": True, + }, + }, + token="secret", + list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"], + redeploy=fake_redeploy, + sleep=lambda _s: None, + ) + except prod.RolloutFailed as exc: + assert "incomplete rollout" in str(exc) + assert exc.response["stragglers"] == ["agents-team"] + assert exc.response["ok"] is False + else: + raise AssertionError("expected an incomplete rollout to fail loudly") + + +def test_scoped_rollout_passes_when_all_tenants_verified_on_target(): + def fake_redeploy(_cp_url, _token, body): + return 200, { + "ok": True, + "results": [{"slug": s, "verified_on_target": True} for s in body["only_slugs"]], + } + + aggregate = prod.execute_scoped_rollout( + { + "cp_url": "https://api.moleculesai.app", + "body": { + "target_tag": "staging-new", + "batch_size": 5, + "dry_run": False, + "confirm": True, + }, + }, + token="secret", + list_slugs=lambda _u, _t, _b: ["reno-stars", "agents-team", "hongming"], + redeploy=fake_redeploy, + sleep=lambda _s: None, + ) + assert aggregate["ok"] is True + assert "stragglers" not in aggregate + + +def test_scoped_rollout_dry_run_does_not_assert_coverage(): + # A dry run proves nothing landed; coverage must NOT be asserted or + # every plan would fail. + def fake_redeploy(_cp_url, _token, body): + return 200, { + "ok": True, + "results": [{"slug": s, "ssm_status": "DryRun"} for s in body["only_slugs"]], + } + + aggregate = prod.execute_scoped_rollout( + { + "cp_url": "https://api.moleculesai.app", + "body": { + "target_tag": "staging-new", + "batch_size": 5, + "dry_run": True, + "confirm": True, + }, + }, + token="secret", + list_slugs=lambda _u, _t, _b: ["a", "b"], + redeploy=fake_redeploy, + sleep=lambda _s: None, + ) + assert aggregate["ok"] is True diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml index 6c21a1e5a..4c716d71d 100644 --- a/.gitea/workflows/publish-workspace-server-image.yml +++ b/.gitea/workflows/publish-workspace-server-image.yml @@ -327,13 +327,27 @@ jobs: echo "" echo "### Per-tenant result" echo "" - echo "| Slug | Phase | SSM Status | Exit | Healthz | Error present |" - echo "|------|-------|------------|------|---------|---------------|" - jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \((.error // "") != "") |"' "$HTTP_RESPONSE" || true + echo "| Slug | Phase | SSM Status | Exit | Healthz | On target | Error present |" + echo "|------|-------|------------|------|---------|-----------|---------------|" + jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.verified_on_target) | \((.error // "") != "") |"' "$HTTP_RESPONSE" || true + # internal#724: stragglers are tenants enumerated but not proven + # on the target build. Surface them loudly — a non-empty list + # means the rollout did NOT fully land. + STRAGGLERS="$(jq -r '(.stragglers // []) | join(", ")' "$HTTP_RESPONSE")" + if [ -n "$STRAGGLERS" ]; then + echo "" + echo "### ⚠ Stragglers (NOT on target tag \`$TARGET_TAG\`)" + echo "" + echo "\`$STRAGGLERS\`" + fi } >> "$GITHUB_STEP_SUMMARY" OK="$(jq -r '.ok' "$HTTP_RESPONSE")" if [ "$OK" != "true" ]; then + STRAGGLERS="$(jq -r '(.stragglers // []) | join(", ")' "$HTTP_RESPONSE")" + if [ -n "$STRAGGLERS" ]; then + echo "::error::incomplete rollout — tenants not on target tag $TARGET_TAG: $STRAGGLERS" + fi echo "::error::redeploy-fleet reported ok=false; production rollout halted." exit 1 fi -- 2.52.0