Split production auto-deploy into scoped redeploy calls #1838

Merged
hongming merged 1 commits from fix/prod-auto-deploy-scoped-rollout into main 2026-05-25 07:56:46 +00:00
3 changed files with 369 additions and 20 deletions
+155
View File
@@ -25,6 +25,7 @@ DEFAULT_REQUIRED_CONTEXTS = [
"Secret scan / Scan diff for credential-shaped strings (push)",
]
TERMINAL_FAILURE_STATES = {"failure", "error", "cancelled", "canceled", "skipped"}
REDEPLOY_PATH = "/cp/admin/tenants/redeploy-fleet"
def truthy_flag(value: str | None) -> bool:
@@ -130,6 +131,154 @@ def required_contexts(env: dict[str, str]) -> list[str]:
return [line.strip() for line in raw.replace(",", "\n").splitlines() if line.strip()]
def chunks(items: list[str], size: int) -> list[list[str]]:
return [items[i : i + size] for i in range(0, len(items), size)]
class RolloutFailed(RuntimeError):
def __init__(self, message: str, response: dict):
super().__init__(message)
self.response = response
def slugs_from_redeploy_response(body: dict) -> list[str]:
slugs: list[str] = []
for row in body.get("results") or []:
slug = str(row.get("slug") or "").strip()
if slug:
slugs.append(slug)
return slugs
def scoped_redeploy_body(base: dict, slugs: list[str]) -> dict:
body = dict(base)
body.pop("canary_slug", None)
body["only_slugs"] = slugs
body["soak_seconds"] = 0
body["batch_size"] = max(1, len(slugs))
return body
def cp_api_json(method: str, url: str, token: str, body: dict | None = None) -> tuple[int, dict]:
data = None
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/json",
}
if body is not None:
data = json.dumps(body).encode("utf-8")
headers["Content-Type"] = "application/json"
req = urllib.request.Request(url, data=data, headers=headers, method=method)
try:
with urllib.request.urlopen(req, timeout=120) as resp:
return resp.status, json.loads(resp.read())
except urllib.error.HTTPError as exc:
raw = exc.read().decode("utf-8", errors="replace")
try:
parsed = json.loads(raw)
except json.JSONDecodeError:
parsed = {"error": raw[:500]}
return exc.code, parsed
def plan_rollout_slugs(cp_url: str, token: str, body: dict, redeploy=None) -> list[str]:
if redeploy is None:
redeploy = redeploy_scoped
dry_run_body = dict(body)
dry_run_body["dry_run"] = True
status, resp = redeploy(cp_url, token, dry_run_body)
if status != 200:
raise RuntimeError(f"dry-run redeploy-fleet returned HTTP {status}: {resp.get('error', '')}")
if resp.get("ok") is not True:
raise RuntimeError(f"dry-run redeploy-fleet reported ok={resp.get('ok')}: {resp.get('error', '')}")
slugs = slugs_from_redeploy_response(resp)
if not slugs:
raise RuntimeError("dry-run redeploy-fleet returned no rollout candidates")
return slugs
def redeploy_scoped(cp_url: str, token: str, body: dict) -> tuple[int, dict]:
return cp_api_json("POST", f"{cp_url}{REDEPLOY_PATH}", token, body)
def _raise_for_redeploy_result(status: int, body: dict, slugs: list[str]) -> None:
if status != 200 or body.get("ok") is not True:
raise RuntimeError(
"redeploy scoped call failed for "
f"{','.join(slugs)}: HTTP {status}, ok={body.get('ok')}"
)
def execute_scoped_rollout(
plan: dict,
token: str,
list_slugs=plan_rollout_slugs,
redeploy=redeploy_scoped,
sleep=time.sleep,
) -> dict:
cp_url = plan["cp_url"]
base_body = plan["body"]
all_slugs = list_slugs(cp_url, token, base_body)
batch_size = int(base_body.get("batch_size") or 1)
canary_slug = str(base_body.get("canary_slug") or "").strip()
dry_run = bool(base_body.get("dry_run"))
aggregate = {"ok": True, "results": []}
if canary_slug:
if canary_slug not in all_slugs:
raise RuntimeError(f"configured canary slug {canary_slug!r} is not a running tenant")
body = scoped_redeploy_body(base_body, [canary_slug])
print(f"POST {cp_url}{REDEPLOY_PATH} only_slugs={','.join(body['only_slugs'])}")
status, resp = redeploy(cp_url, token, body)
aggregate["results"].extend(resp.get("results") or [])
try:
_raise_for_redeploy_result(status, resp, [canary_slug])
except RuntimeError as exc:
aggregate["ok"] = False
aggregate["error"] = str(exc)
raise RolloutFailed(str(exc), aggregate) from exc
soak_seconds = int(base_body.get("soak_seconds") or 0)
if soak_seconds > 0 and not dry_run:
print(f"Canary passed; soaking locally for {soak_seconds}s")
sleep(soak_seconds)
remaining = [slug for slug in all_slugs if slug != canary_slug]
for group in chunks(remaining, batch_size):
body = scoped_redeploy_body(base_body, group)
print(f"POST {cp_url}{REDEPLOY_PATH} only_slugs={','.join(group)}")
status, resp = redeploy(cp_url, token, body)
aggregate["results"].extend(resp.get("results") or [])
try:
_raise_for_redeploy_result(status, resp, group)
except RuntimeError as exc:
aggregate["ok"] = False
aggregate["error"] = str(exc)
raise RolloutFailed(str(exc), aggregate) from exc
return aggregate
def rollout_from_plan_file(plan_path: str, response_path: str, env: dict[str, str]) -> None:
token = env.get("CP_ADMIN_API_TOKEN", "").strip()
if not token:
raise ValueError("CP_ADMIN_API_TOKEN is required for production auto-deploy")
with open(plan_path, "r", encoding="utf-8") as fh:
plan = json.load(fh)
if not plan.get("enabled"):
raise RuntimeError("production auto-deploy plan is disabled")
try:
response = execute_scoped_rollout(plan, token)
except RolloutFailed as exc:
response = exc.response
with open(response_path, "w", encoding="utf-8") as fh:
json.dump(response, fh, sort_keys=True)
fh.write("\n")
raise
with open(response_path, "w", encoding="utf-8") as fh:
json.dump(response, fh, sort_keys=True)
fh.write("\n")
def _api_json(url: str, token: str) -> dict:
req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
try:
@@ -231,6 +380,9 @@ def main() -> int:
sub.add_parser("plan", help="print production deploy plan as JSON")
sub.add_parser("assert-enabled", help="fail if production deploy is currently disabled")
sub.add_parser("wait-ci", help="block until required CI context is green")
rollout_parser = sub.add_parser("rollout", help="execute canary-first scoped production rollout")
rollout_parser.add_argument("--plan", required=True, help="path to prod-auto-deploy plan JSON")
rollout_parser.add_argument("--response", required=True, help="path to write aggregate response JSON")
args = parser.parse_args()
try:
@@ -243,6 +395,9 @@ def main() -> int:
if args.command == "wait-ci":
wait_for_ci_context(dict(os.environ))
return 0
if args.command == "rollout":
rollout_from_plan_file(args.plan, args.response, dict(os.environ))
return 0
except Exception as exc: # noqa: BLE001 - CLI should render operator-friendly errors.
print(f"::error::{exc}", file=sys.stderr)
return 1
@@ -153,3 +153,205 @@ def test_default_required_contexts_delegate_path_gating_to_all_required():
"CI / all-required (push)",
"Secret scan / Scan diff for credential-shaped strings (push)",
]
def test_slugs_from_redeploy_response_uses_controlplane_plan_rows():
body = {
"results": [
{"slug": "hongming", "phase": "canary", "ssm_status": "DryRun"},
{"slug": "tenant-a", "phase": "batch-1", "ssm_status": "DryRun"},
{"slug": "", "phase": "batch-1", "ssm_status": "DryRun"},
{"phase": "batch-1", "ssm_status": "DryRun"},
]
}
assert prod.slugs_from_redeploy_response(body) == ["hongming", "tenant-a"]
def test_plan_rollout_slugs_asks_controlplane_for_dry_run_plan():
calls = []
def fake_redeploy(_cp_url, _token, body):
calls.append(body)
return 200, {
"ok": True,
"results": [
{"slug": "hongming", "phase": "canary", "ssm_status": "DryRun"},
{"slug": "tenant-a", "phase": "batch-1", "ssm_status": "DryRun"},
],
}
slugs = prod.plan_rollout_slugs(
"https://api.moleculesai.app",
"secret",
{
"target_tag": "staging-abcdef1",
"canary_slug": "hongming",
"soak_seconds": 60,
"batch_size": 3,
"dry_run": False,
"confirm": True,
},
redeploy=fake_redeploy,
)
assert slugs == ["hongming", "tenant-a"]
assert calls == [
{
"target_tag": "staging-abcdef1",
"canary_slug": "hongming",
"soak_seconds": 60,
"batch_size": 3,
"dry_run": True,
"confirm": True,
}
]
def test_scoped_redeploy_body_removes_canary_and_local_soak():
base = {
"target_tag": "staging-abcdef1",
"canary_slug": "hongming",
"soak_seconds": 60,
"batch_size": 3,
"dry_run": False,
"confirm": True,
}
scoped = prod.scoped_redeploy_body(base, ["tenant-a", "tenant-b"])
assert scoped == {
"target_tag": "staging-abcdef1",
"soak_seconds": 0,
"batch_size": 2,
"dry_run": False,
"confirm": True,
"only_slugs": ["tenant-a", "tenant-b"],
}
def test_plan_scoped_rollout_preserves_canary_then_batches():
calls, sleeps = [], []
def fake_list(_cp_url, _token, _body):
return ["tenant-a", "hongming", "tenant-b", "tenant-c"]
def fake_redeploy(_cp_url, _token, body):
calls.append(body)
return 200, {
"ok": True,
"results": [{"slug": slug, "healthz_ok": True} for slug in body["only_slugs"]],
}
aggregate = prod.execute_scoped_rollout(
{
"cp_url": "https://api.moleculesai.app",
"body": {
"target_tag": "staging-abcdef1",
"canary_slug": "hongming",
"soak_seconds": 60,
"batch_size": 2,
"dry_run": False,
"confirm": True,
},
},
token="secret",
list_slugs=fake_list,
redeploy=fake_redeploy,
sleep=sleeps.append,
)
assert [call["only_slugs"] for call in calls] == [
["hongming"],
["tenant-a", "tenant-b"],
["tenant-c"],
]
assert sleeps == [60]
assert aggregate["ok"] is True
assert [result["slug"] for result in aggregate["results"]] == [
"hongming",
"tenant-a",
"tenant-b",
"tenant-c",
]
def test_scoped_rollout_halts_after_failed_canary():
calls = []
def fake_redeploy(_cp_url, _token, body):
calls.append(body)
return 200, {"ok": False, "results": [{"slug": body["only_slugs"][0], "error": "bad"}]}
try:
prod.execute_scoped_rollout(
{
"cp_url": "https://api.moleculesai.app",
"body": {
"target_tag": "staging-abcdef1",
"canary_slug": "hongming",
"soak_seconds": 60,
"batch_size": 2,
"dry_run": False,
"confirm": True,
},
},
token="secret",
list_slugs=lambda _cp_url, _token, _body: ["hongming", "tenant-a"],
redeploy=fake_redeploy,
sleep=lambda _seconds: None,
)
except prod.RolloutFailed as exc:
assert "redeploy scoped call failed" in str(exc)
assert exc.response["ok"] is False
assert exc.response["results"] == [{"slug": "hongming", "error": "bad"}]
else:
raise AssertionError("expected failed canary to halt rollout")
assert [call["only_slugs"] for call in calls] == [["hongming"]]
def test_rollout_from_plan_file_writes_partial_response_on_failure(tmp_path):
plan_path = tmp_path / "plan.json"
response_path = tmp_path / "response.json"
plan_path.write_text(
"""
{
"enabled": true,
"cp_url": "https://api.moleculesai.app",
"body": {"target_tag": "staging-abcdef1", "confirm": true}
}
""",
encoding="utf-8",
)
original = prod.execute_scoped_rollout
def fake_execute(_plan, _token):
raise prod.RolloutFailed(
"redeploy scoped call failed for hongming: HTTP 500, ok=false",
{
"ok": False,
"error": "redeploy scoped call failed for hongming: HTTP 500, ok=false",
"results": [{"slug": "hongming", "error": "bad"}],
},
)
prod.execute_scoped_rollout = fake_execute
try:
try:
prod.rollout_from_plan_file(
str(plan_path),
str(response_path),
{"CP_ADMIN_API_TOKEN": "secret"},
)
except prod.RolloutFailed:
pass
else:
raise AssertionError("expected rollout failure")
finally:
prod.execute_scoped_rollout = original
assert response_path.read_text(encoding="utf-8").strip()
assert '"ok": false' in response_path.read_text(encoding="utf-8")
assert '"slug": "hongming"' in response_path.read_text(encoding="utf-8")
@@ -303,26 +303,19 @@ jobs:
python3 .gitea/scripts/prod-auto-deploy.py assert-enabled
PLAN="$RUNNER_TEMP/prod-auto-deploy-plan.json"
TARGET_TAG="$(jq -r '.target_tag' "$PLAN")"
BODY="$(jq -c '.body' "$PLAN")"
echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
echo " target_tag: $TARGET_TAG"
echo " body: $BODY"
HTTP_RESPONSE="$RUNNER_TEMP/prod-redeploy-response.json"
HTTP_CODE_FILE="$RUNNER_TEMP/prod-redeploy-http-code.txt"
set +e
curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
-m 1200 \
-H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
-H "Content-Type: application/json" \
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
-d "$BODY" > "$HTTP_CODE_FILE"
python3 .gitea/scripts/prod-auto-deploy.py rollout \
--plan "$PLAN" \
--response "$HTTP_RESPONSE"
ROLLOUT_EXIT=$?
set -e
HTTP_CODE="$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")"
[ -z "$HTTP_CODE" ] && HTTP_CODE="000"
echo "HTTP $HTTP_CODE"
if [ ! -s "$HTTP_RESPONSE" ]; then
jq -nc --arg error "rollout command exited $ROLLOUT_EXIT before writing a response" \
'{ok:false, results:[], error:$error}' > "$HTTP_RESPONSE"
fi
jq '{ok, result_count: (.results // [] | length)}' "$HTTP_RESPONSE" || true
{
@@ -330,7 +323,6 @@ jobs:
echo ""
echo "**Commit:** \`${GITHUB_SHA:0:7}\`"
echo "**Target tag:** \`$TARGET_TAG\`"
echo "**HTTP:** $HTTP_CODE"
echo ""
echo "### Per-tenant result"
echo ""
@@ -339,15 +331,15 @@ jobs:
jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \((.error // "") != "") |"' "$HTTP_RESPONSE" || true
} >> "$GITHUB_STEP_SUMMARY"
if [ "$HTTP_CODE" != "200" ]; then
echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
exit 1
fi
OK="$(jq -r '.ok' "$HTTP_RESPONSE")"
if [ "$OK" != "true" ]; then
echo "::error::redeploy-fleet reported ok=false; production rollout halted."
exit 1
fi
if [ "$ROLLOUT_EXIT" -ne 0 ]; then
echo "::error::redeploy-fleet rollout failed with exit code $ROLLOUT_EXIT."
exit "$ROLLOUT_EXIT"
fi
- name: Verify reachable tenants report this SHA
if: ${{ steps.plan.outputs.enabled == 'true' }}