diff --git a/.gitea/scripts/prod-auto-deploy.py b/.gitea/scripts/prod-auto-deploy.py index 31f905ddf..06b6c8f8c 100644 --- a/.gitea/scripts/prod-auto-deploy.py +++ b/.gitea/scripts/prod-auto-deploy.py @@ -364,6 +364,71 @@ def _api_json_optional(url: str, token: str) -> tuple[int, dict | None]: return exc.code, None +def current_branch_head(env: dict[str, str]) -> str | None: + """Return the SHA at the tip of the deploy branch (main) per Gitea, or None. + + Used to detect a *superseded* deploy job (see `superseded_by`). Fail-safe: + any read error / missing token returns None so the caller treats the job as + NOT superseded and the strict /buildinfo verify still runs. We never let an + unreadable head silently green a deploy. + """ + + token = env.get("GITEA_TOKEN", "").strip() + if not token: + return None + host = env.get("GITEA_HOST", "git.moleculesai.app") + repo = env.get("GITHUB_REPOSITORY", "molecule-ai/molecule-core") + # Deploy lane is on: push:main; the branch is always main here, but read it + # from the ref name when present so a future branch rename doesn't break us. + branch = env.get("GITHUB_REF_NAME", "").strip() or "main" + url = f"https://{host}/api/v1/repos/{repo}/branches/{quote(branch, safe='')}" + status, body = _api_json_optional(url, token) + if status != 200 or not isinstance(body, dict): + return None + commit = body.get("commit") + if isinstance(commit, dict): + head = commit.get("id") or commit.get("sha") + if isinstance(head, str) and head.strip(): + return head.strip() + return None + + +def superseded_by(env: dict[str, str]) -> str | None: + """Return the newer head SHA if THIS deploy job has been superseded, else None. + + This workflow runs with no `concurrency:` (intentional — Gitea 1.22.6 cancels + queued runs, which is unacceptable for a prod deploy). When two main pushes + land close together, BOTH deploy-production jobs run. 
The newer push rolls the + fleet forward first; the OLDER job's strict /buildinfo verify then sees tenants + on the NEWER SHA and false-reds with "$slug is stale" — even though the fleet + is AHEAD, not behind. Git SHAs aren't ordered, so the verify can't tell ahead + from behind on its own (and /buildinfo exposes only git_sha, no build time). + + Resolve it at the source of truth for ordering — the branch ref: if main's + current head is a DIFFERENT SHA than the one this job is deploying, a newer + commit has landed and this job is superseded; the newest job's verify is the + authoritative one. We return that head SHA so the caller can log it and exit + success early, skipping the strict-equality verify for this stale job. + + Fail-safe: returns None (NOT superseded) when the head can't be read or equals + our SHA, so a genuinely-behind tenant under the LATEST deploy job still fails + the strict verify loudly. This never suppresses a real-stale signal — it only + excuses a job that is no longer the latest from asserting exact equality. + """ + + sha = env.get("GITHUB_SHA", "").strip() + if not sha: + return None + head = current_branch_head(env) + if not head: + return None + # SHA lengths can differ (short vs full); compare on the shorter prefix. + n = min(len(head), len(sha)) + if head[:n].lower() == sha[:n].lower(): + return None + return head + + def live_disable_flag(env: dict[str, str]) -> str: """Return a live disable value from Gitea variables when readable. 
@@ -442,6 +507,14 @@ def main() -> int: sub.add_parser("plan", help="print production deploy plan as JSON") sub.add_parser("assert-enabled", help="fail if production deploy is currently disabled") sub.add_parser("wait-ci", help="block until required CI context is green") + sub.add_parser( + "check-superseded", + help=( + "exit 0 if a newer commit has landed on the deploy branch (this job " + "is superseded; prints the newer head SHA), exit 10 if this job is " + "still the latest" + ), + ) rollout_parser = sub.add_parser("rollout", help="execute canary-first scoped production rollout") rollout_parser.add_argument("--plan", required=True, help="path to prod-auto-deploy plan JSON") rollout_parser.add_argument("--response", required=True, help="path to write aggregate response JSON") @@ -457,6 +530,16 @@ def main() -> int: if args.command == "wait-ci": wait_for_ci_context(dict(os.environ)) return 0 + if args.command == "check-superseded": + newer = superseded_by(dict(os.environ)) + if newer: + print(newer) + return 0 + # Exit 10 (not 0, not 1): "this job is still the latest". The + # workflow treats only exit 0 as superseded; 10 means proceed to + # the strict verify. A non-zero code here is informational, not a + # failure — the workflow step swallows it. + return 10 if args.command == "rollout": rollout_from_plan_file(args.plan, args.response, dict(os.environ)) return 0 diff --git a/.gitea/scripts/tests/test_prod_auto_deploy.py b/.gitea/scripts/tests/test_prod_auto_deploy.py index 9808ffb33..f584d6596 100644 --- a/.gitea/scripts/tests/test_prod_auto_deploy.py +++ b/.gitea/scripts/tests/test_prod_auto_deploy.py @@ -486,3 +486,88 @@ def test_scoped_rollout_dry_run_does_not_assert_coverage(): sleep=lambda _s: None, ) assert aggregate["ok"] is True + + +# --- Superseded-deploy guard (false-stale fix) ----------------------------- +# +# Scenario this fixes: no `concurrency:` on the prod-deploy workflow means two +# close main pushes run BOTH deploy-production jobs. 
# eb31bcf (Fix A) and 2863380 (Fix C) merge back-to-back; the 2863380 job rolls
# the fleet to staging-2863380 first; the OLDER eb31bcf job's strict verify then
# sees tenants on 2863380 and false-reds "stale" though the fleet is AHEAD.
# superseded_by detects that main's head is no longer eb31bcf and lets the older
# job succeed without weakening the behind-tenant signal for whichever job IS
# the latest.


def _pin_branch_head(monkeypatch, value):
    """Make `prod.current_branch_head` report `value` for any env."""
    monkeypatch.setattr(prod, "current_branch_head", lambda _env: value)


def test_superseded_by_returns_newer_head_when_main_moved_ahead(monkeypatch):
    # Older (eb31bcf) job while main already points at 2863380: the job is
    # superseded and the newer head is handed back to the caller.
    _pin_branch_head(monkeypatch, "2863380fullhash")
    result = prod.superseded_by({"GITHUB_SHA": "eb31bcffullhash"})
    assert result == "2863380fullhash"


def test_superseded_by_none_when_this_job_is_still_head(monkeypatch):
    # Latest (2863380) job: head equals our SHA, so it is NOT superseded and
    # the strict verify must still run to catch a genuinely-behind tenant.
    _pin_branch_head(monkeypatch, "2863380fullhash")
    env = {"GITHUB_SHA": "2863380fullhash"}
    assert prod.superseded_by(env) is None


def test_superseded_by_matches_on_short_vs_full_sha_prefix(monkeypatch):
    # GITHUB_SHA is the full id, but Gitea may report a different length or
    # case. A matching prefix must NOT count as superseded, otherwise the real
    # latest job would wrongly skip its verify.
    env = {"GITHUB_SHA": "2863380fullhash"}
    _pin_branch_head(monkeypatch, "2863380")
    assert prod.superseded_by(env) is None
    _pin_branch_head(monkeypatch, "2863380FULLHASH")
    assert prod.superseded_by(env) is None


def test_superseded_by_fail_safe_returns_none_when_head_unreadable(monkeypatch):
    # Fail-safe: an unreadable head (no token / API error) is never treated as
    # superseded, so the strict verify still runs and nothing silently greens.
    _pin_branch_head(monkeypatch, None)
    assert prod.superseded_by({"GITHUB_SHA": "eb31bcffullhash"}) is None


def test_superseded_by_none_without_github_sha(monkeypatch):
    # Without our own SHA there is nothing to compare the head against.
    _pin_branch_head(monkeypatch, "2863380fullhash")
    assert prod.superseded_by({}) is None


def test_current_branch_head_parses_gitea_branch_commit_id(monkeypatch):
    requested = []

    def stub_api(url, _token):
        requested.append(url)
        return 200, {"name": "main", "commit": {"id": "2863380fullhash"}}

    monkeypatch.setattr(prod, "_api_json_optional", stub_api)
    env = {"GITEA_TOKEN": "secret", "GITHUB_REPOSITORY": "molecule-ai/molecule-core"}
    assert prod.current_branch_head(env) == "2863380fullhash"
    # No GITHUB_REF_NAME in env, so the lookup targets the default branch.
    assert requested[-1].endswith("/repos/molecule-ai/molecule-core/branches/main")


def test_current_branch_head_uses_ref_name_branch(monkeypatch):
    requested = []

    def stub_api(url, _token):
        requested.append(url)
        return 200, {"commit": {"sha": "deadbeef"}}

    monkeypatch.setattr(prod, "_api_json_optional", stub_api)
    env = {"GITEA_TOKEN": "secret", "GITHUB_REF_NAME": "release"}
    # The payload carries the hash under "sha" rather than "id" here.
    assert prod.current_branch_head(env) == "deadbeef"
    assert requested[-1].endswith("/branches/release")


def test_current_branch_head_none_without_token():
    # No GITEA_TOKEN: the head is reported as unreadable.
    assert prod.current_branch_head({}) is None


def test_current_branch_head_none_on_non_200(monkeypatch):
    def failing_api(_url, _token):
        return 500, None

    monkeypatch.setattr(prod, "_api_json_optional", failing_api)
    assert prod.current_branch_head({"GITEA_TOKEN": "secret"}) is None
guard. This workflow has no `concurrency:` (header + # explains why: Gitea 1.22.6 cancels queued prod deploys). So two + # close main pushes run BOTH deploy-production jobs. The newer one + # rolls the fleet to its (newer) build first; this older job's strict + # equality check below would then see tenants on the NEWER SHA and + # false-red "$slug is stale" even though the fleet is AHEAD, not + # behind (git SHAs aren't ordered; /buildinfo exposes only git_sha). + # + # If main's current head is no longer THIS job's SHA, a newer commit + # has landed and this deploy is superseded — the newest job's verify + # is authoritative. Skip strict verify and succeed. exit 0 => newer + # head printed (superseded); exit 10 => still the latest, proceed to + # the strict verify so a genuinely-behind tenant still fails loudly. + set +e + NEWER_HEAD="$(python3 .gitea/scripts/prod-auto-deploy.py check-superseded)" + SUPERSEDED_EXIT=$? + set -e + if [ "$SUPERSEDED_EXIT" -eq 0 ] && [ -n "$NEWER_HEAD" ]; then + echo "::notice::Superseded deploy: main head is now ${NEWER_HEAD:0:7} (this job deployed ${GITHUB_SHA:0:7}). The fleet is at or ahead of this build; the newer deploy job's verify is authoritative. Skipping strict SHA verify." + { + echo "" + echo "### Buildinfo verification skipped — superseded deploy" + echo "" + echo "This deploy job's SHA \`${GITHUB_SHA:0:7}\` is no longer the head of \`main\` (now \`${NEWER_HEAD:0:7}\`)." + echo "A newer deploy job is rolling the fleet forward; its verify is authoritative." + } >> "$GITHUB_STEP_SUMMARY" + exit 0 + fi + mapfile -t SLUGS < <(jq -r '.results[]? | .slug' "$RESP") if [ ${#SLUGS[@]} -eq 0 ]; then echo "::error::No tenants returned from redeploy-fleet; refusing to mark production deploy verified."