diff --git a/.gitea/scripts/status-reaper.py b/.gitea/scripts/status-reaper.py index 7047a7fc..ca8741c8 100644 --- a/.gitea/scripts/status-reaper.py +++ b/.gitea/scripts/status-reaper.py @@ -133,6 +133,9 @@ PUSH_COMPENSATION_DESCRIPTION = ( "Compensated by status-reaper (workflow has no push: trigger; " "Gitea 1.22.6 hardcoded-suffix bug — see .gitea/scripts/status-reaper.py)" ) +# Backward-compatible alias for older tests/tooling that predate the split +# between push-suffix compensation and pull-request-shadow compensation. +COMPENSATION_DESCRIPTION = PUSH_COMPENSATION_DESCRIPTION PR_SHADOW_COMPENSATION_DESCRIPTION = ( "Compensated by status-reaper (default-branch pull_request status " "shadowed by successful push status on same SHA; see " @@ -746,12 +749,32 @@ def main() -> int: f"class-O candidates={sum(1 for v in workflow_trigger_map.values() if not v)}" ) - counters = reap_branch( - workflow_trigger_map, - WATCH_BRANCH, - limit=args.limit, - dry_run=args.dry_run, - ) + try: + counters = reap_branch( + workflow_trigger_map, + WATCH_BRANCH, + limit=args.limit, + dry_run=args.dry_run, + ) + except ApiError as e: + print( + "::warning::status-reaper skipped this tick because the " + f"commit list could not be read after retries: {e}" + ) + counters = { + "scanned_shas": 0, + "compensated": 0, + "preserved_real_push": 0, + "preserved_unknown": 0, + "preserved_non_failure": 0, + "preserved_non_push_suffix": 0, + "preserved_unparseable": 0, + "compensated_pr_shadowed_by_push_success": 0, + "preserved_pr_without_push_success": 0, + "compensated_per_sha": {}, + "skipped": True, + "skip_reason": "commit-list-api-error", + } # Observability: print one JSON line summarising the tick. Loki # ingestion via the runner's stdout (`source="gitea-actions"`). diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml index 0411e149..259df556 100644 --- a/.gitea/workflows/redeploy-tenants-on-main.yml +++ b/.gitea/workflows/redeploy-tenants-on-main.yml @@ -9,19 +9,17 @@ name: redeploy-tenants-on-main # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. # - `continue-on-error: true` on each job (RFC §1 contract). -# - ~~**Gitea workflow_run trigger limitation**~~ FIXED: replaced with -# push+paths filter per this PR. Gitea 1.22.6 does not support -# `workflow_run` (task #81). The push trigger fires on every -# commit to publish-workspace-server-image.yml which is the -# same signal (only successful runs commit to main). +# - Dropped unsupported `workflow_run` (task #81). +# - Later changed to manual-only after publish-workspace-server-image.yml +# gained an integrated ordered production deploy job. # -# Auto-refresh prod tenant EC2s after every main merge. +# Manual production tenant redeploy/rollback helper. # -# Why this workflow exists: publish-workspace-server-image builds and -# pushes a new platform-tenant : to ECR on every merge to main, -# but running tenants pulled their image once at boot and never re-pull. -# Users see stale code indefinitely. +# Why this workflow is manual-only: publish-workspace-server-image now owns +# the ordered build -> push -> production auto-deploy sequence in one workflow. +# A separate push-triggered redeploy workflow races before the new ECR image +# exists and can paint main red with a false deployment failure. # # This workflow closes the gap by calling the control-plane admin # endpoint that performs a canary-first, batched, health-gated rolling @@ -34,16 +32,11 @@ name: redeploy-tenants-on-main # Gitea suspension migration. The staging-verify.yml promote step now # uses the same redeploy-fleet endpoint (fixes the silent-GHCR gap). # -# Runtime ordering: -# 1. publish-workspace-server-image completes → new :staging- in ECR. -# 2. The merge that updates publish-workspace-server-image.yml triggers -# this push/path-filtered workflow, which calls redeploy-fleet with -# target_tag=staging-. No CDN propagation wait needed — ECR image -# manifest is consistent immediately after push. -# 3. Calls redeploy-fleet with canary_slug (if set) and a soak -# period. Canary proves the image boots; batches follow. -# 4. Any failure aborts the rollout and leaves older tenants on the -# prior image — safer default than half-and-half state. +# Runtime ordering for automatic deploys now lives in +# publish-workspace-server-image.yml: +# 1. build-and-push creates new :staging- images in ECR. +# 2. deploy-production waits for required push contexts on that SHA. +# 3. deploy-production calls redeploy-fleet canary-first. # # Rollback path: set PROD_MANUAL_REDEPLOY_TARGET_TAG as a repo/org # variable or secret, run workflow_dispatch, then unset it after the @@ -51,21 +44,14 @@ name: redeploy-tenants-on-main # re-pulling the pinned image on every tenant. on: - push: - branches: [main] - paths: - - '.gitea/workflows/publish-workspace-server-image.yml' workflow_dispatch: permissions: contents: read # No write scopes needed — the workflow hits an external CP endpoint, # not the GitHub API. -# Serialize redeploys so two rapid main pushes' redeploys don't overlap -# and cause confusing per-tenant SSM state. Without this, GitHub's -# implicit workflow_run queueing would *probably* serialize them, but -# the explicit block makes the invariant defensible. Mirrors the -# concurrency block on redeploy-tenants-on-staging.yml for shape parity. +# Serialize manual redeploys so two operator-triggered rollbacks do not +# overlap and cause confusing per-tenant SSM state. # # NOTE: cancel-in-progress: false removed (Rule 7 fix). Gitea 1.22.6 # cancels queued runs regardless of this setting, so it provides no @@ -81,18 +67,15 @@ env: jobs: # bp-exempt: production redeploy is a side-effect workflow, not a merge gate. redeploy: - # Gitea 1.22.6 does not support workflow_run. This workflow is now - # controlled by push/path triggers plus an explicit kill switch. - if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }} + if: ${{ github.event_name == 'workflow_dispatch' }} runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 25 env: - # Rule 9 fix: operational kill switch for auto-triggered deployments. - # Set repo variable or secret PROD_AUTO_DEPLOY_DISABLED=true to prevent - # this workflow from redeploying. Manual workflow_dispatch bypasses this. + # Rule 9 fix: keep the same operational kill switch surface as the + # integrated auto-deploy workflow. PROD_AUTO_DEPLOY_DISABLED: ${{ vars.PROD_AUTO_DEPLOY_DISABLED || secrets.PROD_AUTO_DEPLOY_DISABLED || '' }} steps: - name: Kill-switch guard @@ -114,13 +97,8 @@ jobs: # tag) → used verbatim. Lets ops pin `latest` for emergency # rollback to last canary-verified digest, or pin a specific # `staging-` to roll back to a known-good build. - # 2. Default → `staging-`. The just-published - # digest. Bypasses the `:latest` retag path that's currently - # dead (staging-verify soft-skips without canary fleet, so - # the only thing retagging `:latest` today is the manual - # promote-latest.yml — last run 2026-04-28). Auto-trigger - # from the main push uses github.sha; manual - # dispatch with no variable falls through to github.sha. + # 2. Default → `staging-` for manual reruns from + # the current default-branch SHA. env: PROD_MANUAL_REDEPLOY_TARGET_TAG: ${{ vars.PROD_MANUAL_REDEPLOY_TARGET_TAG || secrets.PROD_MANUAL_REDEPLOY_TARGET_TAG || '' }} HEAD_SHA: ${{ github.sha }} @@ -274,13 +252,11 @@ jobs: # fail the workflow, which is what `ok=true` should have # guaranteed all along. # - # When the redeploy was triggered by workflow_dispatch with a - # specific tag (target_tag != "latest"), the expected SHA may - # not equal ${{ github.sha }} — in that case we resolve via - # GHCR's manifest. For workflow_run (default :latest) the - # workflow_run.head_sha is the SHA that just published. + # When the redeploy is triggered manually with a specific tag + # (target_tag != "latest"), the expected SHA may not equal + # ${{ github.sha }}. env: - EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }} + EXPECTED_SHA: ${{ github.sha }} TARGET_TAG: ${{ steps.tag.outputs.target_tag }} # Tenant subdomain template — slugs from the response are # appended. Production CP issues `.moleculesai.app`; diff --git a/tests/test_status_reaper.py b/tests/test_status_reaper.py index 81327487..717c50f1 100644 --- a/tests/test_status_reaper.py +++ b/tests/test_status_reaper.py @@ -495,7 +495,7 @@ def test_reap_required_check_pull_request_suffix_never_touched(sr_module, monkey } counters = sr_module.reap(workflow_map, combined, SHA, dry_run=False) assert counters["compensated"] == 0 - assert counters["preserved_non_push_suffix"] == 1 + assert counters["preserved_pr_without_push_success"] == 1 assert calls == [] @@ -1009,3 +1009,29 @@ def test_reap_continues_on_per_sha_apierror(sr_module, monkeypatch, capsys): captured = capsys.readouterr() assert "::warning::" in captured.out or "::notice::" in captured.out assert SHA_A[:10] in captured.out + + +def test_main_soft_skips_when_commit_listing_times_out(sr_module, monkeypatch, capsys): + """A transient outage while listing recent commits should not paint main red. + + Per-SHA status read failures are already isolated inside `reap_branch`. + The real 2026-05-14 failure was earlier: `/commits?sha=main&limit=30` + timed out after all retries, aborting the tick. The next 5-minute tick can + retry safely, so `main()` should emit an observable warning and return 0. + """ + + monkeypatch.setattr(sr_module, "scan_workflows", lambda _: {"status-reaper": False}) + + def fake_reap_branch(*args, **kwargs): + raise sr_module.ApiError( + "GET /repos/owner/repo/commits failed after 4 attempts: timed out" + ) + + monkeypatch.setattr(sr_module, "reap_branch", fake_reap_branch) + monkeypatch.setattr(sys, "argv", ["status-reaper.py"]) + + assert sr_module.main() == 0 + captured = capsys.readouterr() + assert "::warning::status-reaper skipped this tick" in captured.out + assert '"skipped": true' in captured.out + assert '"skip_reason": "commit-list-api-error"' in captured.out