From 16957b7c156bde7b62c5e5ce5c1082e34dedcb5b Mon Sep 17 00:00:00 2001 From: infra-sre Date: Sat, 16 May 2026 11:49:10 -0700 Subject: [PATCH] infra(ci): route publish/deploy ship jobs to dedicated `publish` lane (internal#462) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Urgent prod-deploy publish builds currently FIFO-compete with ordinary PR required-CI on the shared 20-runner pool. PR#1350's (CTO-reported canvas-message-loss fix) production image build sat ~25min behind the PR-CI backlog after merge, directly delaying a user-facing fix. internal#462 comment 32299 + the already-merged operator-config publish-lane scaffolding (config.publish.yaml + publish-lane-ensure.sh, internal#394/#399) define a reserved `publish`/`release` sub-pool (molecule-runner-publish-*, OUTSIDE the managed 1..20 range so it is never auto-drained / recycled / drift-flagged). This retargets the 7 post-merge ship jobs across 5 workflows from `runs-on: ubuntu-latest` to `runs-on: publish` so a merged fix's image build/push/deploy gets reserved capacity and starts immediately, while PR-CI keeps the general pool: - publish-workspace-server-image.yml: build-and-push, deploy-production - publish-canvas-image.yml: build-and-push - publish-runtime.yml: publish, cascade - redeploy-tenants-on-main.yml: redeploy - redeploy-tenants-on-staging.yml: redeploy publish-runtime-autobump.yml is intentionally NOT moved: it is pull_request-triggered (PR-CI by nature, a required status), not a post-merge ship job — the lane reserves capacity for the ship path, not for PR checks. HARD MERGE PRECONDITION: this MUST NOT merge until the publish-lane runners are registered and advertising the `publish` label. Targeting an unregistered label queues jobs indefinitely with zero eligible runners — the exact #599/#576 `docker`-label failure mode. Lane registration is a GO-gated live-fleet mutation (publish-lane-ensure.sh ALLOW_FLEET_MUTATION=1, requires explicit Hongming in-chat GO). Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitea/workflows/publish-canvas-image.yml | 18 +++++++++++------- .gitea/workflows/publish-runtime.yml | 9 +++++++-- .../publish-workspace-server-image.yml | 13 +++++++++++-- .gitea/workflows/redeploy-tenants-on-main.yml | 5 ++++- .../workflows/redeploy-tenants-on-staging.yml | 5 ++++- 5 files changed, 37 insertions(+), 13 deletions(-) diff --git a/.gitea/workflows/publish-canvas-image.yml b/.gitea/workflows/publish-canvas-image.yml index 9aedadd6..818a4cad 100644 --- a/.gitea/workflows/publish-canvas-image.yml +++ b/.gitea/workflows/publish-canvas-image.yml @@ -49,13 +49,17 @@ jobs: # bp-exempt: post-merge image publication side effect; CI / all-required gates source changes. build-and-push: name: Build & push canvas image - # REVERTED (infra/revert-docker-runner-label): `runs-on: ubuntu-latest` restored. - # The `docker` label is not registered on any act_runner. `runs-on: [ubuntu-latest, docker]` - # causes jobs to queue indefinitely with zero eligible runners — strictly worse than the - # pre-#599 coin-flip (50% success rate). Once the `docker` label is registered on - # ≥2 runners, re-apply the fix from #599 (infra/docker-runner-label). - # See issue #576 + infra-lead pulse ~00:30Z. - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). Ship + # path (on: push:main, canvas/**) — reserved capacity so a merged + # canvas fix's image build never FIFO-queues behind PR required-CI. + # The `publish` label resolves ONLY to the molecule-runner-publish-* + # sub-pool (config.publish.yaml). HARD DEPENDENCY: this MUST land + # AFTER the publish-lane runners are registered/advertising `publish` + # — the earlier #599 `docker` label attempt queued indefinitely with + # zero eligible runners precisely because the label was targeted + # before any runner advertised it (see #576). The lane is registered + # in this rollout (internal#462) so the precondition holds. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true diff --git a/.gitea/workflows/publish-runtime.yml b/.gitea/workflows/publish-runtime.yml index fe46e812..c96307ab 100644 --- a/.gitea/workflows/publish-runtime.yml +++ b/.gitea/workflows/publish-runtime.yml @@ -66,7 +66,10 @@ concurrency: jobs: publish: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). Ship + # path (on: push tag runtime-v*) — reserved capacity, never FIFO + # behind PR-CI. `publish` resolves only to molecule-runner-publish-*. + runs-on: publish outputs: version: ${{ steps.version.outputs.version }} wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }} @@ -166,7 +169,9 @@ jobs: cascade: needs: publish - runs-on: ubuntu-latest + # Publish/release lane (internal#462) — downstream of the runtime + # publish ship job; keep it on the reserved lane too. + runs-on: publish steps: - name: Wait for PyPI to propagate the new version env: diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml index 02a42962..3f70ca2b 100644 --- a/.gitea/workflows/publish-workspace-server-image.yml +++ b/.gitea/workflows/publish-workspace-server-image.yml @@ -54,7 +54,14 @@ env: jobs: build-and-push: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). This + # is a post-merge ship job (on: push:main) — it must NOT FIFO-compete + # with PR required-CI on the shared pool (PR#1350's prod image build + # was delayed ~25min this way). The `publish` label resolves ONLY to + # the reserved molecule-runner-publish-* sub-pool (config.publish.yaml, + # OUTSIDE the managed 1..20 range) so a merged fix's image build + # starts immediately while PR-CI keeps the general pool. + runs-on: publish steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -181,7 +188,9 @@ jobs: name: Production auto-deploy needs: build-and-push if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} - runs-on: ubuntu-latest + # Publish/release lane (internal#462) — production deploy of a merged + # fix; reserved capacity, never queued behind PR-CI. + runs-on: publish timeout-minutes: 75 env: CP_URL: ${{ vars.PROD_CP_URL || 'https://api.moleculesai.app' }} diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml index 259df556..f458501c 100644 --- a/.gitea/workflows/redeploy-tenants-on-main.yml +++ b/.gitea/workflows/redeploy-tenants-on-main.yml @@ -68,7 +68,10 @@ jobs: # bp-exempt: production redeploy is a side-effect workflow, not a merge gate. redeploy: if: ${{ github.event_name == 'workflow_dispatch' }} - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). + # Production tenant redeploy — a deploy action, reserved capacity so + # it never queues behind PR-CI. `publish` -> molecule-runner-publish-*. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true diff --git a/.gitea/workflows/redeploy-tenants-on-staging.yml b/.gitea/workflows/redeploy-tenants-on-staging.yml index 98f6b227..534a977e 100644 --- a/.gitea/workflows/redeploy-tenants-on-staging.yml +++ b/.gitea/workflows/redeploy-tenants-on-staging.yml @@ -75,7 +75,10 @@ env: jobs: # bp-exempt: post-merge staging redeploy side effect; CI / all-required gates source changes. redeploy: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). + # Post-merge staging redeploy — a deploy action, reserved capacity. + # `publish` -> molecule-runner-publish-* sub-pool. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true -- 2.45.2