ci(publish-workspace-server-image): auto-redeploy staging fleet on every main merge #2940

Merged
devops-engineer merged 1 commit from fix/auto-redeploy-staging-on-main into main 2026-06-15 13:57:37 +00:00
@@ -313,6 +313,149 @@ jobs:
fi
done
# Staging auto-deploy: every workspace-server image publish on main should
# roll out to the staging fleet so code fixes reach staging without a
# manual workflow_dispatch. Gitea 1.22.6 does not support workflow_run, so
# the redeploy lives in this workflow as a dependent job (guaranteed to run
# after the image is published). Closes the deploy-lag where merged fixes
# built a new image but never reached staging tenants
# (Researcher RCA #2929 comment 103252).
deploy-staging:
  name: Staging auto-deploy
  needs: build-and-push
  # Only pushes to main trigger the rollout; other events skip this job.
  if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
  # Side-effect deploy; image publish success is the durable artifact.
  continue-on-error: true
  # Publish/release lane (internal#462) — same reserved capacity as prod.
  runs-on: publish
  # NOTE: the 30s propagation wait, the redeploy call (curl capped at 1200s)
  # and the per-tenant verify loops all share this 25-minute budget.
  timeout-minutes: 25
  env:
    # Repository variable wins; the literal URL is the fallback.
    CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
    CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
  steps:
    - name: Wait for ECR tag propagation
      # ECR replication can lag ~15-30s after the push; mirrors the GHCR
      # propagation sleep in redeploy-tenants-on-staging.yml.
      run: sleep 30

    # POSTs a redeploy-fleet request to the staging control plane, renders the
    # per-tenant results into the step summary, and saves the response body to
    # $RUNNER_TEMP/redeploy-response.json for the verify step below.
    - name: Call staging-CP redeploy-fleet
      run: |
        set -euo pipefail
        if [ -z "${CP_STAGING_ADMIN_API_TOKEN:-}" ]; then
          echo "::error::staging redeploy cannot run — CP_STAGING_ADMIN_API_TOKEN secret missing"
          echo "::error::set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
          exit 1
        fi
        # Build the request body with jq so every value is JSON-encoded
        # correctly (numbers/booleans via --argjson, strings via --arg).
        BODY=$(jq -nc \
          --arg tag "staging-latest" \
          --arg canary "" \
          --argjson soak 60 \
          --argjson batch 3 \
          --argjson dry false \
          '{
            target_tag: $tag,
            canary_slug: $canary,
            soak_seconds: $soak,
            batch_size: $batch,
            dry_run: $dry,
            confirm: true
          }')
        echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
        echo "  body: $BODY"
        HTTP_RESPONSE=$(mktemp)
        HTTP_CODE_FILE=$(mktemp)
        # Remove the scratch files on every exit path (success or failure).
        trap 'rm -f "$HTTP_RESPONSE" "$HTTP_CODE_FILE"' EXIT
        # Route -w into its own tempfile so curl's exit code (e.g. 56 on
        # connection-reset) cannot pollute the captured stdout. Same fix
        # shape as redeploy-tenants-on-staging.yml.
        # errexit is suspended around curl so a transport failure falls
        # through to the HTTP-code check below instead of aborting here.
        set +e
        curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
          -m 1200 \
          -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \
          -H "Content-Type: application/json" \
          -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
          -d "$BODY" >"$HTTP_CODE_FILE"
        set -e
        # An unreadable or empty code file is reported as HTTP 000.
        HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
        HTTP_CODE="${HTTP_CODE:-000}"
        echo "HTTP $HTTP_CODE"
        # Pretty-print when the body is JSON; otherwise dump it raw.
        jq . "$HTTP_RESPONSE" || cat "$HTTP_RESPONSE"
        {
          echo "## Staging tenant redeploy fleet"
          echo ""
          echo "**Target tag:** \`staging-latest\`"
          echo "**HTTP:** $HTTP_CODE"
          echo ""
          echo "### Per-tenant result"
          echo ""
          echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
          echo '|------|-------|------------|------|---------|-------|'
          # Best-effort table: a non-JSON body must not fail the summary.
          jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
        } >> "$GITHUB_STEP_SUMMARY"
        # A non-JSON body (e.g. a proxy error page) makes jq exit non-zero;
        # treat that as ok=false so the diagnostic below is still printed
        # instead of errexit aborting on the assignment.
        OK=$(jq -r '.ok // "false"' "$HTTP_RESPONSE" 2>/dev/null || echo "false")
        if [ "$HTTP_CODE" != "200" ] || [ "$OK" != "true" ]; then
          echo "::error::redeploy-fleet reported failure (HTTP $HTTP_CODE ok=$OK)"
          exit 1
        fi
        # Hand the response to the verify step (the temp copy is removed by
        # the EXIT trap).
        cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"

    # Polls https://<slug>.<TENANT_DOMAIN>/buildinfo for every tenant the
    # redeploy reported as healthz_ok, until its git_sha equals the commit
    # that triggered this run or the per-tenant settle budget runs out.
    - name: Verify each staging tenant /buildinfo matches published SHA
      env:
        EXPECTED_SHA: ${{ github.sha }}
        TENANT_DOMAIN: 'staging.moleculesai.app'
      run: |
        set -euo pipefail
        RESP="$RUNNER_TEMP/redeploy-response.json"
        if [ ! -s "$RESP" ]; then
          echo "::error::redeploy-response.json missing or empty"
          exit 1
        fi
        # Only tenants that passed the post-deploy health check are verified.
        mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
        if [ ${#SLUGS[@]} -eq 0 ]; then
          echo "::warning::No staging tenants reported healthz_ok — nothing to verify"
          exit 0
        fi
        echo "Verifying ${#SLUGS[@]} staging tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."
        # Per-tenant polling budget and interval, in seconds.
        SETTLE_BUDGET_SECONDS="240"
        SETTLE_INTERVAL_SECONDS="20"
        STALE_COUNT=0
        UNREACHABLE_COUNT=0
        for slug in "${SLUGS[@]}"; do
          url="https://${slug}.${TENANT_DOMAIN}/buildinfo"
          deadline=$(( $(date +%s) + SETTLE_BUDGET_SECONDS ))
          actual=""
          # Last non-empty SHA seen; distinguishes "stale" from "unreachable".
          last_actual=""
          on_target=false
          while :; do
            # A failed fetch or unparsable body yields an empty SHA and
            # simply triggers another poll.
            body="$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$url" || true)"
            actual="$(jq -r '.git_sha // ""' 2>/dev/null <<<"$body" || echo "")"
            [ -n "$actual" ] && last_actual="$actual"
            if [ "$actual" = "$EXPECTED_SHA" ]; then
              on_target=true
              break
            fi
            now=$(date +%s)
            if [ "$now" -ge "$deadline" ]; then
              break
            fi
            remaining=$(( deadline - now ))
            echo "$slug: waiting for target SHA (have '${actual:0:7}', want ${EXPECTED_SHA:0:7}; ${remaining}s left)"
            sleep "$SETTLE_INTERVAL_SECONDS"
          done
          if [ "$on_target" = true ]; then
            echo "$slug: ${actual:0:7}"
          elif [ -z "$last_actual" ]; then
            # Never returned a parsable git_sha within the budget.
            echo "::error::$slug did not return /buildinfo after deploy (waited ${SETTLE_BUDGET_SECONDS}s)."
            UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
          else
            # Reachable, but still serving a different build.
            echo "::error::$slug is stale: actual=${last_actual:0:7}, expected=${EXPECTED_SHA:0:7} (waited ${SETTLE_BUDGET_SECONDS}s)"
            STALE_COUNT=$((STALE_COUNT + 1))
          fi
        done
        if [ "$STALE_COUNT" -gt 0 ] || [ "$UNREACHABLE_COUNT" -gt 0 ]; then
          echo "::error::Staging verify failed — stale=$STALE_COUNT unreachable=$UNREACHABLE_COUNT"
          exit 1
        fi
# bp-exempt: production deploy side-effect; merge is gated by CI / all-required and this job waits for push CI before acting.
deploy-production:
name: Production auto-deploy