forked from molecule-ai/molecule-core
Merge pull request #2648 from Molecule-AI/fix/sweep-stale-e2e-tighter-threshold
ci: tighten e2e cleanup race window 120m → ~45m worst case
This commit is contained in:
commit
98da627170
28
.github/workflows/canary-staging.yml
vendored
28
.github/workflows/canary-staging.yml
vendored
@ -231,10 +231,34 @@ jobs:
|
|||||||
and o.get('status') not in ('purged',)]
|
and o.get('status') not in ('purged',)]
|
||||||
print('\n'.join(candidates))
|
print('\n'.join(candidates))
|
||||||
" 2>/dev/null)
|
" 2>/dev/null)
|
||||||
|
# Per-slug DELETE with HTTP-code verification. The previous
|
||||||
|
# `... >/dev/null || true` swallowed every failure, so a 5xx
|
||||||
|
# or timeout from CP looked identical to "successfully cleaned
|
||||||
|
# up" and the tenant kept eating ~2 vCPU until the hourly
|
||||||
|
# stale sweep caught it (up to 2h later). Now we capture the
|
||||||
|
# response code and surface non-2xx as a workflow warning, so
|
||||||
|
# the run page shows which slug leaked. We still don't `exit 1`
|
||||||
|
# on cleanup failure — a single-canary cleanup miss shouldn't
|
||||||
|
# fail-flag the canary itself when the actual smoke check
|
||||||
|
# passed. The sweep-stale-e2e-orgs cron (now every 15 min,
|
||||||
|
# 30-min threshold) is the safety net for whatever slips past.
|
||||||
|
# See molecule-controlplane#420.
|
||||||
|
leaks=()
|
||||||
for slug in $orgs; do
|
for slug in $orgs; do
|
||||||
curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
code=$(curl -sS -o /tmp/canary-cleanup.out -w "%{http_code}" \
|
||||||
|
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
||||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d "{\"confirm\":\"$slug\"}" >/dev/null || true
|
-d "{\"confirm\":\"$slug\"}" \
|
||||||
|
|| echo "000")
|
||||||
|
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
||||||
|
echo "[teardown] deleted $slug (HTTP $code)"
|
||||||
|
else
|
||||||
|
echo "::warning::canary teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/canary-cleanup.out 2>/dev/null)"
|
||||||
|
leaks+=("$slug")
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
if [ ${#leaks[@]} -gt 0 ]; then
|
||||||
|
echo "::warning::canary teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
||||||
|
fi
|
||||||
exit 0
|
exit 0
|
||||||
|
|||||||
21
.github/workflows/sweep-stale-e2e-orgs.yml
vendored
21
.github/workflows/sweep-stale-e2e-orgs.yml
vendored
@ -25,16 +25,23 @@ name: Sweep stale e2e-* orgs (staging)
|
|||||||
|
|
||||||
on:
|
on:
|
||||||
schedule:
|
schedule:
|
||||||
# Every hour on the hour. E2E orgs are short-lived (~10-25 min wall
|
# Every 15 min. E2E orgs are short-lived (~8-25 min wall clock from
|
||||||
# clock from create to teardown). Anything older than the
|
# create to teardown — canary is ~8 min, full SaaS ~25 min). The
|
||||||
# MAX_AGE_MINUTES threshold below is presumed dead.
|
# previous hourly + 120-min stale threshold meant a leaked tenant
|
||||||
- cron: '0 * * * *'
|
# could keep an EC2 alive for up to 2 hours, eating ~2 vCPU per
|
||||||
|
# leak. Tightening the cadence + threshold reduces the worst-case
|
||||||
|
# leak window from 120 min to ~45 min (15-min sweep cadence + 30-min
|
||||||
|
# threshold) without risk of catching in-progress runs (the longest
|
||||||
|
# e2e run is the ~25-min full SaaS run, under the 30-min threshold).
|
||||||
|
# See molecule-controlplane#420 for the leak-class accounting that
|
||||||
|
# motivated this tightening.
|
||||||
|
- cron: '*/15 * * * *'
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
inputs:
|
inputs:
|
||||||
max_age_minutes:
|
max_age_minutes:
|
||||||
description: "Delete e2e-* orgs older than N minutes (default 120)"
|
description: "Delete e2e-* orgs older than N minutes (default 30)"
|
||||||
required: false
|
required: false
|
||||||
default: "120"
|
default: "30"
|
||||||
dry_run:
|
dry_run:
|
||||||
description: "Dry run only — list what would be deleted"
|
description: "Dry run only — list what would be deleted"
|
||||||
required: false
|
required: false
|
||||||
@ -58,7 +65,7 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
||||||
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||||
MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '120' }}
|
MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '30' }}
|
||||||
DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
|
DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
|
||||||
# Refuse to delete more than this many orgs in one tick. If the
|
# Refuse to delete more than this many orgs in one tick. If the
|
||||||
# CP DB is briefly empty (or the admin endpoint goes weird and
|
# CP DB is briefly empty (or the admin endpoint goes weird and
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user