molecule-core/.github/workflows/sweep-stale-e2e-orgs.yml

name: Sweep stale e2e-* orgs (staging)

# Janitor for staging tenants left behind when E2E cleanup didn't run:
# CI cancellations, runner crashes, transient AWS errors mid-cascade,
# bash trap missed (signal 9), etc. Without this loop, every failed
# teardown leaks an EC2 + DNS + DB row until manual ops cleanup —
# 2026-04-23 staging hit the 64 vCPU AWS quota from ~27 such orphans.
#
# Why not rely on per-test-run teardown:
#   - Per-run teardown is best-effort by definition. Any process death
#     after the test starts but before the trap fires leaves debris.
#   - GH Actions cancellation kills the runner without grace period.
#     The workflow's `if: always()` step usually catches this, but it
#     too can fail (CP transient 5xx, runner network issue at the
#     wrong moment).
#   - Even when teardown runs, the CP cascade is best-effort in places
#     (cascadeTerminateWorkspaces logs+continues; DNS deletion same).
#   - This sweep is the catch-all that converges staging back to clean
#     regardless of which specific path leaked.
#
# The PROPER fix is making CP cleanup transactional + verify-after-
# terminate (filed separately as cleanup-correctness work). This
# workflow is the safety net that catches everything else AND any
# future leak source we haven't yet identified.

on:
  schedule:
    # Every 15 min. E2E orgs are short-lived (~8-25 min wall clock from
    # create to teardown — canary is ~8 min, full SaaS ~25 min). The
    # previous hourly + 120-min stale threshold meant a leaked tenant
    # could keep an EC2 alive for up to 2 hours, eating ~2 vCPU per
    # leak. Tightening the cadence + threshold reduces the worst-case
    # leak window from 120 min to ~45 min (15-min sweep cadence + 30-min
    # threshold) without risk of catching in-progress runs (the longest
    # e2e run is the 25-min canary, well under the 30-min threshold).
    # See molecule-controlplane#420 for the leak-class accounting that
    # motivated this tightening.
    - cron: '*/15 * * * *'
  workflow_dispatch:
    inputs:
      max_age_minutes:
        description: "Delete e2e-* orgs older than N minutes (default 30)"
        required: false
        default: "30"
      dry_run:
        description: "Dry run only — list what would be deleted"
        required: false
        type: boolean
        default: false

# Don't let two sweeps fight. Cron + workflow_dispatch could overlap
# on a manual trigger; queue rather than parallel-delete.
concurrency:
  group: sweep-stale-e2e-orgs
  cancel-in-progress: false

permissions:
  contents: read

jobs:
  sweep:
    name: Sweep e2e orgs
    runs-on: ubuntu-latest
    timeout-minutes: 15
    env:
      MOLECULE_CP_URL: https://staging-api.moleculesai.app
      ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
      MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '30' }}
      DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
      # Refuse to delete more than this many orgs in one tick. If the
      # CP DB is briefly empty (or the admin endpoint goes weird and
      # returns no created_at), every e2e- org would look stale.
      # Bailing protects against runaway nukes.
      SAFETY_CAP: 50

    steps:
      - name: Verify admin token present
        run: |
          if [ -z "$ADMIN_TOKEN" ]; then
            echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set"
            exit 2
          fi
          echo "Admin token present ✓"

      - name: Identify stale e2e orgs
        id: identify
        run: |
          set -euo pipefail
          # Fetch into a file so the python step reads it via stdin —
          # cleaner than embedding $(curl ...) into a heredoc.
          curl -sS --fail-with-body --max-time 30 \
            "$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \
            -H "Authorization: Bearer $ADMIN_TOKEN" \
            > orgs.json

          # Filter:
          #   1. slug starts with one of the ephemeral test prefixes:
          #        - 'e2e-'    — covers e2e-canary-, e2e-canvas-*, etc.
          #        - 'rt-e2e-' — runtime-test harness fixtures (RFC #2251);
          #                      missing this prefix left two such tenants
          #                      orphaned 8h on staging (2026-05-03), then
          #                      hard-failed redeploy-tenants-on-staging
          #                      and broke the staging→main auto-promote
          #                      chain. Kept in sync with the EPHEMERAL_PREFIX_RE
          #                      regex in redeploy-tenants-on-staging.yml.
          #   2. created_at is older than MAX_AGE_MINUTES ago
          # Output one slug per line to a file the next step reads.
          python3 > stale_slugs.txt <<'PY'
          import json, os
          from datetime import datetime, timezone, timedelta
          # SSOT for this list lives in the controlplane Go code:
          # molecule-controlplane/internal/slugs/ephemeral.go
          # (var EphemeralPrefixes). The redeploy-fleet auto-rollout
          # also reads from there to SKIP these slugs — without that
          # filter, fleet redeploy SSM-failed in-flight E2E tenants
          # whose containers were still booting, breaking the test
          # that just spun them up (molecule-controlplane#493).
          # Update both files together.
          EPHEMERAL_PREFIXES = ("e2e-", "rt-e2e-")
          with open("orgs.json") as f:
              data = json.load(f)
          max_age = int(os.environ["MAX_AGE_MINUTES"])
          cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age)
          for o in data.get("orgs", []):
              slug = o.get("slug", "")
              if not slug.startswith(EPHEMERAL_PREFIXES):
                  continue
              created = o.get("created_at")
              if not created:
                  # Defensively skip rows without created_at — better
                  # to leave one orphan than nuke a brand-new row
                  # whose timestamp didn't render.
                  continue
              # Python 3.11+ handles RFC3339 with Z directly via
              # fromisoformat; older runners need the trailing Z swap.
              created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
              if created_dt < cutoff:
                  print(slug)
          PY

          count=$(wc -l < stale_slugs.txt | tr -d ' ')
          echo "Found $count stale e2e org(s) older than ${MAX_AGE_MINUTES}m"
          if [ "$count" -gt 0 ]; then
            echo "First 20:"
            head -20 stale_slugs.txt | sed 's/^/  /'
          fi
          echo "count=$count" >> "$GITHUB_OUTPUT"

      - name: Safety gate
        if: steps.identify.outputs.count != '0'
        run: |
          count="${{ steps.identify.outputs.count }}"
          if [ "$count" -gt "$SAFETY_CAP" ]; then
            echo "::error::Refusing to delete $count orgs in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means the CP admin API returned no created_at or returned a degraded result. Re-run with workflow_dispatch + max_age_minutes if intentional."
            exit 1
          fi
          echo "Within safety cap ($count ≤ $SAFETY_CAP) ✓"

      - name: Delete stale orgs
        if: steps.identify.outputs.count != '0' && env.DRY_RUN != 'true'
        run: |
          set -uo pipefail
          deleted=0
          failed=0
          while IFS= read -r slug; do
            [ -z "$slug" ] && continue
            # The DELETE handler requires {"confirm": "<slug>"} matching
            # the URL slug — fat-finger guard. Idempotent: re-issuing
            # picks up via org_purges.last_step.
            # Tempfile-routed -w + set +e/-e prevents curl-exit-code
            # pollution of the captured status (lint-curl-status-capture.yml).
            set +e
            curl -sS -o /tmp/del_resp -w "%{http_code}" \
              --max-time 60 \
              -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
              -H "Authorization: Bearer $ADMIN_TOKEN" \
              -H "Content-Type: application/json" \
              -d "{\"confirm\":\"$slug\"}" >/tmp/del_code
            set -e
            # Stderr from curl (-sS shows dial errors etc.) goes to runner log.
            http_code=$(cat /tmp/del_code 2>/dev/null || echo "000")
            if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
              deleted=$((deleted+1))
              echo "  deleted: $slug"
            else
              failed=$((failed+1))
              echo "  FAILED ($http_code): $slug — $(cat /tmp/del_resp 2>/dev/null | head -c 200)"
            fi
          done < stale_slugs.txt
          echo ""
          echo "Sweep summary: deleted=$deleted failed=$failed"
          # Don't fail the workflow on per-org delete errors — the
          # sweeper is best-effort. Next hourly tick re-attempts. We
          # only fail loud at the safety-cap gate above.

      - name: Sweep orphan tunnels
        # Stale-org cleanup deletes the org (which cascades to tunnel
        # delete inside the CP). But when that cascade fails partway —
        # CP transient 5xx after the org row is deleted but before the
        # CF tunnel delete completes — the tunnel persists with no
        # matching org row. The reconciler in internal/sweep flags this
        # as `cf_tunnel kind=orphan`, but nothing automatically reaps it.
        #
        # `/cp/admin/orphan-tunnels/cleanup` is the operator-triggered
        # reaper. Calling it here at the end of every sweep tick
        # converges the staging CF account to clean even when CP
        # cascades half-fail.
        #
        # PR #492 made the underlying DeleteTunnel actually check
        # status — pre-fix it silent-succeeded on CF code 1022
        # ("active connections"), so this step would have been a no-op
        # against stuck connectors. Post-fix the cleanup invokes
        # CleanupTunnelConnections + retry, which actually clears the
        # 1022 case. (#2987)
        #
        # Best-effort. Failure here doesn't fail the workflow — next
        # tick re-attempts. Errors flow to step output for ops review.
        if: env.DRY_RUN != 'true'
        run: |
          set +e
          curl -sS -o /tmp/cleanup_resp -w "%{http_code}" \
            --max-time 60 \
            -X POST "$MOLECULE_CP_URL/cp/admin/orphan-tunnels/cleanup" \
            -H "Authorization: Bearer $ADMIN_TOKEN" >/tmp/cleanup_code
          set -e
          http_code=$(cat /tmp/cleanup_code 2>/dev/null || echo "000")
          body=$(cat /tmp/cleanup_resp 2>/dev/null | head -c 500)
          if [ "$http_code" = "200" ]; then
            count=$(echo "$body" | python3 -c "import sys,json; d=json.loads(sys.stdin.read() or '{}'); print(d.get('deleted_count', 0))" 2>/dev/null || echo "0")
            failed_n=$(echo "$body" | python3 -c "import sys,json; d=json.loads(sys.stdin.read() or '{}'); print(len(d.get('failed') or {}))" 2>/dev/null || echo "0")
            echo "Orphan-tunnel sweep: deleted=$count failed=$failed_n"
          else
            echo "::warning::orphan-tunnels cleanup returned HTTP $http_code — body: $body"
          fi

      - name: Dry-run summary
        if: env.DRY_RUN == 'true'
        run: |
          echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s) AND triggered orphan-tunnels cleanup. Re-run with dry_run=false to actually delete."