2026-06-15 07:03:07 +00:00
2 changed files with 262 additions and 0 deletions
@@ -102,6 +102,22 @@ jobs:
          cache: false
          cache-dependency-path: workspace-server/go.sum

+      - name: Snapshot pre-existing ws-* containers (run-scoped teardown baseline)
+        run: |
+          # #2883: the platform binary provisions ws-<uuid> workspace containers
+          # through docker.sock. The e2e script's EXIT trap normally deletes the
+          # workspace it created, but the trap never fires when act_runner SIGKILLs
+          # a cancelled / timed-out job container, and a platform crash mid-provision
+          # can orphan the container too. Leaked containers run forever and peg CPU
+          # on the shared docker-host runner (13 orphans found on ded-1, 11+3 on the
+          # prod robots). List the ws-* container IDs that exist BEFORE this job
+          # provisions anything; the always() teardown step diffs against this file
+          # and leaves every container recorded here alone.
+          BASELINE="/tmp/ws_baseline_${{ github.run_id }}_${{ github.run_attempt }}.txt"
+          docker ps -aq --filter "name=^ws-" > "$BASELINE" 2>/dev/null || true
+          echo "ws-* baseline ($(wc -l < "$BASELINE" 2>/dev/null | tr -d ' ') pre-existing):"
+          cat "$BASELINE" 2>/dev/null || true
+
      - name: Build e2e-names SSOT CLI
        working-directory: workspace-server
        run: go build -o /usr/local/bin/e2e-names ./cmd/e2e-names
@@ -301,6 +317,34 @@ jobs:
          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
          docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true

+      - name: Teardown leaked ws-* workspace containers (run-scoped, #2883)
+        if: always()
+        run: |
+          # #2883: remove the ws-<uuid> workspace containers that appeared during
+          # this job (every name ^ws- container NOT in the pre-job baseline), for
+          # the paths where the e2e script's own trap never ran. NOTE: a concurrent
+          # job's ws-* started AFTER our snapshot is not in the baseline either, so
+          # it is removed too; closing that gap needs run-scoped container labels.
+          # sweep-stale-ws-orphans.yml is the age-guarded second layer behind this.
+          BASELINE="/tmp/ws_baseline_${{ github.run_id }}_${{ github.run_attempt }}.txt"
+          if [ ! -f "$BASELINE" ]; then
+            # The snapshot step never ran, so treating the baseline as empty would
+            # delete EVERY ws-* container on the shared host. Leave them alone.
+            echo "ws-* teardown skipped: no baseline snapshot found for this run."
+            exit 0
+          fi
+          removed=0
+          for cid in $(docker ps -aq --filter "name=^ws-" 2>/dev/null); do
+            # Skip containers that already existed before this job started.
+            if grep -qx "$cid" "$BASELINE" 2>/dev/null; then continue; fi
+            # The pipeline's status is sed's, so default the name via ${name:-$cid}.
+            name=$(docker inspect -f '{{.Name}}' "$cid" 2>/dev/null | sed 's#^/##' || true)
+            echo "removing leaked workspace container: ${name:-$cid} ($cid)"
+            if docker rm -f "$cid" >/dev/null 2>&1; then removed=$((removed + 1)); fi
+          done
+          rm -f "$BASELINE" 2>/dev/null || true
+          echo "ws-* teardown removed $removed run-scoped orphan container(s)."
+
  # ===========================================================================
  # ADVISORY — real claude-code image, lifecycle-only. Non-blocking. It pulls/
  # builds the 2.5GB template image, makes a real (cheap) MiniMax LLM call, and is
@@ -354,6 +398,22 @@ jobs:
          cache: false
          cache-dependency-path: workspace-server/go.sum

+      - name: Snapshot pre-existing ws-* containers (run-scoped teardown baseline)
+        run: |
+          # #2883: the platform binary provisions ws-<uuid> workspace containers
+          # through docker.sock. The e2e script's EXIT trap normally deletes the
+          # workspace it created, but the trap never fires when act_runner SIGKILLs
+          # a cancelled / timed-out job container, and a platform crash mid-provision
+          # can orphan the container too. Leaked containers run forever and peg CPU
+          # on the shared docker-host runner (13 orphans found on ded-1, 11+3 on the
+          # prod robots). List the ws-* container IDs that exist BEFORE this job
+          # provisions anything; the always() teardown step diffs against this file
+          # and leaves every container recorded here alone.
+          BASELINE="/tmp/ws_baseline_${{ github.run_id }}_${{ github.run_attempt }}.txt"
+          docker ps -aq --filter "name=^ws-" > "$BASELINE" 2>/dev/null || true
+          echo "ws-* baseline ($(wc -l < "$BASELINE" 2>/dev/null | tr -d ' ') pre-existing):"
+          cat "$BASELINE" 2>/dev/null || true
+
      - name: Build e2e-names SSOT CLI
        working-directory: workspace-server
        run: go build -o /usr/local/bin/e2e-names ./cmd/e2e-names
@@ -540,3 +600,31 @@ jobs:
        run: |
          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
          docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
+
+      - name: Teardown leaked ws-* workspace containers (run-scoped, #2883)
+        if: always()
+        run: |
+          # #2883: remove the ws-<uuid> workspace containers that appeared during
+          # this job (every name ^ws- container NOT in the pre-job baseline), for
+          # the paths where the e2e script's own trap never ran. NOTE: a concurrent
+          # job's ws-* started AFTER our snapshot is not in the baseline either, so
+          # it is removed too; closing that gap needs run-scoped container labels.
+          # sweep-stale-ws-orphans.yml is the age-guarded second layer behind this.
+          BASELINE="/tmp/ws_baseline_${{ github.run_id }}_${{ github.run_attempt }}.txt"
+          if [ ! -f "$BASELINE" ]; then
+            # The snapshot step never ran, so treating the baseline as empty would
+            # delete EVERY ws-* container on the shared host. Leave them alone.
+            echo "ws-* teardown skipped: no baseline snapshot found for this run."
+            exit 0
+          fi
+          removed=0
+          for cid in $(docker ps -aq --filter "name=^ws-" 2>/dev/null); do
+            # Skip containers that already existed before this job started.
+            if grep -qx "$cid" "$BASELINE" 2>/dev/null; then continue; fi
+            # The pipeline's status is sed's, so default the name via ${name:-$cid}.
+            name=$(docker inspect -f '{{.Name}}' "$cid" 2>/dev/null | sed 's#^/##' || true)
+            echo "removing leaked workspace container: ${name:-$cid} ($cid)"
+            if docker rm -f "$cid" >/dev/null 2>&1; then removed=$((removed + 1)); fi
+          done
+          rm -f "$BASELINE" 2>/dev/null || true
+          echo "ws-* teardown removed $removed run-scoped orphan container(s)."
@@ -0,0 +1,174 @@
+name: Sweep stale ws-* workspace orphans (runner host)
+
+# Standing janitor for ws-<uuid> workspace containers (plus any container
+# created from a molecule-local/* image) left behind on the docker-host
+# CI runners by the local-provision E2E lane (local-provision-e2e.yml). This is
+# the belt-and-braces SECOND layer behind that workflow's own run-scoped
+# always() teardown — see #2883.
+#
+# Why a standing sweeper in addition to the per-run teardown
+# ---------------------------------------------------------
+# Per-run teardown is best-effort by definition (mirrors the rationale in
+# sweep-stale-e2e-orgs.yml). The local-provision lane provisions ws-<uuid>
+# SIBLING containers from a HOST platform binary via docker.sock. Two failure
+# modes leak a container the in-workflow teardown cannot reliably catch:
+#   * The job is CANCELLED / TIMED-OUT — act_runner SIGKILLs the job container,
+#     so neither the e2e script's bash EXIT trap NOR the workflow's own
+#     always() teardown step is guaranteed to run to completion.
+#   * The platform-server crashes mid-provision and orphans a half-started
+#     ws-<uuid> container whose owning workspace row never reached a state the
+#     script's delete path covers.
+# An accumulation of these forever-running containers pegs CPU and exhausts the
+# shared docker-host runner — the likely root cause of the advisory-lane
+# intermittent reds (#2693 / #2680 / #2739). 13 such orphans were found on the
+# retired ded-1 box (2-3 days old) and 11+3 on the production robots.
+#
+# Why a separate workflow vs the CP/AWS sweepers
+#   * sweep-stale-e2e-orgs / sweep-cf-* / sweep-aws-secrets operate at the
+#     control-plane / cloud-API layer (DELETE /cp/admin/..., CF zone, ASM).
+#     This leak is purely LOCAL docker state on the runner host — there is no
+#     CP org row or cloud resource to drive a cascade from. So this sweeper
+#     enumerates the runner's local docker daemon directly.
+#
+# Age-filter so in-flight runs are NEVER touched: only containers OLDER than
+# WS_MAX_AGE_HOURS are removed. The longest local-provision run is ~30 min
+# (lifecycle-real timeout-minutes: 30), so a 2-hour floor is well clear of any
+# legitimately-running workspace.
+#
+# SUBSTRATE: runs on `docker-host` (the operator-host molecule-runner-* lane)
+# because that is where the ws-<uuid> containers are provisioned and where the
+# molecule-core-net bridge + docker.sock live (same constraint as
+# local-provision-e2e.yml + handlers-postgres-integration.yml). A bare
+# ubuntu-latest run on a Windows act_runner would inspect the WRONG daemon.
+
+on:
+  schedule:
+    # Runs at minute 17 of every hour. WS_MAX_AGE_HOURS (2h) is what protects
+    # in-flight runs; the hourly cadence only bounds how long an orphan can
+    # survive (~3h worst case rather than days).
+    - cron: "17 * * * *"
+  workflow_dispatch:
+
+# Serialize sweeps: a cron tick and a manual dispatch may overlap, and the
+# second one should queue instead of deleting in parallel on the same daemon.
+concurrency:
+  group: sweep-stale-ws-orphans
+  cancel-in-progress: false
+
+permissions:
+  contents: read
+
+env:
+  GITHUB_SERVER_URL: "https://git.moleculesai.app"
+
+jobs:
+  sweep:
+    name: Sweep ws-* orphans
+    # Pin to docker-host: the orphans live on the operator-host runner daemon
+    # (where local-provision-e2e.yml provisions them). A ubuntu-latest Windows
+    # act_runner would target the wrong docker daemon. See SUBSTRATE above.
+    runs-on: docker-host
+    # Critical janitor — fail loud, don't mask. A silently-failing sweeper is
+    # how the orphans accumulated in the first place (mirrors the
+    # continue-on-error removal rationale in sweep-stale-e2e-orgs.yml).
+    continue-on-error: false
+    timeout-minutes: 10
+    env:
+      # Containers younger than this are assumed to belong to an in-flight run
+      # and are left alone. The longest local-provision job is ~30 min, so 2h
+      # is a wide safety margin. Defaults to 2; set the WS_MAX_AGE_HOURS
+      # repo/org variable to override it if a future lane runs longer.
+      WS_MAX_AGE_HOURS: "${{ vars.WS_MAX_AGE_HOURS || '2' }}"
+      # Refuse to remove more than this many containers in one tick. If the
+      # daemon enumeration goes weird (e.g. clock skew makes everything look
+      # old), bail rather than nuke a surprising number of containers.
+      SAFETY_CAP: '100'
+      # Dry-run escape hatch for manual investigation (set via env edit + a
+      # workflow_dispatch run). Default false = actually remove.
+      DRY_RUN: 'false'
+
+    steps:
+      - name: Sweep stale ws-* / molecule-local workspace orphans
+        run: |
+          # Fail-loud sweep: exits non-zero on an unreachable daemon, a bad age
+          # floor, a tripped SAFETY_CAP, or a stale container that survives removal.
+          set -uo pipefail
+          if ! docker info >/dev/null 2>&1; then
+            echo "::error::docker daemon not reachable on this runner — cannot sweep ws-* orphans."
+            exit 2
+          fi
+          NOW=$(date +%s)
+          MAX_AGE_SECONDS=$(( WS_MAX_AGE_HOURS * 3600 ))
+          # A zero/negative floor would make every in-flight container look stale.
+          [ "${MAX_AGE_SECONDS:-0}" -gt 0 ] || { echo "::error::WS_MAX_AGE_HOURS must be a positive integer (got '${WS_MAX_AGE_HOURS}')."; exit 2; }
+          echo "Sweeping ws-* / molecule-local workspace containers older than ${WS_MAX_AGE_HOURS}h (${MAX_AGE_SECONDS}s). DRY_RUN=${DRY_RUN}"
+
+          # Enumerate candidates two ways and de-dup: name ^ws-<uuid> (the
+          # provisioner's container name) plus ancestor image molecule-local/*, to
+          # catch a container that (somehow) isn't named ws-* but is clearly ours.
+          candidates=$(
+            { docker ps -aq --filter "name=^ws-" 2>/dev/null
+              docker ps -aq --filter "ancestor=molecule-local/stub-runtime" 2>/dev/null
+              for img in $(docker images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null | grep '^molecule-local/' || true); do
+                docker ps -aq --filter "ancestor=$img" 2>/dev/null || true
+              done
+            } | sort -u
+          )
+
+          if [ -z "$candidates" ]; then
+            echo "No ws-* / molecule-local workspace containers present. Clean."
+            exit 0
+          fi
+
+          # Build the stale list first so the safety cap can gate the removal.
+          stale=""; stale_n=0
+          for cid in $candidates; do
+            # One inspect per container; the three fields never contain whitespace.
+            meta=$(docker inspect -f '{{.State.StartedAt}} {{.Created}} {{.Name}}' "$cid" 2>/dev/null || true)
+            read -r started created name <<< "$meta"
+            # A never-started container has a zero StartedAt; age it by Created.
+            ref="$started"
+            case "$ref" in ''|0001-01-01*) ref="$created" ;; esac
+            [ -z "$ref" ] && continue
+            # GNU date on the Linux docker-host runner parses RFC3339 directly.
+            ts=$(date -d "$ref" +%s 2>/dev/null || echo 0)
+            if [ "$ts" -eq 0 ]; then echo "  WARN: unparseable timestamp '$ref' for $cid — cannot age it, skipping."; continue; fi
+            age=$(( NOW - ts ))
+            if [ "$age" -ge "$MAX_AGE_SECONDS" ]; then
+              echo "  STALE (${age}s old): ${name#/} ($cid)"
+              stale="$stale $cid"
+              stale_n=$((stale_n + 1))
+            fi
+          done
+
+          if [ "$stale_n" -eq 0 ]; then
+            echo "Found $(echo "$candidates" | wc -w | tr -d ' ') workspace container(s) but none older than ${WS_MAX_AGE_HOURS}h. Nothing to remove."
+            exit 0
+          fi
+          if [ "$stale_n" -gt "$SAFETY_CAP" ]; then
+            echo "::error::Refusing to remove $stale_n containers in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means clock skew or a daemon-enumeration anomaly made everything look stale."
+            exit 1
+          fi
+          if [ "$DRY_RUN" = "true" ]; then
+            echo "DRY RUN — would remove $stale_n stale workspace container(s). Set DRY_RUN=false to actually remove."
+            exit 0
+          fi
+          # Fail loud if a stale container survives; a vanished one is only a race.
+          removed=0; failed=0
+          for cid in $stale; do
+            if docker rm -f "$cid" >/dev/null 2>&1; then removed=$((removed + 1))
+            elif docker inspect "$cid" >/dev/null 2>&1; then failed=$((failed + 1)); echo "::error::could not remove $cid (container still present)."
+            else echo "  WARN: could not remove $cid (already gone?)"
+            fi
+          done
+          echo "Sweep summary: removed=$removed of $stale_n stale workspace container(s)."
+          [ "$failed" -eq 0 ] || exit 1
+
+      - name: Notify on sweep failure
+        # Runs only when an earlier step failed (mirrors sweep-stale-e2e-orgs.yml):
+        # emits an ::error:: annotation so a red sweep is flagged, not ignored.
+        if: failure()
+        run: |
+          msg="::error::sweep-stale-ws-orphans FAILED — leaked ws-* workspace containers may be accumulating on a docker-host runner and pegging CPU (#2883). Check this runner's docker daemon manually (docker ps --filter name=^ws-) and prior step logs. Common causes: (a) docker daemon unreachable on the runner, (b) safety-cap tripped (clock skew / enumeration anomaly)."
+          echo "$msg"
+          exit 1