fix(e2e): teardown leaked ws- workspace containers + standing orphan-sweeper (#2883) #2885

Merged
devops-engineer merged 1 commits from fix/e2e-ws-teardown into main 2026-06-15 07:03:07 +00:00
2 changed files with 262 additions and 0 deletions
+88
View File
@@ -102,6 +102,22 @@ jobs:
cache: false
cache-dependency-path: workspace-server/go.sum
- name: Snapshot pre-existing ws-* containers (run-scoped teardown baseline)
  run: |
    # #2883: this job provisions ws-<uuid> workspace containers via the
    # platform binary's docker.sock. The e2e script's own EXIT trap deletes
    # the workspace it created, but a CANCELLED / TIMED-OUT job (act_runner
    # SIGKILLs the job container, so the bash trap never fires) or a
    # platform crash mid-provision leaks the ws-<uuid> container — it then
    # runs forever and pegs CPU on the shared docker-host runner (13 orphans
    # found on ded-1, 11+3 on the prod robots). Record the ws-* containers
    # that already exist BEFORE this job touches anything, so the always()
    # teardown below can tell pre-existing containers apart from the ones
    # that appeared during this run.
    #
    # The baseline path is keyed on run id + attempt so re-runs and other
    # runs on the same host never share a file.
    BASELINE="/tmp/ws_baseline_${{ github.run_id }}_${{ github.run_attempt }}.txt"
    if docker ps -aq --filter "name=^ws-" > "$BASELINE" 2>/dev/null; then
      echo "ws-* baseline ($(wc -l < "$BASELINE" | tr -d ' ') pre-existing):"
      cat "$BASELINE"
    else
      # Best-effort step: do not fail the job here. But do NOT leave the
      # (empty/partial) file behind either — an empty baseline would be
      # indistinguishable from "docker reported zero pre-existing ws-*
      # containers", which is a much stronger statement than "we could
      # not ask docker".
      rm -f "$BASELINE" 2>/dev/null || true
      echo "::warning::docker ps failed — no ws-* baseline recorded for this run."
    fi
- name: Build e2e-names SSOT CLI
working-directory: workspace-server
run: go build -o /usr/local/bin/e2e-names ./cmd/e2e-names
@@ -301,6 +317,34 @@ jobs:
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
- name: Teardown leaked ws-* workspace containers (run-scoped, #2883)
  if: always()
  run: |
    # #2883: remove the ws-<uuid> workspace containers that appeared during
    # this run (everything matching name ^ws- that was NOT in the pre-job
    # baseline). The e2e script's trap usually handles its own workspace,
    # but a cancelled/timed-out job kills the runner container before the
    # trap fires, and a platform crash can orphan the container — both leak
    # a forever-running ws-* that pegs CPU on the shared docker-host.
    # The standing sweep-stale-ws-orphans.yml janitor is the age-guarded
    # belt-and-braces second layer for anything this step misses
    # (e.g. the runner container itself being SIGKILLed before this runs).
    #
    # NOTE(review): the baseline diff only protects containers that existed
    # BEFORE this run's snapshot. A ws-* container that a concurrent run on
    # the same host creates AFTER the snapshot is not in the baseline and
    # will be removed here. Fully run-scoped cleanup needs a per-run marker
    # on the container (e.g. a label) — confirm whether the provisioner can
    # set one.
    BASELINE="/tmp/ws_baseline_${{ github.run_id }}_${{ github.run_attempt }}.txt"

    # No baseline file means there is nothing trustworthy to diff against
    # (the snapshot step never ran, or did not produce a file). Treating
    # that as an EMPTY baseline would classify every ws-* container on the
    # shared host as "leaked by this run" and remove them all, so skip.
    if [ ! -f "$BASELINE" ]; then
      echo "::warning::no ws-* baseline at $BASELINE — skipping run-scoped teardown instead of removing every ws-* container on this host."
      exit 0
    fi

    # Container IDs contain no whitespace, so word-splitting the list is safe.
    current=$(docker ps -aq --filter "name=^ws-" 2>/dev/null || true)
    removed=0
    for cid in $current; do
      # Skip containers that already existed before this job started.
      # -F: compare the ID as a literal string, -x: whole-line match.
      if grep -qxF -- "$cid" "$BASELINE" 2>/dev/null; then continue; fi
      # docker reports names with a leading "/"; fall back to the ID when
      # the container vanished between `ps` and `inspect`.
      name=$(docker inspect -f '{{.Name}}' "$cid" 2>/dev/null || true)
      name=${name#/}
      [ -n "$name" ] || name=$cid
      echo "removing leaked workspace container: $name ($cid)"
      # Count only removals that actually succeeded so the summary is honest.
      if docker rm -f "$cid" >/dev/null 2>&1; then
        removed=$((removed + 1))
      else
        echo "  WARN: could not remove $cid (already gone?)"
      fi
    done
    rm -f "$BASELINE" 2>/dev/null || true
    echo "ws-* teardown removed $removed run-scoped orphan container(s)."
# ===========================================================================
# ADVISORY — real claude-code image, lifecycle-only. Non-blocking. It pulls/
# builds the 2.5GB template image, makes a real (cheap) MiniMax LLM call, and is
@@ -354,6 +398,22 @@ jobs:
cache: false
cache-dependency-path: workspace-server/go.sum
- name: Snapshot pre-existing ws-* containers (run-scoped teardown baseline)
  run: |
    # #2883: this job provisions ws-<uuid> workspace containers via the
    # platform binary's docker.sock. The e2e script's own EXIT trap deletes
    # the workspace it created, but a CANCELLED / TIMED-OUT job (act_runner
    # SIGKILLs the job container, so the bash trap never fires) or a
    # platform crash mid-provision leaks the ws-<uuid> container — it then
    # runs forever and pegs CPU on the shared docker-host runner (13 orphans
    # found on ded-1, 11+3 on the prod robots). Record the ws-* containers
    # that already exist BEFORE this job touches anything, so the always()
    # teardown below can tell pre-existing containers apart from the ones
    # that appeared during this run.
    #
    # The baseline path is keyed on run id + attempt so re-runs and other
    # runs on the same host never share a file.
    BASELINE="/tmp/ws_baseline_${{ github.run_id }}_${{ github.run_attempt }}.txt"
    if docker ps -aq --filter "name=^ws-" > "$BASELINE" 2>/dev/null; then
      echo "ws-* baseline ($(wc -l < "$BASELINE" | tr -d ' ') pre-existing):"
      cat "$BASELINE"
    else
      # Best-effort step: do not fail the job here. But do NOT leave the
      # (empty/partial) file behind either — an empty baseline would be
      # indistinguishable from "docker reported zero pre-existing ws-*
      # containers", which is a much stronger statement than "we could
      # not ask docker".
      rm -f "$BASELINE" 2>/dev/null || true
      echo "::warning::docker ps failed — no ws-* baseline recorded for this run."
    fi
- name: Build e2e-names SSOT CLI
working-directory: workspace-server
run: go build -o /usr/local/bin/e2e-names ./cmd/e2e-names
@@ -540,3 +600,31 @@ jobs:
run: |
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
- name: Teardown leaked ws-* workspace containers (run-scoped, #2883)
  if: always()
  run: |
    # #2883: remove the ws-<uuid> workspace containers that appeared during
    # this run (everything matching name ^ws- that was NOT in the pre-job
    # baseline). The e2e script's trap usually handles its own workspace,
    # but a cancelled/timed-out job kills the runner container before the
    # trap fires, and a platform crash can orphan the container — both leak
    # a forever-running ws-* that pegs CPU on the shared docker-host.
    # The standing sweep-stale-ws-orphans.yml janitor is the age-guarded
    # belt-and-braces second layer for anything this step misses
    # (e.g. the runner container itself being SIGKILLed before this runs).
    #
    # NOTE(review): the baseline diff only protects containers that existed
    # BEFORE this run's snapshot. A ws-* container that a concurrent run on
    # the same host creates AFTER the snapshot is not in the baseline and
    # will be removed here. Fully run-scoped cleanup needs a per-run marker
    # on the container (e.g. a label) — confirm whether the provisioner can
    # set one.
    BASELINE="/tmp/ws_baseline_${{ github.run_id }}_${{ github.run_attempt }}.txt"

    # No baseline file means there is nothing trustworthy to diff against
    # (the snapshot step never ran, or did not produce a file). Treating
    # that as an EMPTY baseline would classify every ws-* container on the
    # shared host as "leaked by this run" and remove them all, so skip.
    if [ ! -f "$BASELINE" ]; then
      echo "::warning::no ws-* baseline at $BASELINE — skipping run-scoped teardown instead of removing every ws-* container on this host."
      exit 0
    fi

    # Container IDs contain no whitespace, so word-splitting the list is safe.
    current=$(docker ps -aq --filter "name=^ws-" 2>/dev/null || true)
    removed=0
    for cid in $current; do
      # Skip containers that already existed before this job started.
      # -F: compare the ID as a literal string, -x: whole-line match.
      if grep -qxF -- "$cid" "$BASELINE" 2>/dev/null; then continue; fi
      # docker reports names with a leading "/"; fall back to the ID when
      # the container vanished between `ps` and `inspect`.
      name=$(docker inspect -f '{{.Name}}' "$cid" 2>/dev/null || true)
      name=${name#/}
      [ -n "$name" ] || name=$cid
      echo "removing leaked workspace container: $name ($cid)"
      # Count only removals that actually succeeded so the summary is honest.
      if docker rm -f "$cid" >/dev/null 2>&1; then
        removed=$((removed + 1))
      else
        echo "  WARN: could not remove $cid (already gone?)"
      fi
    done
    rm -f "$BASELINE" 2>/dev/null || true
    echo "ws-* teardown removed $removed run-scoped orphan container(s)."
+174
View File
@@ -0,0 +1,174 @@
name: Sweep stale ws-* workspace orphans (runner host)
# Standing janitor for ws-<uuid> workspace containers (and any other containers
# created from molecule-local/* images, whether running or stopped) left behind
# on the docker-host CI runners by the local-provision E2E lane
# (local-provision-e2e.yml). This is
# the belt-and-braces SECOND layer behind that workflow's own run-scoped
# always() teardown — see #2883.
#
# Why a standing sweeper in addition to the per-run teardown
# ---------------------------------------------------------
# Per-run teardown is best-effort by definition (mirrors the rationale in
# sweep-stale-e2e-orgs.yml). The local-provision lane provisions ws-<uuid>
# SIBLING containers from a HOST platform binary via docker.sock. Two failure
# modes leak a container the in-workflow teardown cannot reliably catch:
# * The job is CANCELLED / TIMED-OUT — act_runner SIGKILLs the job container,
# so neither the e2e script's bash EXIT trap NOR the workflow's own
# always() teardown step is guaranteed to run to completion.
# * The platform-server crashes mid-provision and orphans a half-started
# ws-<uuid> container whose owning workspace row never reached a state the
# script's delete path covers.
# An accumulation of these forever-running containers pegs CPU and exhausts the
# shared docker-host runner — the likely root cause of the advisory-lane
# intermittent reds (#2693 / #2680 / #2739). 13 such orphans were found on the
# retired ded-1 box (2-3 days old) and 11+3 on the production robots.
#
# Why a separate workflow vs the CP/AWS sweepers
# * sweep-stale-e2e-orgs / sweep-cf-* / sweep-aws-secrets operate at the
# control-plane / cloud-API layer (DELETE /cp/admin/..., CF zone, ASM).
# This leak is purely LOCAL docker state on the runner host — there is no
# CP org row or cloud resource to drive a cascade from. So this sweeper
# enumerates the runner's local docker daemon directly.
#
# Age-filter so in-flight runs are NEVER touched: only containers OLDER than
# WS_MAX_AGE_HOURS are removed. The longest local-provision run is ~30 min
# (lifecycle-real timeout-minutes: 30), so a 2-hour floor is well clear of any
# legitimately-running workspace.
#
# SUBSTRATE: runs on `docker-host` (the operator-host molecule-runner-* lane)
# because that is where the ws-<uuid> containers are provisioned and where the
# molecule-core-net bridge + docker.sock live (same constraint as
# local-provision-e2e.yml + handlers-postgres-integration.yml). A bare
# ubuntu-latest run on a Windows act_runner would inspect the WRONG daemon.
# Triggers: an hourly schedule plus a manual dispatch (no inputs declared).
on:
schedule:
# Hourly, at minute 17. The 2-hour age floor (WS_MAX_AGE_HOURS) is the actual
# safety margin against catching an in-flight run; hourly cadence keeps the
# worst-case orphan lifetime to ~3h (2h floor + up to 1h until the next tick)
# instead of multi-day.
- cron: '17 * * * *'
workflow_dispatch:
# Don't let two sweeps fight over the same daemon. Cron + a manual dispatch
# could overlap; queue rather than parallel-delete (cancel-in-progress: false
# lets the running sweep finish instead of being cancelled by the newer one).
concurrency:
group: sweep-stale-ws-orphans
cancel-in-progress: false
# Read-only repository token: the steps in this workflow only talk to the
# runner's local docker daemon.
permissions:
contents: read
# NOTE(review): no step in this workflow reads GITHUB_SERVER_URL directly —
# presumably it points Actions tooling at the self-hosted forge; confirm it is
# still required here.
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
sweep:
name: Sweep ws-* orphans
# Pin to docker-host: the orphans live on the operator-host runner daemon
# (where local-provision-e2e.yml provisions them). An ubuntu-latest Windows
# act_runner would target the wrong docker daemon. See SUBSTRATE above.
runs-on: docker-host
# Critical janitor — fail loud, don't mask. A silently-failing sweeper is
# how the orphans accumulated in the first place (mirrors the
# continue-on-error removal rationale in sweep-stale-e2e-orgs.yml).
continue-on-error: false
timeout-minutes: 10
env:
# Containers younger than this are assumed to belong to an in-flight run
# and are left alone. The longest local-provision job is ~30 min, so 2h
# is a wide safety margin.
# NOTE(review): this is a hard-coded literal — it is NOT wired to a
# repo/org variable, so changing it currently means editing this file.
# The sweep script multiplies it by 3600, so it must be a whole number of
# hours.
WS_MAX_AGE_HOURS: '2'
# Refuse to remove more than this many containers in one tick. If the
# daemon enumeration goes weird (e.g. clock skew makes everything look
# old), bail rather than nuke a surprising number of containers.
SAFETY_CAP: '100'
# Dry-run escape hatch for manual investigation (set via env edit + a
# workflow_dispatch run). Default false = actually remove. Only the exact
# string 'true' enables dry-run; any other value removes for real.
DRY_RUN: 'false'
steps:
- name: Sweep stale ws-* / molecule-local workspace orphans
  run: |
    # Removes ws-* / molecule-local workspace containers older than
    # WS_MAX_AGE_HOURS from this runner's local docker daemon.
    #   Inputs (env): WS_MAX_AGE_HOURS, SAFETY_CAP, DRY_RUN.
    #   Exit codes:   0 = clean / nothing stale / dry-run / all removed
    #                 1 = safety cap tripped, or a stale container survived
    #                 2 = docker unreachable or invalid configuration
    set -uo pipefail

    if ! docker info >/dev/null 2>&1; then
      echo "::error::docker daemon not reachable on this runner — cannot sweep ws-* orphans."
      exit 2
    fi

    # Validate the knobs before doing arithmetic with them. An empty or
    # non-numeric value breaks $(( ... )), and an age floor of 0 would make
    # EVERY container (including in-flight runs) look stale.
    case "${WS_MAX_AGE_HOURS:-}" in
      ''|*[!0-9]*)
        echo "::error::WS_MAX_AGE_HOURS must be a positive whole number (got '${WS_MAX_AGE_HOURS:-}'). Refusing to sweep."
        exit 2
        ;;
    esac
    case "${SAFETY_CAP:-}" in
      ''|*[!0-9]*)
        echo "::error::SAFETY_CAP must be a non-negative whole number (got '${SAFETY_CAP:-}'). Refusing to sweep."
        exit 2
        ;;
    esac

    NOW=$(date +%s)
    # 10# forces base 10 so a value such as "08" is not parsed as octal.
    MAX_AGE_SECONDS=$(( 10#$WS_MAX_AGE_HOURS * 3600 ))
    if [ "$MAX_AGE_SECONDS" -le 0 ]; then
      echo "::error::WS_MAX_AGE_HOURS must be greater than zero (got '${WS_MAX_AGE_HOURS}'). Refusing to sweep."
      exit 2
    fi
    echo "Sweeping ws-* / molecule-local workspace containers older than ${WS_MAX_AGE_HOURS}h (${MAX_AGE_SECONDS}s). DRY_RUN=${DRY_RUN}"

    # Enumerate candidate workspace containers two ways and de-dup:
    #   1. name ^ws-<uuid> (the provisioner's container name)
    #   2. ancestor image molecule-local/* (the local workspace-template
    #      + stub-runtime images this lane builds/tags) — catches a
    #      container that (somehow) isn't named ws-* but is clearly ours.
    # `docker ps -a` is used throughout, so stopped and created-only
    # containers are candidates too.
    candidates=$(
      { docker ps -aq --filter "name=^ws-" 2>/dev/null
        docker ps -aq --filter "ancestor=molecule-local/stub-runtime" 2>/dev/null
        for img in $(docker images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null | grep '^molecule-local/' || true); do
          docker ps -aq --filter "ancestor=$img" 2>/dev/null || true
        done
      } | sort -u
    )
    if [ -z "$candidates" ]; then
      echo "No ws-* / molecule-local workspace containers present. Clean."
      exit 0
    fi

    # Build the stale list (older than the age floor) first, so the
    # safety cap can gate the actual removal.
    stale=""
    stale_n=0
    for cid in $candidates; do
      # One inspect per container: start time, creation time and name.
      # Empty output means the container disappeared since enumeration.
      info=$(docker inspect -f '{{.State.StartedAt}}|{{.Created}}|{{.Name}}' "$cid" 2>/dev/null || true)
      [ -n "$info" ] || continue
      IFS='|' read -r started created name <<< "$info"
      ref="$started"
      # A never-started (created-only) container has a zero StartedAt;
      # fall back to Created so we still age it out.
      case "$ref" in ''|0001-01-01*) ref="$created" ;; esac
      [ -n "$ref" ] || continue
      # GNU date on the Linux docker-host runner parses RFC3339 directly.
      # An unparseable timestamp yields 0 and the container is skipped.
      ts=$(date -d "$ref" +%s 2>/dev/null || echo 0)
      [ "$ts" -ne 0 ] || continue
      age=$(( NOW - ts ))
      if [ "$age" -ge "$MAX_AGE_SECONDS" ]; then
        # docker reports names with a leading "/".
        name=${name#/}
        [ -n "$name" ] || name=$cid
        echo "  STALE (${age}s old): $name ($cid)"
        stale="$stale $cid"
        stale_n=$((stale_n + 1))
      fi
    done

    if [ "$stale_n" -eq 0 ]; then
      echo "Found $(echo "$candidates" | wc -w | tr -d ' ') workspace container(s) but none older than ${WS_MAX_AGE_HOURS}h. Nothing to remove."
      exit 0
    fi
    if [ "$stale_n" -gt "$SAFETY_CAP" ]; then
      echo "::error::Refusing to remove $stale_n containers in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means clock skew or a daemon-enumeration anomaly made everything look stale."
      exit 1
    fi
    if [ "$DRY_RUN" = "true" ]; then
      echo "DRY RUN — would remove $stale_n stale workspace container(s). Set DRY_RUN=false to actually remove."
      exit 0
    fi

    removed=0
    stuck=0
    for cid in $stale; do
      if docker rm -f "$cid" >/dev/null 2>&1; then
        removed=$((removed + 1))
      elif docker inspect "$cid" >/dev/null 2>&1; then
        # rm failed AND the container is still there: a real failure,
        # not the benign "someone else removed it first" race.
        stuck=$((stuck + 1))
        echo "::error::could not remove stale container $cid — it still exists after docker rm -f."
      else
        echo "  WARN: could not remove $cid (already gone?)"
      fi
    done
    echo "Sweep summary: removed=$removed of $stale_n stale workspace container(s)."
    # Fail loud: a green sweep that leaves stale containers behind is how
    # orphans accumulate unnoticed.
    if [ "$stuck" -gt 0 ]; then
      echo "::error::$stuck stale workspace container(s) could not be removed."
      exit 1
    fi
- name: Notify on sweep failure
  # Runs only when an earlier step in this job failed. Its sole purpose is
  # to surface a prominent ::error:: line (mirrors sweep-stale-e2e-orgs.yml):
  # a sweeper that goes red without anyone noticing is exactly how the
  # orphans accumulated, so tag the failure for the runs UI and any
  # log-tail consumer, then exit non-zero.
  if: failure()
  run: |
    printf '%s\n' "::error::sweep-stale-ws-orphans FAILED — leaked ws-* workspace containers may be accumulating on a docker-host runner and pegging CPU (#2883). Check this runner's docker daemon manually (docker ps --filter name=^ws-) and prior step logs. Common causes: (a) docker daemon unreachable on the runner, (b) safety-cap tripped (clock skew / enumeration anomaly)."
    exit 1