diff --git a/.gitea/workflows/local-provision-e2e.yml b/.gitea/workflows/local-provision-e2e.yml
index dc48a42d..fc05605a 100644
--- a/.gitea/workflows/local-provision-e2e.yml
+++ b/.gitea/workflows/local-provision-e2e.yml
@@ -102,6 +102,31 @@ jobs:
           cache: false
           cache-dependency-path: workspace-server/go.sum
 
+      - name: Snapshot pre-existing ws-* containers (run-scoped teardown baseline)
+        run: |
+          # #2883: this job provisions ws- workspace containers via the
+          # platform binary's docker.sock. The e2e script's own EXIT trap deletes
+          # the workspace it created, but a CANCELLED / TIMED-OUT job (act_runner
+          # SIGKILLs the job container, so the bash trap never fires) or a
+          # platform crash mid-provision leaks the ws- container — it then
+          # runs forever and pegs CPU on the shared docker-host runner (13 orphans
+          # found on ded-1, 11+3 on the prod robots). Record the ws-* containers
+          # that already exist BEFORE this job touches anything, so the always()
+          # teardown below never removes a container that pre-dates this job.
+          #
+          # Publish the baseline only when the listing succeeds. An empty file
+          # left behind by a failed "docker ps" would make the teardown treat
+          # every ws-* container on the shared host as created by this run.
+          BASELINE="/tmp/ws_baseline_${{ github.run_id }}_${{ github.run_attempt }}.txt"
+          if docker ps -aq --filter "name=^ws-" > "$BASELINE.tmp" 2>/dev/null; then
+            mv -f "$BASELINE.tmp" "$BASELINE"
+            echo "ws-* baseline ($(wc -l < "$BASELINE" | tr -d ' ') pre-existing):"
+            cat "$BASELINE"
+          else
+            rm -f "$BASELINE.tmp" "$BASELINE"
+            echo "::warning::could not list ws-* containers; no baseline recorded, so the teardown step will skip removal."
+          fi
+
       - name: Build e2e-names SSOT CLI
         working-directory: workspace-server
         run: go build -o /usr/local/bin/e2e-names ./cmd/e2e-names
@@ -301,6 +326,54 @@ jobs:
           docker rm -f "$PG_CONTAINER" 2>/dev/null || true
           docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
 
+      - name: Teardown leaked ws-* workspace containers (run-scoped, #2883)
+        if: always()
+        run: |
+          # #2883: remove the ws- workspace containers that appeared after this
+          # job's baseline snapshot (everything matching name ^ws- that was NOT in
+          # the pre-job baseline). The e2e script's trap usually handles its own
+          # workspace, but a cancelled/timed-out job kills the runner container
+          # before the trap fires, and a platform crash can orphan the
+          # container — both leak a forever-running ws-* that pegs CPU on the
+          # shared docker-host.
+          #
+          # Scope: containers that existed BEFORE this job started are never
+          # touched. A ws-* container that a concurrent job creates AFTER the
+          # snapshot is not in the baseline, cannot be told apart from this
+          # run's own, and is removed too.
+          # NOTE(review): confirm ws-* provisioning jobs are serialized per docker
+          # host, or label containers per run and filter on that label instead.
+          #
+          # The standing sweep-stale-ws-orphans.yml janitor is the age-guarded
+          # belt-and-braces second layer for anything this step misses
+          # (e.g. the runner container itself being SIGKILLed before this runs).
+          BASELINE="/tmp/ws_baseline_${{ github.run_id }}_${{ github.run_attempt }}.txt"
+          if [ ! -f "$BASELINE" ]; then
+            # No baseline: the snapshot step never ran (an earlier step failed)
+            # or could not list containers. Treating that as "nothing pre-existed"
+            # would delete every ws-* container on the shared host, so leave
+            # cleanup to the age-guarded janitor instead.
+            echo "::warning::no ws-* baseline at $BASELINE; skipping run-scoped teardown."
+            exit 0
+          fi
+          removed=0
+          for cid in $(docker ps -aq --filter "name=^ws-" 2>/dev/null); do
+            # Skip containers that already existed before this job started.
+            if grep -qxF "$cid" "$BASELINE" 2>/dev/null; then continue; fi
+            # Fall back to the ID when the name lookup fails (container gone).
+            name=$(docker inspect -f '{{.Name}}' "$cid" 2>/dev/null) || name="$cid"
+            name="${name#/}"
+            echo "removing leaked workspace container: $name ($cid)"
+            # Count only the removals that actually succeeded.
+            if docker rm -f "$cid" >/dev/null 2>&1; then
+              removed=$((removed + 1))
+            else
+              echo " WARN: could not remove $cid (already gone?)"
+            fi
+          done
+          rm -f "$BASELINE" 2>/dev/null || true
+          echo "ws-* teardown removed $removed run-scoped orphan container(s)."
+
   # ===========================================================================
   # ADVISORY — real claude-code image, lifecycle-only. Non-blocking. It pulls/
   # builds the 2.5GB template image, makes a real (cheap) MiniMax LLM call, and is
@@ -354,6 +427,31 @@ jobs:
           cache: false
           cache-dependency-path: workspace-server/go.sum
 
+      - name: Snapshot pre-existing ws-* containers (run-scoped teardown baseline)
+        run: |
+          # #2883: this job provisions ws- workspace containers via the
+          # platform binary's docker.sock. The e2e script's own EXIT trap deletes
+          # the workspace it created, but a CANCELLED / TIMED-OUT job (act_runner
+          # SIGKILLs the job container, so the bash trap never fires) or a
+          # platform crash mid-provision leaks the ws- container — it then
+          # runs forever and pegs CPU on the shared docker-host runner (13 orphans
+          # found on ded-1, 11+3 on the prod robots). Record the ws-* containers
+          # that already exist BEFORE this job touches anything, so the always()
+          # teardown below never removes a container that pre-dates this job.
+          #
+          # Publish the baseline only when the listing succeeds. An empty file
+          # left behind by a failed "docker ps" would make the teardown treat
+          # every ws-* container on the shared host as created by this run.
+          BASELINE="/tmp/ws_baseline_${{ github.run_id }}_${{ github.run_attempt }}.txt"
+          if docker ps -aq --filter "name=^ws-" > "$BASELINE.tmp" 2>/dev/null; then
+            mv -f "$BASELINE.tmp" "$BASELINE"
+            echo "ws-* baseline ($(wc -l < "$BASELINE" | tr -d ' ') pre-existing):"
+            cat "$BASELINE"
+          else
+            rm -f "$BASELINE.tmp" "$BASELINE"
+            echo "::warning::could not list ws-* containers; no baseline recorded, so the teardown step will skip removal."
+          fi
+
       - name: Build e2e-names SSOT CLI
         working-directory: workspace-server
         run: go build -o /usr/local/bin/e2e-names ./cmd/e2e-names
@@ -540,3 +638,51 @@ jobs:
         run: |
           docker rm -f "$PG_CONTAINER" 2>/dev/null || true
           docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
+
+      - name: Teardown leaked ws-* workspace containers (run-scoped, #2883)
+        if: always()
+        run: |
+          # #2883: remove the ws- workspace containers that appeared after this
+          # job's baseline snapshot (everything matching name ^ws- that was NOT in
+          # the pre-job baseline). The e2e script's trap usually handles its own
+          # workspace, but a cancelled/timed-out job kills the runner container
+          # before the trap fires, and a platform crash can orphan the
+          # container — both leak a forever-running ws-* that pegs CPU on the
+          # shared docker-host.
+          #
+          # Scope: containers that existed BEFORE this job started are never
+          # touched. A ws-* container that a concurrent job creates AFTER the
+          # snapshot is not in the baseline, cannot be told apart from this
+          # run's own, and is removed too.
+          # NOTE(review): confirm ws-* provisioning jobs are serialized per docker
+          # host, or label containers per run and filter on that label instead.
+          #
+          # The standing sweep-stale-ws-orphans.yml janitor is the age-guarded
+          # belt-and-braces second layer for anything this step misses
+          # (e.g. the runner container itself being SIGKILLed before this runs).
+          BASELINE="/tmp/ws_baseline_${{ github.run_id }}_${{ github.run_attempt }}.txt"
+          if [ ! -f "$BASELINE" ]; then
+            # No baseline: the snapshot step never ran (an earlier step failed)
+            # or could not list containers. Treating that as "nothing pre-existed"
+            # would delete every ws-* container on the shared host, so leave
+            # cleanup to the age-guarded janitor instead.
+            echo "::warning::no ws-* baseline at $BASELINE; skipping run-scoped teardown."
+            exit 0
+          fi
+          removed=0
+          for cid in $(docker ps -aq --filter "name=^ws-" 2>/dev/null); do
+            # Skip containers that already existed before this job started.
+            if grep -qxF "$cid" "$BASELINE" 2>/dev/null; then continue; fi
+            # Fall back to the ID when the name lookup fails (container gone).
+            name=$(docker inspect -f '{{.Name}}' "$cid" 2>/dev/null) || name="$cid"
+            name="${name#/}"
+            echo "removing leaked workspace container: $name ($cid)"
+            # Count only the removals that actually succeeded.
+            if docker rm -f "$cid" >/dev/null 2>&1; then
+              removed=$((removed + 1))
+            else
+              echo " WARN: could not remove $cid (already gone?)"
+            fi
+          done
+          rm -f "$BASELINE" 2>/dev/null || true
+          echo "ws-* teardown removed $removed run-scoped orphan container(s)."
diff --git a/.gitea/workflows/sweep-stale-ws-orphans.yml b/.gitea/workflows/sweep-stale-ws-orphans.yml
new file mode 100644
index 00000000..3e7910a3
--- /dev/null
+++ b/.gitea/workflows/sweep-stale-ws-orphans.yml
@@ -0,0 +1,184 @@
+name: Sweep stale ws-* workspace orphans (runner host)
+
+# Standing janitor for ws- workspace containers (and their
+# molecule-local/* images' running containers) left behind on the docker-host
+# CI runners by the local-provision E2E lane (local-provision-e2e.yml). This is
+# the belt-and-braces SECOND layer behind that workflow's own run-scoped
+# always() teardown — see #2883.
+#
+# Why a standing sweeper in addition to the per-run teardown
+# ---------------------------------------------------------
+# Per-run teardown is best-effort by definition (mirrors the rationale in
+# sweep-stale-e2e-orgs.yml). The local-provision lane provisions ws-
+# SIBLING containers from a HOST platform binary via docker.sock. Two failure
+# modes leak a container the in-workflow teardown cannot reliably catch:
+#   * The job is CANCELLED / TIMED-OUT — act_runner SIGKILLs the job container,
+#     so neither the e2e script's bash EXIT trap NOR the workflow's own
+#     always() teardown step is guaranteed to run to completion.
+#   * The platform-server crashes mid-provision and orphans a half-started
+#     ws- container whose owning workspace row never reached a state the
+#     script's delete path covers.
+# An accumulation of these forever-running containers pegs CPU and exhausts the
+# shared docker-host runner — the likely root cause of the advisory-lane
+# intermittent reds (#2693 / #2680 / #2739). 13 such orphans were found on the
+# retired ded-1 box (2-3 days old) and 11+3 on the production robots.
+#
+# Why a separate workflow vs the CP/AWS sweepers
+#   * sweep-stale-e2e-orgs / sweep-cf-* / sweep-aws-secrets operate at the
+#     control-plane / cloud-API layer (DELETE /cp/admin/..., CF zone, ASM).
+#     This leak is purely LOCAL docker state on the runner host — there is no
+#     CP org row or cloud resource to drive a cascade from. So this sweeper
+#     enumerates the runner's local docker daemon directly.
+#
+# Age-filter so in-flight runs are NEVER touched: only containers OLDER than
+# WS_MAX_AGE_HOURS are removed. The longest local-provision run is ~30 min
+# (lifecycle-real timeout-minutes: 30), so a 2-hour floor is well clear of any
+# legitimately-running workspace.
+#
+# SUBSTRATE: runs on `docker-host` (the operator-host molecule-runner-* lane)
+# because that is where the ws- containers are provisioned and where the
+# molecule-core-net bridge + docker.sock live (same constraint as
+# local-provision-e2e.yml + handlers-postgres-integration.yml). A bare
+# ubuntu-latest run on a Windows act_runner would inspect the WRONG daemon.
+
+on:
+  schedule:
+    # Hourly. The 2-hour age floor (WS_MAX_AGE_HOURS) is the actual safety
+    # margin against catching an in-flight run; hourly cadence keeps the
+    # worst-case orphan lifetime to ~3h instead of multi-day.
+    - cron: '17 * * * *'
+  workflow_dispatch:
+
+# Don't let two sweeps fight over the same daemon. Cron + a manual dispatch
+# could overlap; queue rather than parallel-delete.
+concurrency:
+  group: sweep-stale-ws-orphans
+  cancel-in-progress: false
+
+permissions:
+  contents: read
+
+env:
+  GITHUB_SERVER_URL: https://git.moleculesai.app
+
+jobs:
+  sweep:
+    name: Sweep ws-* orphans
+    # Pin to docker-host: the orphans live on the operator-host runner daemon
+    # (where local-provision-e2e.yml provisions them). A ubuntu-latest Windows
+    # act_runner would target the wrong docker daemon. See SUBSTRATE above.
+    runs-on: docker-host
+    # Critical janitor — fail loud, don't mask. A silently-failing sweeper is
+    # how the orphans accumulated in the first place (mirrors the
+    # continue-on-error removal rationale in sweep-stale-e2e-orgs.yml).
+    continue-on-error: false
+    timeout-minutes: 10
+    env:
+      # Containers younger than this are assumed to belong to an in-flight run
+      # and are left alone. The longest local-provision job is ~30 min, so 2h
+      # is a wide safety margin. Overridable via the WS_MAX_AGE_HOURS repo/org
+      # variable if a future lane runs longer; defaults to 2 when unset.
+      WS_MAX_AGE_HOURS: ${{ vars.WS_MAX_AGE_HOURS || '2' }}
+      # Refuse to remove more than this many containers in one tick. If the
+      # daemon enumeration goes weird (e.g. clock skew makes everything look
+      # old), bail rather than nuke a surprising number of containers.
+      SAFETY_CAP: '100'
+      # Dry-run escape hatch for manual investigation (set via env edit + a
+      # workflow_dispatch run). Default false = actually remove.
+      DRY_RUN: 'false'
+
+    steps:
+      - name: Sweep stale ws-* / molecule-local workspace orphans
+        run: |
+          set -uo pipefail
+          if ! docker info >/dev/null 2>&1; then
+            echo "::error::docker daemon not reachable on this runner — cannot sweep ws-* orphans."
+            exit 2
+          fi
+
+          # WS_MAX_AGE_HOURS can come from a repo/org variable, so validate it
+          # before doing arithmetic: an empty, non-numeric or zero value would
+          # either break $(( )) or erase the in-flight safety margin.
+          case "$WS_MAX_AGE_HOURS" in
+            ''|*[!0-9]*|0*)
+              echo "::error::WS_MAX_AGE_HOURS must be a positive integer without leading zeros (got '${WS_MAX_AGE_HOURS}')."
+              exit 1
+              ;;
+          esac
+
+          NOW=$(date +%s)
+          MAX_AGE_SECONDS=$(( WS_MAX_AGE_HOURS * 3600 ))
+          echo "Sweeping ws-* / molecule-local workspace containers older than ${WS_MAX_AGE_HOURS}h (${MAX_AGE_SECONDS}s). DRY_RUN=${DRY_RUN}"
+
+          # Enumerate candidate workspace containers two ways and de-dup:
+          #   1. name ^ws- (the provisioner's container name)
+          #   2. ancestor image molecule-local/* (the local workspace-template
+          #      + stub-runtime images this lane builds/tags) — catches a
+          #      container that (somehow) isn't named ws-* but is clearly ours.
+          candidates=$(
+            { docker ps -aq --filter "name=^ws-" 2>/dev/null
+              docker ps -aq --filter "ancestor=molecule-local/stub-runtime" 2>/dev/null
+              for img in $(docker images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null | grep '^molecule-local/' || true); do
+                docker ps -aq --filter "ancestor=$img" 2>/dev/null || true
+              done
+            } | sort -u
+          )
+
+          if [ -z "$candidates" ]; then
+            echo "No ws-* / molecule-local workspace containers present. Clean."
+            exit 0
+          fi
+
+          # Build the stale list (older than the age floor) first, so the
+          # safety cap can gate the actual removal.
+          stale=""
+          stale_n=0
+          for cid in $candidates; do
+            started=$(docker inspect -f '{{.State.StartedAt}}' "$cid" 2>/dev/null || true)
+            created=$(docker inspect -f '{{.Created}}' "$cid" 2>/dev/null || true)
+            ref="$started"
+            # A never-started (created-only) container has a zero StartedAt;
+            # fall back to Created so we still age it out.
+            case "$ref" in ''|0001-01-01*) ref="$created" ;; esac
+            [ -z "$ref" ] && continue
+            # GNU date on the Linux docker-host runner parses RFC3339 directly.
+            ts=$(date -d "$ref" +%s 2>/dev/null || echo 0)
+            [ "$ts" -eq 0 ] && continue
+            age=$(( NOW - ts ))
+            if [ "$age" -ge "$MAX_AGE_SECONDS" ]; then
+              name=$(docker inspect -f '{{.Name}}' "$cid" 2>/dev/null | sed 's#^/##' || echo "$cid")
+              echo " STALE (${age}s old): $name ($cid)"
+              stale="$stale $cid"
+              stale_n=$((stale_n + 1))
+            fi
+          done
+
+          if [ "$stale_n" -eq 0 ]; then
+            echo "Found $(echo "$candidates" | wc -w | tr -d ' ') workspace container(s) but none older than ${WS_MAX_AGE_HOURS}h. Nothing to remove."
+            exit 0
+          fi
+
+          if [ "$stale_n" -gt "$SAFETY_CAP" ]; then
+            echo "::error::Refusing to remove $stale_n containers in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means clock skew or a daemon-enumeration anomaly made everything look stale."
+            exit 1
+          fi
+
+          if [ "$DRY_RUN" = "true" ]; then
+            echo "DRY RUN — would remove $stale_n stale workspace container(s). Set DRY_RUN=false to actually remove."
+            exit 0
+          fi
+
+          removed=0
+          for cid in $stale; do
+            docker rm -f "$cid" >/dev/null 2>&1 && removed=$((removed + 1)) || echo " WARN: could not remove $cid (already gone?)"
+          done
+          echo "Sweep summary: removed=$removed of $stale_n stale workspace container(s)."
+
+      - name: Notify on sweep failure
+        # Fail-loud companion (mirrors sweep-stale-e2e-orgs.yml). A silently
+        # red sweeper is exactly how the orphans accumulated; tag the failure
+        # so the runs UI + any log-tail consumer flags it.
+        if: failure()
+        run: |
+          echo "::error::sweep-stale-ws-orphans FAILED — leaked ws-* workspace containers may be accumulating on a docker-host runner and pegging CPU (#2883). Check this runner's docker daemon manually (docker ps --filter name=^ws-) and prior step logs. Common causes: (a) docker daemon unreachable on the runner, (b) safety-cap tripped (clock skew / enumeration anomaly)."
+          exit 1