From 35f5b91f5db46f87411b3d2a6ca4f5ee10ed537d Mon Sep 17 00:00:00 2001
From: core-devops
Date: Tue, 9 Jun 2026 08:54:46 -0700
Subject: [PATCH] fix(ci): self-heal e2e-chat testcontainer leaks (pre-run sweep + timeout cleanup)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

E2E Chat starts per-run `pg-/redis-e2e-chat-*` containers and already has
an `if: always()` "Stop service containers" step — but it still leaks: a
cancelled/killed run never runs always(), and `docker rm -f … || true`
silently swallows a failure when the (shared, overloaded) operator daemon
wedges the removal. Result: 13 such containers found running 12 days–2 weeks
on the operator, all from failed/cancelled runs — feeding the daemon-churn
that wedges buildkit (controlplane#646).

Durable fix = make leaks self-heal instead of depending on every run's own
cleanup:

- New pre-run "Sweep stale e2e-chat testcontainers" step reaps any e2e-chat
  container older than 2h (>> the 15m job), so each run reaps predecessors'
  leaks regardless of why they leaked. Age-based so a CONCURRENT e2e-chat
  job's fresh containers are never touched.
- Wrap the always() cleanup `docker rm` calls in `timeout 30` so a wedged
  daemon can't hang the cleanup step (a hung rm is itself a leak source).

Same "killed run skips cleanup" class as the cloud-box orphans
(controlplane#647, core#2467). No test-logic change.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 .gitea/workflows/e2e-chat.yml | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/.gitea/workflows/e2e-chat.yml b/.gitea/workflows/e2e-chat.yml
index cd29f0fa1..cbccb1179 100644
--- a/.gitea/workflows/e2e-chat.yml
+++ b/.gitea/workflows/e2e-chat.yml
@@ -165,6 +165,33 @@ jobs:
           cache: 'npm'
           cache-dependency-path: canvas/package-lock.json
 
+      - name: Sweep stale e2e-chat testcontainers (self-heal prior leaks)
+        if: needs.detect-changes.outputs.chat == 'true'
+        run: |
+          # Prior e2e-chat runs that were cancelled/killed — or whose always()
+          # cleanup hit a wedged docker daemon — leak their pg-/redis-e2e-chat-*
+          # containers, which then pile up on the shared runner host (observed: 13
+          # such containers, up to 2 weeks old, on the operator daemon). Reap any
+          # e2e-chat container older than the job window so leaks self-heal every
+          # run instead of relying on each run's own cleanup succeeding. Age-based
+          # (>2h, well beyond the 15m job) so a CONCURRENT e2e-chat job's fresh
+          # containers are never touched. See controlplane#646.
+          # Every docker call below is timeout-bounded: this step exists because
+          # the daemon wedges, so an unbounded ps/inspect could hang the job itself.
+          now=$(date -u +%s)
+          timeout 30 docker ps -a --filter name=e2e-chat --format '{{.Names}}' | while read -r c; do
+            [ -n "$c" ] || continue
+            created=$(timeout 30 docker inspect -f '{{.Created}}' "$c" 2>/dev/null) || continue
+            cts=$(date -u -d "$created" +%s 2>/dev/null) || continue
+            if [ $(( now - cts )) -gt 7200 ]; then
+              echo "sweeping stale e2e-chat container $c (created $created)"
+              # Still best-effort, but report a failed removal instead of hiding
+              # it; the next run's sweep retries anything left behind.
+              timeout 30 docker rm -f "$c" >/dev/null 2>&1 \
+                || echo "WARNING: failed to remove $c (rm timed out or daemon error)"
+            fi
+          done
+
       - name: Start Postgres (docker)
         if: needs.detect-changes.outputs.chat == 'true'
         run: |
@@ -430,5 +457,7 @@ jobs:
       - name: Stop service containers
         if: always() && needs.detect-changes.outputs.chat == 'true'
         run: |
-          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
-          docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
+          # timeout-wrap so a wedged docker daemon can't hang this always() step
+          # (a hung rm here is one way containers leak in the first place).
+          timeout 30 docker rm -f "$PG_CONTAINER" 2>/dev/null || true
+          timeout 30 docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
-- 
2.52.0