From 17f1f30b3ffbc3f587ca01eed1d02aae0d438e63 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 7 May 2026 13:04:57 -0700 Subject: [PATCH 1/2] fix(test): drain coalesceRestart goroutines before t.Cleanup (Class H, #170) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TestPooledWithEICTunnel_PreservesFnErr (and any sqlmock-using neighbour test) was at risk of inheriting stale INSERT calls from a previous test's coalesceRestart goroutine that survived its t.Cleanup boundary. The production callsite shape is `go h.RestartByID(...)` from a2a_proxy.go, a2a_proxy_helpers.go and main.go. When that goroutine's runRestartCycle panics, coalesceRestart's deferred recover swallows it to keep the platform process alive — but in tests, nothing waits for the goroutine to fully exit. If it's still draining LogActivity-shaped work after the test returns, those INSERTs land in the next test's sqlmock connection as kind=DELEGATION_FAILED / kind=WORKSPACE_PROVISION_FAILED, surfacing as "INSERT-not-expected". Fix: introduce drainCoalesceGoroutine(t, wsID, cycle) test helper that spawns coalesceRestart on a goroutine (matching production) and registers a t.Cleanup with sync.WaitGroup.Wait so the test can't declare itself done while a goroutine is still alive. Convert TestCoalesceRestart_PanicInCycleClearsState to use the helper (previously it called coalesceRestart synchronously, which never exercised the production goroutine-survival contract). Add TestCoalesceRestart_DrainHelperWaitsForGoroutineExit as the regression guard: cycle blocks 150ms then panics; the test asserts t.Run elapsed >= 150ms (proving the Wait barrier engaged) AND the deferred close ran (proving the panic-recovery defer chain executed) AND state.running was cleared. Verified the assertion is real by mutation-testing: removing t.Cleanup(wg.Wait) makes this test FAIL deterministically with elapsed <300µs. Per saved memory feedback_assert_exact_not_substring: the regression test asserts an exact-shape contract (elapsed >= blockFor) rather than a substring-in-output, so it discriminates between "drain works" and "drain skipped". Per Phase 3: 10/10 race-detector runs pass for all TestCoalesceRestart_* tests. Full ./internal/handlers/... suite green. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workspace_restart_coalesce_test.go | 173 +++++++++++++++++- 1 file changed, 164 insertions(+), 9 deletions(-) diff --git a/workspace-server/internal/handlers/workspace_restart_coalesce_test.go b/workspace-server/internal/handlers/workspace_restart_coalesce_test.go index 39e14219..e21147c5 100644 --- a/workspace-server/internal/handlers/workspace_restart_coalesce_test.go +++ b/workspace-server/internal/handlers/workspace_restart_coalesce_test.go @@ -1,6 +1,7 @@ package handlers import ( + "runtime" "sync" "sync/atomic" "testing" @@ -15,6 +16,42 @@ func resetRestartStatesFor(workspaceID string) { restartStates.Delete(workspaceID) } +// drainCoalesceGoroutine spawns `coalesceRestart(wsID, cycle)` on a +// goroutine that mirrors the real production caller shape +// (`go h.RestartByID(...)` from a2a_proxy.go, a2a_proxy_helpers.go, +// main.go), and registers a t.Cleanup that blocks until the goroutine +// has TERMINATED — not just panicked-and-recovered, fully exited. +// +// This is the bleed-prevention contract for Class H (Task #170): no +// test in this file may declare itself complete while a coalesceRestart +// goroutine it spawned is still alive, because that goroutine could +// otherwise wake up after the test's sqlmock has been closed and +// either: +// - issue a stale INSERT that gets attributed to the next test's +// sqlmock connection — surfaces as +// "INSERT-not-expected for kind=DELEGATION_FAILED" / =WORKSPACE_PROVISION_FAILED +// in a neighbour test that doesn't itself touch coalesceRestart; or +// - hold a reference to the closed *sql.DB and panic on the next op. +// +// Implementation notes: +// - sync.WaitGroup must be Add()ed BEFORE the goroutine is spawned; +// Add inside the goroutine races with Wait. +// - t.Cleanup runs in LIFO order, so this composes safely with other +// cleanups (e.g. setupTestDB's mockDB.Close). +// - We don't bound the Wait with a timeout — if the goroutine +// genuinely deadlocks, the whole test process should hang and fail +// under -timeout. A timeout-then-orphan would mask the bleed. +func drainCoalesceGoroutine(t *testing.T, wsID string, cycle func()) { + t.Helper() + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + coalesceRestart(wsID, cycle) + }() + t.Cleanup(wg.Wait) +} + // TestCoalesceRestart_SingleCallRunsOneCycle is the baseline: // no concurrency, one cycle. If this fails the gate logic is broken at // its simplest path. @@ -200,19 +237,45 @@ func TestCoalesceRestart_PanicInCycleClearsState(t *testing.T) { const wsID = "test-coalesce-panic-recovery" resetRestartStatesFor(wsID) - // First call's cycle panics. coalesceRestart's defer must swallow - // the panic so this test caller doesn't see it propagate up — that - // matches what the real production caller (`go h.RestartByID(...)`) - // gets: the goroutine survives, no process crash. - defer func() { - if r := recover(); r != nil { - t.Errorf("panic should NOT propagate out of coalesceRestart (would crash the platform process from a goroutine), got: %v", r) + // Spawn the panicking cycle on a goroutine via drainCoalesceGoroutine + // — this mirrors the real production callsite shape + // (`go h.RestartByID(...)` from a2a_proxy.go:584, + // a2a_proxy_helpers.go:197, main.go:213). The previous form called + // coalesceRestart synchronously, which neither exercised the + // goroutine-survival contract nor caught Class H bleed regressions + // where the panic-recovery goroutine outlives the test and pollutes + // the next test's sqlmock with INSERTs from runRestartCycle's + // LogActivity calls (kinds DELEGATION_FAILED / WORKSPACE_PROVISION_FAILED). + // + // drainCoalesceGoroutine registers a t.Cleanup that Wait()s for the + // goroutine to TERMINATE — not merely panic-and-recover — before + // the test ends. + drainCoalesceGoroutine(t, wsID, func() { panic("simulated cycle failure") }) + + // We need a mid-test barrier (not just the t.Cleanup-time barrier) + // so the second coalesceRestart below sees state.running=false. The + // goroutine clears state.running inside its deferred recover; poll + // the package-level restartStates map until that observable flip + // happens. Bound at 2s — longer = real bug. + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + sv, ok := restartStates.Load(wsID) + if ok { + st := sv.(*restartState) + st.mu.Lock() + running := st.running + st.mu.Unlock() + if !running { + break + } } - }() - coalesceRestart(wsID, func() { panic("simulated cycle failure") }) + time.Sleep(time.Millisecond) + } // Second call must run a fresh cycle. If running stayed true after // the panic, this call would early-return without invoking cycle. + // Synchronous — no panic, so no goroutine to drain, and we want to + // assert ran.Load() immediately after. var ran atomic.Bool coalesceRestart(wsID, func() { ran.Store(true) }) if !ran.Load() { @@ -220,6 +283,98 @@ func TestCoalesceRestart_PanicInCycleClearsState(t *testing.T) { } } +// TestCoalesceRestart_DrainHelperWaitsForGoroutineExit is the Class H +// regression guard for Task #170. It asserts the contract enforced by +// drainCoalesceGoroutine: t.Cleanup blocks until the spawned +// coalesceRestart goroutine has FULLY EXITED — not merely recovered +// from panic. This is the contract that prevents stale LogActivity +// INSERTs from a recovering goroutine bleeding into the next test's +// sqlmock (the failure mode reported as "INSERT-not-expected for +// kind=DELEGATION_FAILED" in TestPooledWithEICTunnel_PreservesFnErr). +// +// We use a deterministic bleed-shape probe rather than goroutine-count +// arithmetic: the cycle blocks on a release channel for ~150ms — long +// enough that without a Wait barrier, the outer sub-test would return +// before the goroutine exited. We then verify the wg.Wait inside +// drainCoalesceGoroutine actually delayed t.Run's completion: total +// elapsed must be >= the block duration. Asserts exact-shape, not +// substring (per saved-memory feedback_assert_exact_not_substring): +// elapsed < blockFor would mean the cleanup didn't wait, which is the +// exact bleed we're guarding against. +// +// We additionally panic from the cycle (after the block) to confirm +// the helper waits past panic recovery, not just past cycle return. +func TestCoalesceRestart_DrainHelperWaitsForGoroutineExit(t *testing.T) { + const blockFor = 150 * time.Millisecond + const wsID = "test-coalesce-drain-helper-contract" + resetRestartStatesFor(wsID) + + // done is closed inside the cycle, AFTER the block + AFTER the + // panic (which the deferred recover in coalesceRestart catches). + // Actually: defer in cycle runs before panic propagates to the + // outer recover. Use defer to close. + exited := make(chan struct{}) + + subStart := time.Now() + t.Run("drain_under_subtest", func(st *testing.T) { + drainCoalesceGoroutine(st, wsID, func() { + defer close(exited) + time.Sleep(blockFor) + panic("contract-test panic-after-block") + }) + // st.Cleanup runs here, before t.Run returns. wg.Wait must + // block until the goroutine has finished its panic recovery. + }) + subElapsed := time.Since(subStart) + + // Contract: the helper's wg.Wait MUST have blocked t.Run from + // returning until after the cycle's block + panic recovery. + if subElapsed < blockFor { + t.Fatalf( + "drainCoalesceGoroutine contract violated: t.Run returned in %v, "+ + "but cycle blocks for %v. The Wait barrier is broken — a "+ + "coalesceRestart goroutine can outlive its test's t.Cleanup "+ + "and pollute neighbour-test sqlmock state (Class H bleed).", + subElapsed, blockFor, + ) + } + + // And the goroutine must have actually closed `exited` (i.e. ran + // the deferred close before panic propagated through coalesceRestart's + // recover). If exited is still open here, the goroutine never + // reached the close — meaning either the panic short-circuited the + // defer (Go runtime bug — won't happen) or the goroutine never + // ran at all (drainCoalesceGoroutine spawn shape regressed). + select { + case <-exited: + // Correct path. + default: + t.Fatal("cycle goroutine never reached its deferred close — panic-recovery contract regressed") + } + + // Belt-and-suspenders: the post-recover state-clear must have + // flipped state.running back to false. If this fails, the panic + // path skipped the deferred state-clear in coalesceRestart. + sv, ok := restartStates.Load(wsID) + if !ok { + t.Fatal("restartStates entry missing for wsID after cycle — sync.Map regression") + } + st := sv.(*restartState) + st.mu.Lock() + running := st.running + st.mu.Unlock() + if running { + t.Error("state.running was not cleared after panic — sticky-running deadlock regressed") + } + + // Reference runtime.NumGoroutine to keep the runtime import + // honest — also a useful smoke check that the goroutine count + // hasn't ballooned 10x while debugging this test. + if n := runtime.NumGoroutine(); n > 200 { + t.Logf("warning: NumGoroutine=%d after drain — high but not necessarily a leak", n) + } +} + // TestCoalesceRestart_DifferentWorkspacesDoNotSerialize verifies the // per-workspace state map: an in-flight restart for ws A must not // block restarts for ws B. Important for performance — without this, -- 2.45.2 From e16d7eaa08ab567f1131d19ff056793b72b968cc Mon Sep 17 00:00:00 2001 From: devops-engineer Date: Thu, 7 May 2026 13:08:36 -0700 Subject: [PATCH 2/2] fix(ci): apply pre-clone fix to platform Dockerfile too (followup #173) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first PR (#38) only patched Dockerfile.tenant — but the workflow also builds the platform image from workspace-server/Dockerfile, which had the SAME in-image `git clone` stage. Build run #794 caught this: "process clone-manifest.sh ... exit code 128" on the platform image. Apply the same pre-clone shape to the platform Dockerfile: drop the `templates` stage, COPY from .tenant-bundle-deps/ instead. The workflow's existing "Pre-clone manifest deps" step (added in #38) already populates .tenant-bundle-deps/ before either build runs, so no workflow change needed. Self-review note: the missed-platform-Dockerfile is a Phase 1 quality miss — I read both files but only registered the tenant one as in-scope. Saved memory `feedback_orchestrator_must_verify_before_declaring_fixed` applies: should have grepped the whole workspace-server/ for "templates" stages before claiming Task #173 done. CI run #794 caught it within ~6 minutes; net cost: one followup commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace-server/Dockerfile | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/workspace-server/Dockerfile b/workspace-server/Dockerfile index dea2e223..3209e78a 100644 --- a/workspace-server/Dockerfile +++ b/workspace-server/Dockerfile @@ -1,7 +1,15 @@ -# Platform-only image (no canvas). Used by publish-platform-image workflow -# for GHCR + Fly registry. Tenant image uses Dockerfile.tenant instead. +# Platform-only image (no canvas). Used by publish-workspace-server-image +# workflow for ECR. Tenant image uses Dockerfile.tenant instead. # -# Build context: repo root. +# Templates + plugins are pre-cloned by scripts/clone-manifest.sh (in CI +# or on the operator host) into .tenant-bundle-deps/ — same pattern as +# Dockerfile.tenant. See that file's header for the full rationale; the +# short version is that post-2026-05-06 every workspace-template-* and +# org-template-* repo on Gitea is private, so an in-image `git clone` +# has no auth path that doesn't leak the Gitea token into a layer. +# +# Build context: repo root, with `.tenant-bundle-deps/` populated by the +# workflow's "Pre-clone manifest deps" step (Task #173). FROM golang:1.25-alpine AS builder WORKDIR /app @@ -26,21 +34,18 @@ RUN CGO_ENABLED=0 GOOS=linux go build \ -ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \ -o /memory-plugin ./cmd/memory-plugin-postgres -# Clone templates + plugins at build time from manifest.json -FROM alpine:3.20 AS templates -RUN apk add --no-cache git jq -COPY manifest.json /manifest.json -COPY scripts/clone-manifest.sh /scripts/clone-manifest.sh -RUN chmod +x /scripts/clone-manifest.sh && /scripts/clone-manifest.sh /manifest.json /workspace-configs-templates /org-templates /plugins - FROM alpine:3.20 RUN apk add --no-cache ca-certificates git tzdata wget COPY --from=builder /platform /platform COPY --from=builder /memory-plugin /memory-plugin COPY workspace-server/migrations /migrations -COPY --from=templates /workspace-configs-templates /workspace-configs-templates -COPY --from=templates /org-templates /org-templates -COPY --from=templates /plugins /plugins +# Templates + plugins (pre-cloned by scripts/clone-manifest.sh in the +# trusted CI / operator-host context, .git already stripped). The Gitea +# token used to clone them never enters this image — same shape as +# Dockerfile.tenant. +COPY .tenant-bundle-deps/workspace-configs-templates /workspace-configs-templates +COPY .tenant-bundle-deps/org-templates /org-templates +COPY .tenant-bundle-deps/plugins /plugins # Non-root runtime with Docker socket access for workspace provisioning. RUN addgroup -g 1000 platform && adduser -u 1000 -G platform -s /bin/sh -D platform EXPOSE 8080 -- 2.45.2