From 17f1f30b3ffbc3f587ca01eed1d02aae0d438e63 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwang.rabbit@users.noreply.github.com>
Date: Thu, 7 May 2026 13:04:57 -0700
Subject: [PATCH 1/2] fix(test): drain coalesceRestart goroutines before
 t.Cleanup (Class H, #170)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TestPooledWithEICTunnel_PreservesFnErr (and any sqlmock-using neighbour
test) was at risk of inheriting stale INSERT calls from a previous
test's coalesceRestart goroutine that survived its t.Cleanup boundary.

The production callsite shape is `go h.RestartByID(...)` from
a2a_proxy.go, a2a_proxy_helpers.go and main.go. When that goroutine's
runRestartCycle panics, coalesceRestart's deferred recover swallows it
to keep the platform process alive — but in tests, nothing waits for
the goroutine to fully exit. If it's still draining LogActivity-shaped
work after the test returns, those INSERTs land in the next test's
sqlmock connection as kind=DELEGATION_FAILED /
kind=WORKSPACE_PROVISION_FAILED, surfacing as "INSERT-not-expected".

Fix: introduce drainCoalesceGoroutine(t, wsID, cycle) test helper that
spawns coalesceRestart on a goroutine (matching production) and
registers a t.Cleanup with sync.WaitGroup.Wait so the test can't
declare itself done while a goroutine is still alive.

Convert TestCoalesceRestart_PanicInCycleClearsState to use the helper
(previously it called coalesceRestart synchronously, which never
exercised the production goroutine-survival contract).

Add TestCoalesceRestart_DrainHelperWaitsForGoroutineExit as the
regression guard: cycle blocks 150ms then panics; the test asserts
t.Run elapsed >= 150ms (proving the Wait barrier engaged) AND the
deferred close ran (proving the panic-recovery defer chain executed)
AND state.running was cleared. Verified the assertion is real by
mutation-testing: removing t.Cleanup(wg.Wait) makes this test FAIL
deterministically with elapsed <300µs.

Per saved memory feedback_assert_exact_not_substring: the regression
test asserts an exact-shape contract (elapsed >= blockFor) rather than
a substring-in-output, so it discriminates between "drain works" and
"drain skipped".

Per Phase 3: 10/10 race-detector runs pass for all TestCoalesceRestart_*
tests. Full ./internal/handlers/... suite green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../workspace_restart_coalesce_test.go        | 173 +++++++++++++++++-
 1 file changed, 164 insertions(+), 9 deletions(-)

diff --git a/workspace-server/internal/handlers/workspace_restart_coalesce_test.go b/workspace-server/internal/handlers/workspace_restart_coalesce_test.go
index 39e14219..e21147c5 100644
--- a/workspace-server/internal/handlers/workspace_restart_coalesce_test.go
+++ b/workspace-server/internal/handlers/workspace_restart_coalesce_test.go
@@ -1,6 +1,7 @@
 package handlers
 
 import (
+	"runtime"
 	"sync"
 	"sync/atomic"
 	"testing"
@@ -15,6 +16,42 @@ func resetRestartStatesFor(workspaceID string) {
 	restartStates.Delete(workspaceID)
 }
 
+// drainCoalesceGoroutine spawns `coalesceRestart(wsID, cycle)` on a
+// goroutine that mirrors the real production caller shape
+// (`go h.RestartByID(...)` from a2a_proxy.go, a2a_proxy_helpers.go,
+// main.go), and registers a t.Cleanup that blocks until the goroutine
+// has TERMINATED — not just panicked-and-recovered, fully exited.
+//
+// This is the bleed-prevention contract for Class H (Task #170): no
+// test in this file may declare itself complete while a coalesceRestart
+// goroutine it spawned is still alive, because that goroutine could
+// otherwise wake up after the test's sqlmock has been closed and
+// either:
+//   - issue a stale INSERT that gets attributed to the next test's
+//     sqlmock connection — surfaces as
+//     "INSERT-not-expected for kind=DELEGATION_FAILED" / =WORKSPACE_PROVISION_FAILED
+//     in a neighbour test that doesn't itself touch coalesceRestart; or
+//   - hold a reference to the closed *sql.DB and panic on the next op.
+//
+// Implementation notes:
+//   - sync.WaitGroup must be Add()ed BEFORE the goroutine is spawned;
+//     Add inside the goroutine races with Wait.
+//   - t.Cleanup runs in LIFO order, so this composes safely with other
+//     cleanups (e.g. setupTestDB's mockDB.Close).
+//   - We don't bound the Wait with a timeout — if the goroutine
+//     genuinely deadlocks, the whole test process should hang and fail
+//     under -timeout. A timeout-then-orphan would mask the bleed.
+func drainCoalesceGoroutine(t *testing.T, wsID string, cycle func()) {
+	t.Helper()
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		coalesceRestart(wsID, cycle)
+	}()
+	t.Cleanup(wg.Wait)
+}
+
 // TestCoalesceRestart_SingleCallRunsOneCycle is the baseline:
 // no concurrency, one cycle. If this fails the gate logic is broken at
 // its simplest path.
@@ -200,19 +237,45 @@ func TestCoalesceRestart_PanicInCycleClearsState(t *testing.T) {
 	const wsID = "test-coalesce-panic-recovery"
 	resetRestartStatesFor(wsID)
 
-	// First call's cycle panics. coalesceRestart's defer must swallow
-	// the panic so this test caller doesn't see it propagate up — that
-	// matches what the real production caller (`go h.RestartByID(...)`)
-	// gets: the goroutine survives, no process crash.
-	defer func() {
-		if r := recover(); r != nil {
-			t.Errorf("panic should NOT propagate out of coalesceRestart (would crash the platform process from a goroutine), got: %v", r)
+	// Spawn the panicking cycle on a goroutine via drainCoalesceGoroutine
+	// — this mirrors the real production callsite shape
+	// (`go h.RestartByID(...)` from a2a_proxy.go:584,
+	// a2a_proxy_helpers.go:197, main.go:213). The previous form called
+	// coalesceRestart synchronously, which neither exercised the
+	// goroutine-survival contract nor caught Class H bleed regressions
+	// where the panic-recovery goroutine outlives the test and pollutes
+	// the next test's sqlmock with INSERTs from runRestartCycle's
+	// LogActivity calls (kinds DELEGATION_FAILED / WORKSPACE_PROVISION_FAILED).
+	//
+	// drainCoalesceGoroutine registers a t.Cleanup that Wait()s for the
+	// goroutine to TERMINATE — not merely panic-and-recover — before
+	// the test ends.
+	drainCoalesceGoroutine(t, wsID, func() { panic("simulated cycle failure") })
+
+	// We need a mid-test barrier (not just the t.Cleanup-time barrier)
+	// so the second coalesceRestart below sees state.running=false. The
+	// goroutine clears state.running inside its deferred recover; poll
+	// the package-level restartStates map until that observable flip
+	// happens. Bound at 2s — longer = real bug.
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		sv, ok := restartStates.Load(wsID)
+		if ok {
+			st := sv.(*restartState)
+			st.mu.Lock()
+			running := st.running
+			st.mu.Unlock()
+			if !running {
+				break
+			}
 		}
-	}()
-	coalesceRestart(wsID, func() { panic("simulated cycle failure") })
+		time.Sleep(time.Millisecond)
+	}
 
 	// Second call must run a fresh cycle. If running stayed true after
 	// the panic, this call would early-return without invoking cycle.
+	// Synchronous — no panic, so no goroutine to drain, and we want to
+	// assert ran.Load() immediately after.
 	var ran atomic.Bool
 	coalesceRestart(wsID, func() { ran.Store(true) })
 	if !ran.Load() {
@@ -220,6 +283,98 @@ func TestCoalesceRestart_PanicInCycleClearsState(t *testing.T) {
 	}
 }
 
+// TestCoalesceRestart_DrainHelperWaitsForGoroutineExit is the Class H
+// regression guard for Task #170. It asserts the contract enforced by
+// drainCoalesceGoroutine: t.Cleanup blocks until the spawned
+// coalesceRestart goroutine has FULLY EXITED — not merely recovered
+// from panic. This is the contract that prevents stale LogActivity
+// INSERTs from a recovering goroutine bleeding into the next test's
+// sqlmock (the failure mode reported as "INSERT-not-expected for
+// kind=DELEGATION_FAILED" in TestPooledWithEICTunnel_PreservesFnErr).
+//
+// We use a deterministic bleed-shape probe rather than goroutine-count
+// arithmetic: the cycle blocks on a release channel for ~150ms — long
+// enough that without a Wait barrier, the outer sub-test would return
+// before the goroutine exited. We then verify the wg.Wait inside
+// drainCoalesceGoroutine actually delayed t.Run's completion: total
+// elapsed must be >= the block duration. Asserts exact-shape, not
+// substring (per saved-memory feedback_assert_exact_not_substring):
+// elapsed < blockFor would mean the cleanup didn't wait, which is the
+// exact bleed we're guarding against.
+//
+// We additionally panic from the cycle (after the block) to confirm
+// the helper waits past panic recovery, not just past cycle return.
+func TestCoalesceRestart_DrainHelperWaitsForGoroutineExit(t *testing.T) {
+	const blockFor = 150 * time.Millisecond
+	const wsID = "test-coalesce-drain-helper-contract"
+	resetRestartStatesFor(wsID)
+
+	// done is closed inside the cycle, AFTER the block + AFTER the
+	// panic (which the deferred recover in coalesceRestart catches).
+	// Actually: defer in cycle runs before panic propagates to the
+	// outer recover. Use defer to close.
+	exited := make(chan struct{})
+
+	subStart := time.Now()
+	t.Run("drain_under_subtest", func(st *testing.T) {
+		drainCoalesceGoroutine(st, wsID, func() {
+			defer close(exited)
+			time.Sleep(blockFor)
+			panic("contract-test panic-after-block")
+		})
+		// st.Cleanup runs here, before t.Run returns. wg.Wait must
+		// block until the goroutine has finished its panic recovery.
+	})
+	subElapsed := time.Since(subStart)
+
+	// Contract: the helper's wg.Wait MUST have blocked t.Run from
+	// returning until after the cycle's block + panic recovery.
+	if subElapsed < blockFor {
+		t.Fatalf(
+			"drainCoalesceGoroutine contract violated: t.Run returned in %v, "+
+				"but cycle blocks for %v. The Wait barrier is broken — a "+
+				"coalesceRestart goroutine can outlive its test's t.Cleanup "+
+				"and pollute neighbour-test sqlmock state (Class H bleed).",
+			subElapsed, blockFor,
+		)
+	}
+
+	// And the goroutine must have actually closed `exited` (i.e. ran
+	// the deferred close before panic propagated through coalesceRestart's
+	// recover). If exited is still open here, the goroutine never
+	// reached the close — meaning either the panic short-circuited the
+	// defer (Go runtime bug — won't happen) or the goroutine never
+	// ran at all (drainCoalesceGoroutine spawn shape regressed).
+	select {
+	case <-exited:
+		// Correct path.
+	default:
+		t.Fatal("cycle goroutine never reached its deferred close — panic-recovery contract regressed")
+	}
+
+	// Belt-and-suspenders: the post-recover state-clear must have
+	// flipped state.running back to false. If this fails, the panic
+	// path skipped the deferred state-clear in coalesceRestart.
+	sv, ok := restartStates.Load(wsID)
+	if !ok {
+		t.Fatal("restartStates entry missing for wsID after cycle — sync.Map regression")
+	}
+	st := sv.(*restartState)
+	st.mu.Lock()
+	running := st.running
+	st.mu.Unlock()
+	if running {
+		t.Error("state.running was not cleared after panic — sticky-running deadlock regressed")
+	}
+
+	// Reference runtime.NumGoroutine to keep the runtime import
+	// honest — also a useful smoke check that the goroutine count
+	// hasn't ballooned 10x while debugging this test.
+	if n := runtime.NumGoroutine(); n > 200 {
+		t.Logf("warning: NumGoroutine=%d after drain — high but not necessarily a leak", n)
+	}
+}
+
 // TestCoalesceRestart_DifferentWorkspacesDoNotSerialize verifies the
 // per-workspace state map: an in-flight restart for ws A must not
 // block restarts for ws B. Important for performance — without this,
-- 
2.45.2


From e16d7eaa08ab567f1131d19ff056793b72b968cc Mon Sep 17 00:00:00 2001
From: devops-engineer <devops-engineer@molecule.ai>
Date: Thu, 7 May 2026 13:08:36 -0700
Subject: [PATCH 2/2] fix(ci): apply pre-clone fix to platform Dockerfile too
 (followup #173)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The first PR (#38) only patched Dockerfile.tenant — but the workflow
also builds the platform image from workspace-server/Dockerfile, which
had the SAME in-image `git clone` stage. Build run #794 caught this:
"process clone-manifest.sh ... exit code 128" on the platform image.

Apply the same pre-clone shape to the platform Dockerfile: drop the
`templates` stage, COPY from .tenant-bundle-deps/ instead. The
workflow's existing "Pre-clone manifest deps" step (added in #38)
already populates .tenant-bundle-deps/ before either build runs, so no
workflow change needed.

Self-review note: the missed-platform-Dockerfile is a Phase 1 quality
miss — I read both files but only registered the tenant one as
in-scope. Saved memory `feedback_orchestrator_must_verify_before_declaring_fixed`
applies: should have grepped the whole workspace-server/ for "templates"
stages before claiming Task #173 done. CI run #794 caught it within
~6 minutes; net cost: one followup commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 workspace-server/Dockerfile | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/workspace-server/Dockerfile b/workspace-server/Dockerfile
index dea2e223..3209e78a 100644
--- a/workspace-server/Dockerfile
+++ b/workspace-server/Dockerfile
@@ -1,7 +1,15 @@
-# Platform-only image (no canvas). Used by publish-platform-image workflow
-# for GHCR + Fly registry. Tenant image uses Dockerfile.tenant instead.
+# Platform-only image (no canvas). Used by publish-workspace-server-image
+# workflow for ECR. Tenant image uses Dockerfile.tenant instead.
 #
-# Build context: repo root.
+# Templates + plugins are pre-cloned by scripts/clone-manifest.sh (in CI
+# or on the operator host) into .tenant-bundle-deps/ — same pattern as
+# Dockerfile.tenant. See that file's header for the full rationale; the
+# short version is that post-2026-05-06 every workspace-template-* and
+# org-template-* repo on Gitea is private, so an in-image `git clone`
+# has no auth path that doesn't leak the Gitea token into a layer.
+#
+# Build context: repo root, with `.tenant-bundle-deps/` populated by the
+# workflow's "Pre-clone manifest deps" step (Task #173).
 
 FROM golang:1.25-alpine AS builder
 WORKDIR /app
@@ -26,21 +34,18 @@ RUN CGO_ENABLED=0 GOOS=linux go build \
     -ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
     -o /memory-plugin ./cmd/memory-plugin-postgres
 
-# Clone templates + plugins at build time from manifest.json
-FROM alpine:3.20 AS templates
-RUN apk add --no-cache git jq
-COPY manifest.json /manifest.json
-COPY scripts/clone-manifest.sh /scripts/clone-manifest.sh
-RUN chmod +x /scripts/clone-manifest.sh && /scripts/clone-manifest.sh /manifest.json /workspace-configs-templates /org-templates /plugins
-
 FROM alpine:3.20
 RUN apk add --no-cache ca-certificates git tzdata wget
 COPY --from=builder /platform /platform
 COPY --from=builder /memory-plugin /memory-plugin
 COPY workspace-server/migrations /migrations
-COPY --from=templates /workspace-configs-templates /workspace-configs-templates
-COPY --from=templates /org-templates /org-templates
-COPY --from=templates /plugins /plugins
+# Templates + plugins (pre-cloned by scripts/clone-manifest.sh in the
+# trusted CI / operator-host context, .git already stripped). The Gitea
+# token used to clone them never enters this image — same shape as
+# Dockerfile.tenant.
+COPY .tenant-bundle-deps/workspace-configs-templates /workspace-configs-templates
+COPY .tenant-bundle-deps/org-templates /org-templates
+COPY .tenant-bundle-deps/plugins /plugins
 # Non-root runtime with Docker socket access for workspace provisioning.
 RUN addgroup -g 1000 platform && adduser -u 1000 -G platform -s /bin/sh -D platform
 EXPOSE 8080
-- 
2.45.2