Merge pull request #2725 from Molecule-AI/fix/team-expand-routes-via-auto-dispatcher

fix(team): route Expand children through provisionWorkspaceAuto so SaaS gets per-workspace EC2
This commit is contained in:
Hongming Wang 2026-05-04 10:46:44 +00:00 committed by GitHub
commit 1d3d18fd66
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 217 additions and 8 deletions

View File

@ -138,14 +138,23 @@ func (h *TeamHandler) Expand(c *gin.Context) {
// and every other preflight (secrets, env mutators, identity
// injection, missing-env). That left every child with NULL
// platform_inbound_secret and never-issued auth_token. Now
// children go through the same provisionWorkspace path as
// children go through the same provisionWorkspaceAuto path as
// Create/Restart, so adding a future provision-time step
// automatically covers Expand too.
//
// 2026-05-04 follow-up: switched from provisionWorkspace
// (hardcoded Docker) to provisionWorkspaceAuto (picks CP for
// SaaS, Docker for self-hosted). Pre-fix, deploying a team on
// a SaaS tenant created child rows but never an EC2 instance —
// the 600s sweeper logged the misleading "container started
// but never called /registry/register". Templates only own
// shape (config/prompts/files/plugins/runtime); the platform
// owns where it runs.
if h.wh != nil && sub.Config != "" {
templatePath := filepath.Join(h.configsDir, sub.Config)
if _, err := os.Stat(templatePath); err == nil {
parent := parentID // copy for closure
go h.wh.provisionWorkspace(childID, templatePath, nil, models.CreateWorkspacePayload{
h.wh.provisionWorkspaceAuto(childID, templatePath, nil, models.CreateWorkspacePayload{
Name: childName,
Role: sub.Role,
Tier: tier,

View File

@ -96,6 +96,33 @@ func (h *WorkspaceHandler) SetCPProvisioner(cp provisioner.CPProvisionerAPI) {
h.cpProv = cp
}
// provisionWorkspaceAuto picks the backend (CP for SaaS, local Docker
// for self-hosted) and starts provisioning in a goroutine. Returns true
// when a backend was kicked off, false when neither is wired (caller
// owns the persist-config + mark-failed surface in that case).
//
// Centralized so every caller — Create, TeamHandler.Expand, future
// paths — gets the same routing. Pre-2026-05-04 TeamHandler.Expand
// hardcoded provisionWorkspace (Docker) and silently broke the
// "deploy a team on SaaS" flow: child workspace rows were created with
// no EC2 instance, the runtime never ran, and the 600s sweeper logged
// the misleading "container started but never called /registry/register".
//
// Architectural principle: templates own runtime/config/prompts/files/
// plugins; the platform owns where it runs. Anything that picks
// between CP and local Docker belongs in this one helper.
func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool {
if h.cpProv != nil {
go h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
return true
}
if h.provisioner != nil {
go h.provisionWorkspace(workspaceID, templatePath, configFiles, payload)
return true
}
return false
}
// SetEnvMutators wires a provisionhook.Registry into the handler. Plugins
// living in separate repos register on the same Registry instance during
// boot (see cmd/server/main.go) and main.go calls this setter once before
@ -521,12 +548,15 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
configFiles = h.ensureDefaultConfig(id, payload)
}
// Auto-provision — pick backend: control plane (SaaS) or Docker (self-hosted)
if h.cpProv != nil {
go h.provisionWorkspaceCP(id, templatePath, configFiles, payload)
} else if h.provisioner != nil {
go h.provisionWorkspace(id, templatePath, configFiles, payload)
} else {
// Auto-provision — pick backend: control plane (SaaS) or Docker (self-hosted).
// Routing is centralized in provisionWorkspaceAuto so every caller
// (Create, TeamHandler.Expand, future paths) gets the same backend
// selection. Pre-2026-05-04 the team-deploy path hardcoded the
// Docker route, so on a SaaS tenant 7-of-7 sub-agents were created
// as DB rows but had no EC2 — symptom: "container started but never
// called /registry/register" + diagnose returns "docker client not
// configured". Centralizing here closes that drift class.
if !h.provisionWorkspaceAuto(id, templatePath, configFiles, payload) {
// No Docker available (SaaS tenant). Persist basic config as JSON
// so the Config tab shows the correct runtime/model/name. Then mark
// the workspace as failed with a clear message.

View File

@ -0,0 +1,170 @@
package handlers
// Pins the backend-dispatcher invariant added 2026-05-04.
//
// Before the fix, TeamHandler.Expand hardcoded the Docker provisioner
// (provisionWorkspace), so on a SaaS tenant where the workspace-server
// has no docker socket, child workspaces were created as DB rows but
// never got an EC2 instance. The 600s sweeper then logged the misleading
// "container started but never called /registry/register".
//
// The fix centralizes backend selection in
// WorkspaceHandler.provisionWorkspaceAuto and routes both Create and
// TeamHandler.Expand through it. These tests pin:
//
// 1. Auto returns false when neither backend is wired (caller must
// persist + mark-failed itself).
// 2. Auto picks CP when cpProv is set.
// 3. team.go uses provisionWorkspaceAuto, not provisionWorkspace
// directly (source-level guard against the original drift).
import (
"bytes"
"context"
"errors"
"os"
"path/filepath"
"sync"
"testing"
"time"
"github.com/DATA-DOG/go-sqlmock"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
)
// trackingCPProv records every Start() call in a thread-safe slice.
// Defined locally to avoid coupling this test to the recordingCPProv
// in workspace_provision_concurrent_repro_test.go (whose Stop/etc.
// methods panic — fine there, would be noise here).
type trackingCPProv struct {
mu sync.Mutex
started []string
startErr error
}
func (r *trackingCPProv) Start(_ context.Context, cfg provisioner.WorkspaceConfig) (string, error) {
r.mu.Lock()
r.started = append(r.started, cfg.WorkspaceID)
r.mu.Unlock()
if r.startErr != nil {
return "", r.startErr
}
return "i-stub-" + cfg.WorkspaceID, nil
}
func (r *trackingCPProv) Stop(_ context.Context, _ string) error { return nil }
func (r *trackingCPProv) GetConsoleOutput(_ context.Context, _ string) (string, error) {
return "", nil
}
func (r *trackingCPProv) IsRunning(_ context.Context, _ string) (bool, error) { return true, nil }
func (r *trackingCPProv) startedSnapshot() []string {
r.mu.Lock()
defer r.mu.Unlock()
out := make([]string, len(r.started))
copy(out, r.started)
return out
}
// TestProvisionWorkspaceAuto_NoBackendReturnsFalse — when neither
// cpProv nor provisioner is wired, the dispatcher returns false so the
// caller knows it must own the persist + mark-failed path. Pre-fix,
// TeamHandler had no equivalent fallback at all and silently dropped
// children on the floor.
func TestProvisionWorkspaceAuto_NoBackendReturnsFalse(t *testing.T) {
bcast := &concurrentSafeBroadcaster{}
h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir())
// Do NOT call SetCPProvisioner — both backends nil.
ok := h.provisionWorkspaceAuto("ws-noback", "", nil, models.CreateWorkspacePayload{
Name: "noback", Tier: 1, Runtime: "claude-code",
})
if ok {
t.Fatalf("expected provisionWorkspaceAuto to return false with no backend wired")
}
}
// TestProvisionWorkspaceAuto_RoutesToCPWhenSet — when cpProv is set
// (SaaS tenant), Auto MUST route there. CP wins because per-workspace
// EC2 is the SaaS path; Docker would silently fail "no docker socket"
// on the tenant EC2.
//
// This is the regression-prevention test for the Design Director bug
// where 7-of-7 sub-agents went down the Docker path on SaaS.
func TestProvisionWorkspaceAuto_RoutesToCPWhenSet(t *testing.T) {
mock := setupTestDB(t)
mock.MatchExpectationsInOrder(false)
// provisionWorkspaceCP runs in the goroutine and will hit:
// secrets SELECTs + UPDATE workspace as failed (because we make
// CP Start return an error to short-circuit the rest of the path).
mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM global_secrets`).
WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"}))
mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM workspace_secrets`).
WithArgs(sqlmock.AnyArg()).
WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"}))
mock.ExpectExec(`UPDATE workspaces SET status =`).
WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
rec := &trackingCPProv{startErr: errors.New("simulated CP rejection")}
bcast := &concurrentSafeBroadcaster{}
h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir())
h.SetCPProvisioner(rec)
wsID := "ws-routes-to-cp-0123456789abcdef"
ok := h.provisionWorkspaceAuto(wsID, "", nil, models.CreateWorkspacePayload{
Name: "test", Tier: 1, Runtime: "claude-code",
})
if !ok {
t.Fatalf("expected provisionWorkspaceAuto to return true with CP wired")
}
// Wait for the goroutine to land in cpProv.Start (or give up).
deadline := time.Now().Add(2 * time.Second)
for {
if len(rec.startedSnapshot()) > 0 {
break
}
if time.Now().After(deadline) {
t.Fatalf("timed out waiting for cpProv.Start; recorded=%v", rec.startedSnapshot())
}
time.Sleep(20 * time.Millisecond)
}
got := rec.startedSnapshot()
if len(got) != 1 || got[0] != wsID {
t.Errorf("expected cpProv.Start invoked once with %q, got %v", wsID, got)
}
}
// TestTeamExpand_UsesAutoNotDirectDockerPath — source-level guard: if
// a future refactor reintroduces a hardcoded `h.wh.provisionWorkspace`
// call in team.go, this fails. Pre-fix the hardcoded call was the bug.
//
// Substring match on the source rather than AST because the failure
// shape is "wrong function name" — a plain text gate suffices.
// Per `feedback_behavior_based_ast_gates.md` we'd usually pin the
// behavior, but the behavior here ("calls dispatcher, not dispatcher's
// docker leg") is awkward to assert without standing up the entire
// Expand stack — the auto test above covers the dispatcher behavior;
// this test is the cheap source-level seatbelt for the call site.
func TestTeamExpand_UsesAutoNotDirectDockerPath(t *testing.T) {
wd, err := os.Getwd()
if err != nil {
t.Fatalf("getwd: %v", err)
}
src, err := os.ReadFile(filepath.Join(wd, "team.go"))
if err != nil {
t.Fatalf("read team.go: %v", err)
}
if bytes.Contains(src, []byte("h.wh.provisionWorkspace(")) {
t.Errorf("team.go calls h.wh.provisionWorkspace directly — must use h.wh.provisionWorkspaceAuto so SaaS tenants route to CP. " +
"Pre-2026-05-04 the direct call sent every team child down the Docker path on SaaS, " +
"creating workspace rows with no EC2 instance.")
}
if !bytes.Contains(src, []byte("h.wh.provisionWorkspaceAuto(")) {
t.Errorf("team.go must call h.wh.provisionWorkspaceAuto for child provisioning — current code does not")
}
}