forked from molecule-ai/molecule-core
Merge pull request #2725 from Molecule-AI/fix/team-expand-routes-via-auto-dispatcher
fix(team): route Expand children through provisionWorkspaceAuto so SaaS gets per-workspace EC2
This commit is contained in:
commit
1d3d18fd66
@ -138,14 +138,23 @@ func (h *TeamHandler) Expand(c *gin.Context) {
|
||||
// and every other preflight (secrets, env mutators, identity
|
||||
// injection, missing-env). That left every child with NULL
|
||||
// platform_inbound_secret and never-issued auth_token. Now
|
||||
// children go through the same provisionWorkspace path as
|
||||
// children go through the same provisionWorkspaceAuto path as
|
||||
// Create/Restart, so adding a future provision-time step
|
||||
// automatically covers Expand too.
|
||||
//
|
||||
// 2026-05-04 follow-up: switched from provisionWorkspace
|
||||
// (hardcoded Docker) to provisionWorkspaceAuto (picks CP for
|
||||
// SaaS, Docker for self-hosted). Pre-fix, deploying a team on
|
||||
// a SaaS tenant created child rows but never an EC2 instance —
|
||||
// the 600s sweeper logged the misleading "container started
|
||||
// but never called /registry/register". Templates only own
|
||||
// shape (config/prompts/files/plugins/runtime); the platform
|
||||
// owns where it runs.
|
||||
if h.wh != nil && sub.Config != "" {
|
||||
templatePath := filepath.Join(h.configsDir, sub.Config)
|
||||
if _, err := os.Stat(templatePath); err == nil {
|
||||
parent := parentID // copy for closure
|
||||
go h.wh.provisionWorkspace(childID, templatePath, nil, models.CreateWorkspacePayload{
|
||||
h.wh.provisionWorkspaceAuto(childID, templatePath, nil, models.CreateWorkspacePayload{
|
||||
Name: childName,
|
||||
Role: sub.Role,
|
||||
Tier: tier,
|
||||
|
||||
@ -96,6 +96,33 @@ func (h *WorkspaceHandler) SetCPProvisioner(cp provisioner.CPProvisionerAPI) {
|
||||
h.cpProv = cp
|
||||
}
|
||||
|
||||
// provisionWorkspaceAuto picks the backend (CP for SaaS, local Docker
|
||||
// for self-hosted) and starts provisioning in a goroutine. Returns true
|
||||
// when a backend was kicked off, false when neither is wired (caller
|
||||
// owns the persist-config + mark-failed surface in that case).
|
||||
//
|
||||
// Centralized so every caller — Create, TeamHandler.Expand, future
|
||||
// paths — gets the same routing. Pre-2026-05-04 TeamHandler.Expand
|
||||
// hardcoded provisionWorkspace (Docker) and silently broke the
|
||||
// "deploy a team on SaaS" flow: child workspace rows were created with
|
||||
// no EC2 instance, the runtime never ran, and the 600s sweeper logged
|
||||
// the misleading "container started but never called /registry/register".
|
||||
//
|
||||
// Architectural principle: templates own runtime/config/prompts/files/
|
||||
// plugins; the platform owns where it runs. Anything that picks
|
||||
// between CP and local Docker belongs in this one helper.
|
||||
func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool {
|
||||
if h.cpProv != nil {
|
||||
go h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
|
||||
return true
|
||||
}
|
||||
if h.provisioner != nil {
|
||||
go h.provisionWorkspace(workspaceID, templatePath, configFiles, payload)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// SetEnvMutators wires a provisionhook.Registry into the handler. Plugins
|
||||
// living in separate repos register on the same Registry instance during
|
||||
// boot (see cmd/server/main.go) and main.go calls this setter once before
|
||||
@ -521,12 +548,15 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
|
||||
configFiles = h.ensureDefaultConfig(id, payload)
|
||||
}
|
||||
|
||||
// Auto-provision — pick backend: control plane (SaaS) or Docker (self-hosted)
|
||||
if h.cpProv != nil {
|
||||
go h.provisionWorkspaceCP(id, templatePath, configFiles, payload)
|
||||
} else if h.provisioner != nil {
|
||||
go h.provisionWorkspace(id, templatePath, configFiles, payload)
|
||||
} else {
|
||||
// Auto-provision — pick backend: control plane (SaaS) or Docker (self-hosted).
|
||||
// Routing is centralized in provisionWorkspaceAuto so every caller
|
||||
// (Create, TeamHandler.Expand, future paths) gets the same backend
|
||||
// selection. Pre-2026-05-04 the team-deploy path hardcoded the
|
||||
// Docker route, so on a SaaS tenant 7-of-7 sub-agents were created
|
||||
// as DB rows but had no EC2 — symptom: "container started but never
|
||||
// called /registry/register" + diagnose returns "docker client not
|
||||
// configured". Centralizing here closes that drift class.
|
||||
if !h.provisionWorkspaceAuto(id, templatePath, configFiles, payload) {
|
||||
// No Docker available (SaaS tenant). Persist basic config as JSON
|
||||
// so the Config tab shows the correct runtime/model/name. Then mark
|
||||
// the workspace as failed with a clear message.
|
||||
|
||||
@ -0,0 +1,170 @@
|
||||
package handlers
|
||||
|
||||
// Pins the backend-dispatcher invariant added 2026-05-04.
|
||||
//
|
||||
// Before the fix, TeamHandler.Expand hardcoded the Docker provisioner
|
||||
// (provisionWorkspace), so on a SaaS tenant where the workspace-server
|
||||
// has no docker socket, child workspaces were created as DB rows but
|
||||
// never got an EC2 instance. The 600s sweeper then logged the misleading
|
||||
// "container started but never called /registry/register".
|
||||
//
|
||||
// The fix centralizes backend selection in
|
||||
// WorkspaceHandler.provisionWorkspaceAuto and routes both Create and
|
||||
// TeamHandler.Expand through it. These tests pin:
|
||||
//
|
||||
// 1. Auto returns false when neither backend is wired (caller must
|
||||
// persist + mark-failed itself).
|
||||
// 2. Auto picks CP when cpProv is set.
|
||||
// 3. team.go uses provisionWorkspaceAuto, not provisionWorkspace
|
||||
// directly (source-level guard against the original drift).
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
)
|
||||
|
||||
// trackingCPProv records every Start() call in a thread-safe slice.
|
||||
// Defined locally to avoid coupling this test to the recordingCPProv
|
||||
// in workspace_provision_concurrent_repro_test.go (whose Stop/etc.
|
||||
// methods panic — fine there, would be noise here).
|
||||
type trackingCPProv struct {
|
||||
mu sync.Mutex
|
||||
started []string
|
||||
startErr error
|
||||
}
|
||||
|
||||
func (r *trackingCPProv) Start(_ context.Context, cfg provisioner.WorkspaceConfig) (string, error) {
|
||||
r.mu.Lock()
|
||||
r.started = append(r.started, cfg.WorkspaceID)
|
||||
r.mu.Unlock()
|
||||
if r.startErr != nil {
|
||||
return "", r.startErr
|
||||
}
|
||||
return "i-stub-" + cfg.WorkspaceID, nil
|
||||
}
|
||||
func (r *trackingCPProv) Stop(_ context.Context, _ string) error { return nil }
|
||||
func (r *trackingCPProv) GetConsoleOutput(_ context.Context, _ string) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
func (r *trackingCPProv) IsRunning(_ context.Context, _ string) (bool, error) { return true, nil }
|
||||
|
||||
func (r *trackingCPProv) startedSnapshot() []string {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
out := make([]string, len(r.started))
|
||||
copy(out, r.started)
|
||||
return out
|
||||
}
|
||||
|
||||
// TestProvisionWorkspaceAuto_NoBackendReturnsFalse — when neither
|
||||
// cpProv nor provisioner is wired, the dispatcher returns false so the
|
||||
// caller knows it must own the persist + mark-failed path. Pre-fix,
|
||||
// TeamHandler had no equivalent fallback at all and silently dropped
|
||||
// children on the floor.
|
||||
func TestProvisionWorkspaceAuto_NoBackendReturnsFalse(t *testing.T) {
|
||||
bcast := &concurrentSafeBroadcaster{}
|
||||
h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir())
|
||||
// Do NOT call SetCPProvisioner — both backends nil.
|
||||
|
||||
ok := h.provisionWorkspaceAuto("ws-noback", "", nil, models.CreateWorkspacePayload{
|
||||
Name: "noback", Tier: 1, Runtime: "claude-code",
|
||||
})
|
||||
if ok {
|
||||
t.Fatalf("expected provisionWorkspaceAuto to return false with no backend wired")
|
||||
}
|
||||
}
|
||||
|
||||
// TestProvisionWorkspaceAuto_RoutesToCPWhenSet — when cpProv is set
|
||||
// (SaaS tenant), Auto MUST route there. CP wins because per-workspace
|
||||
// EC2 is the SaaS path; Docker would silently fail "no docker socket"
|
||||
// on the tenant EC2.
|
||||
//
|
||||
// This is the regression-prevention test for the Design Director bug
|
||||
// where 7-of-7 sub-agents went down the Docker path on SaaS.
|
||||
func TestProvisionWorkspaceAuto_RoutesToCPWhenSet(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
mock.MatchExpectationsInOrder(false)
|
||||
|
||||
// provisionWorkspaceCP runs in the goroutine and will hit:
|
||||
// secrets SELECTs + UPDATE workspace as failed (because we make
|
||||
// CP Start return an error to short-circuit the rest of the path).
|
||||
mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM global_secrets`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"}))
|
||||
mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM workspace_secrets`).
|
||||
WithArgs(sqlmock.AnyArg()).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"}))
|
||||
mock.ExpectExec(`UPDATE workspaces SET status =`).
|
||||
WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
rec := &trackingCPProv{startErr: errors.New("simulated CP rejection")}
|
||||
bcast := &concurrentSafeBroadcaster{}
|
||||
h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir())
|
||||
h.SetCPProvisioner(rec)
|
||||
|
||||
wsID := "ws-routes-to-cp-0123456789abcdef"
|
||||
ok := h.provisionWorkspaceAuto(wsID, "", nil, models.CreateWorkspacePayload{
|
||||
Name: "test", Tier: 1, Runtime: "claude-code",
|
||||
})
|
||||
if !ok {
|
||||
t.Fatalf("expected provisionWorkspaceAuto to return true with CP wired")
|
||||
}
|
||||
|
||||
// Wait for the goroutine to land in cpProv.Start (or give up).
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for {
|
||||
if len(rec.startedSnapshot()) > 0 {
|
||||
break
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatalf("timed out waiting for cpProv.Start; recorded=%v", rec.startedSnapshot())
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
|
||||
got := rec.startedSnapshot()
|
||||
if len(got) != 1 || got[0] != wsID {
|
||||
t.Errorf("expected cpProv.Start invoked once with %q, got %v", wsID, got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestTeamExpand_UsesAutoNotDirectDockerPath — source-level guard: if
|
||||
// a future refactor reintroduces a hardcoded `h.wh.provisionWorkspace`
|
||||
// call in team.go, this fails. Pre-fix the hardcoded call was the bug.
|
||||
//
|
||||
// Substring match on the source rather than AST because the failure
|
||||
// shape is "wrong function name" — a plain text gate suffices.
|
||||
// Per `feedback_behavior_based_ast_gates.md` we'd usually pin the
|
||||
// behavior, but the behavior here ("calls dispatcher, not dispatcher's
|
||||
// docker leg") is awkward to assert without standing up the entire
|
||||
// Expand stack — the auto test above covers the dispatcher behavior;
|
||||
// this test is the cheap source-level seatbelt for the call site.
|
||||
func TestTeamExpand_UsesAutoNotDirectDockerPath(t *testing.T) {
|
||||
wd, err := os.Getwd()
|
||||
if err != nil {
|
||||
t.Fatalf("getwd: %v", err)
|
||||
}
|
||||
src, err := os.ReadFile(filepath.Join(wd, "team.go"))
|
||||
if err != nil {
|
||||
t.Fatalf("read team.go: %v", err)
|
||||
}
|
||||
if bytes.Contains(src, []byte("h.wh.provisionWorkspace(")) {
|
||||
t.Errorf("team.go calls h.wh.provisionWorkspace directly — must use h.wh.provisionWorkspaceAuto so SaaS tenants route to CP. " +
|
||||
"Pre-2026-05-04 the direct call sent every team child down the Docker path on SaaS, " +
|
||||
"creating workspace rows with no EC2 instance.")
|
||||
}
|
||||
if !bytes.Contains(src, []byte("h.wh.provisionWorkspaceAuto(")) {
|
||||
t.Errorf("team.go must call h.wh.provisionWorkspaceAuto for child provisioning — current code does not")
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user