From be997883c98b1a714e21c59e599890b4260da6ea Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 03:43:41 -0700 Subject: [PATCH] Centralize backend selection in provisionWorkspaceAuto MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User-reported 2026-05-04: deploying a team org-template ("Design Director" + 6 sub-agents) on a SaaS tenant produced 7-of-7 WORKSPACE_PROVISION_FAILED with the misleading message "container started but never called /registry/register". Diagnose returned "docker client not configured on this workspace-server" and the workspace rows had no instance_id. Root cause: TeamHandler.Expand hardcoded h.wh.provisionWorkspace — the Docker leg of WorkspaceHandler. WorkspaceHandler.Create branched on h.cpProv to pick CP-managed EC2 (SaaS) vs local Docker (self-hosted), but Expand never used that branch. On SaaS the docker goroutine ran but had no socket, so children silently sat in "provisioning" until the 600s sweeper marked them failed. Architectural principle (user): templates own runtime/config/prompts/files/plugins; the platform owns where it runs. Backend selection belongs in one helper. Fix: - Extract WorkspaceHandler.provisionWorkspaceAuto: picks CP when cpProv is set, Docker when only provisioner is set, returns false when neither (caller marks failed). - WorkspaceHandler.Create routes through Auto. - TeamHandler.Expand routes through Auto. Tests pin three invariants: - TestProvisionWorkspaceAuto_NoBackendReturnsFalse — Auto signals fall-through correctly so the caller can persist + mark-failed. - TestProvisionWorkspaceAuto_RoutesToCPWhenSet — when cpProv is wired, Start lands on CP (the user-visible regression target). Discipline-verified: removing the cpProv branch fails this. - TestTeamExpand_UsesAutoNotDirectDockerPath — source-level guard against future refactors reintroducing the hardcoded Docker call. Discipline-verified: reverting team.go fails this with a clear message naming the bug class. Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace-server/internal/handlers/team.go | 13 +- .../internal/handlers/workspace.go | 42 ++++- .../handlers/workspace_provision_auto_test.go | 170 ++++++++++++++++++ 3 files changed, 217 insertions(+), 8 deletions(-) create mode 100644 workspace-server/internal/handlers/workspace_provision_auto_test.go diff --git a/workspace-server/internal/handlers/team.go b/workspace-server/internal/handlers/team.go index 7f3c605c..c4a481f9 100644 --- a/workspace-server/internal/handlers/team.go +++ b/workspace-server/internal/handlers/team.go @@ -138,14 +138,23 @@ func (h *TeamHandler) Expand(c *gin.Context) { // and every other preflight (secrets, env mutators, identity // injection, missing-env). That left every child with NULL // platform_inbound_secret and never-issued auth_token. Now - // children go through the same provisionWorkspace path as + // children go through the same provisionWorkspaceAuto path as // Create/Restart, so adding a future provision-time step // automatically covers Expand too. + // + // 2026-05-04 follow-up: switched from provisionWorkspace + // (hardcoded Docker) to provisionWorkspaceAuto (picks CP for + // SaaS, Docker for self-hosted). Pre-fix, deploying a team on + // a SaaS tenant created child rows but never an EC2 instance — + // the 600s sweeper logged the misleading "container started + // but never called /registry/register". Templates only own + // shape (config/prompts/files/plugins/runtime); the platform + // owns where it runs. if h.wh != nil && sub.Config != "" { templatePath := filepath.Join(h.configsDir, sub.Config) if _, err := os.Stat(templatePath); err == nil { parent := parentID // copy for closure - go h.wh.provisionWorkspace(childID, templatePath, nil, models.CreateWorkspacePayload{ + h.wh.provisionWorkspaceAuto(childID, templatePath, nil, models.CreateWorkspacePayload{ Name: childName, Role: sub.Role, Tier: tier, diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go index 62081512..2f640d77 100644 --- a/workspace-server/internal/handlers/workspace.go +++ b/workspace-server/internal/handlers/workspace.go @@ -96,6 +96,33 @@ func (h *WorkspaceHandler) SetCPProvisioner(cp provisioner.CPProvisionerAPI) { h.cpProv = cp } +// provisionWorkspaceAuto picks the backend (CP for SaaS, local Docker +// for self-hosted) and starts provisioning in a goroutine. Returns true +// when a backend was kicked off, false when neither is wired (caller +// owns the persist-config + mark-failed surface in that case). +// +// Centralized so every caller — Create, TeamHandler.Expand, future +// paths — gets the same routing. Pre-2026-05-04 TeamHandler.Expand +// hardcoded provisionWorkspace (Docker) and silently broke the +// "deploy a team on SaaS" flow: child workspace rows were created with +// no EC2 instance, the runtime never ran, and the 600s sweeper logged +// the misleading "container started but never called /registry/register". +// +// Architectural principle: templates own runtime/config/prompts/files/ +// plugins; the platform owns where it runs. Anything that picks +// between CP and local Docker belongs in this one helper. +func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool { + if h.cpProv != nil { + go h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) + return true + } + if h.provisioner != nil { + go h.provisionWorkspace(workspaceID, templatePath, configFiles, payload) + return true + } + return false +} + // SetEnvMutators wires a provisionhook.Registry into the handler. Plugins // living in separate repos register on the same Registry instance during // boot (see cmd/server/main.go) and main.go calls this setter once before @@ -521,12 +548,15 @@ func (h *WorkspaceHandler) Create(c *gin.Context) { configFiles = h.ensureDefaultConfig(id, payload) } - // Auto-provision — pick backend: control plane (SaaS) or Docker (self-hosted) - if h.cpProv != nil { - go h.provisionWorkspaceCP(id, templatePath, configFiles, payload) - } else if h.provisioner != nil { - go h.provisionWorkspace(id, templatePath, configFiles, payload) - } else { + // Auto-provision — pick backend: control plane (SaaS) or Docker (self-hosted). + // Routing is centralized in provisionWorkspaceAuto so every caller + // (Create, TeamHandler.Expand, future paths) gets the same backend + // selection. Pre-2026-05-04 the team-deploy path hardcoded the + // Docker route, so on a SaaS tenant 7-of-7 sub-agents were created + // as DB rows but had no EC2 — symptom: "container started but never + // called /registry/register" + diagnose returns "docker client not + // configured". Centralizing here closes that drift class. + if !h.provisionWorkspaceAuto(id, templatePath, configFiles, payload) { // No Docker available (SaaS tenant). Persist basic config as JSON // so the Config tab shows the correct runtime/model/name. Then mark // the workspace as failed with a clear message. diff --git a/workspace-server/internal/handlers/workspace_provision_auto_test.go b/workspace-server/internal/handlers/workspace_provision_auto_test.go new file mode 100644 index 00000000..2a435658 --- /dev/null +++ b/workspace-server/internal/handlers/workspace_provision_auto_test.go @@ -0,0 +1,170 @@ +package handlers + +// Pins the backend-dispatcher invariant added 2026-05-04. +// +// Before the fix, TeamHandler.Expand hardcoded the Docker provisioner +// (provisionWorkspace), so on a SaaS tenant where the workspace-server +// has no docker socket, child workspaces were created as DB rows but +// never got an EC2 instance. The 600s sweeper then logged the misleading +// "container started but never called /registry/register". +// +// The fix centralizes backend selection in +// WorkspaceHandler.provisionWorkspaceAuto and routes both Create and +// TeamHandler.Expand through it. These tests pin: +// +// 1. Auto returns false when neither backend is wired (caller must +// persist + mark-failed itself). +// 2. Auto picks CP when cpProv is set. +// 3. team.go uses provisionWorkspaceAuto, not provisionWorkspace +// directly (source-level guard against the original drift). + +import ( + "bytes" + "context" + "errors" + "os" + "path/filepath" + "sync" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/models" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" +) + +// trackingCPProv records every Start() call in a thread-safe slice. +// Defined locally to avoid coupling this test to the recordingCPProv +// in workspace_provision_concurrent_repro_test.go (whose Stop/etc. +// methods panic — fine there, would be noise here). +type trackingCPProv struct { + mu sync.Mutex + started []string + startErr error +} + +func (r *trackingCPProv) Start(_ context.Context, cfg provisioner.WorkspaceConfig) (string, error) { + r.mu.Lock() + r.started = append(r.started, cfg.WorkspaceID) + r.mu.Unlock() + if r.startErr != nil { + return "", r.startErr + } + return "i-stub-" + cfg.WorkspaceID, nil +} +func (r *trackingCPProv) Stop(_ context.Context, _ string) error { return nil } +func (r *trackingCPProv) GetConsoleOutput(_ context.Context, _ string) (string, error) { + return "", nil +} +func (r *trackingCPProv) IsRunning(_ context.Context, _ string) (bool, error) { return true, nil } + +func (r *trackingCPProv) startedSnapshot() []string { + r.mu.Lock() + defer r.mu.Unlock() + out := make([]string, len(r.started)) + copy(out, r.started) + return out +} + +// TestProvisionWorkspaceAuto_NoBackendReturnsFalse — when neither +// cpProv nor provisioner is wired, the dispatcher returns false so the +// caller knows it must own the persist + mark-failed path. Pre-fix, +// TeamHandler had no equivalent fallback at all and silently dropped +// children on the floor. +func TestProvisionWorkspaceAuto_NoBackendReturnsFalse(t *testing.T) { + bcast := &concurrentSafeBroadcaster{} + h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir()) + // Do NOT call SetCPProvisioner — both backends nil. + + ok := h.provisionWorkspaceAuto("ws-noback", "", nil, models.CreateWorkspacePayload{ + Name: "noback", Tier: 1, Runtime: "claude-code", + }) + if ok { + t.Fatalf("expected provisionWorkspaceAuto to return false with no backend wired") + } +} + +// TestProvisionWorkspaceAuto_RoutesToCPWhenSet — when cpProv is set +// (SaaS tenant), Auto MUST route there. CP wins because per-workspace +// EC2 is the SaaS path; Docker would silently fail "no docker socket" +// on the tenant EC2. +// +// This is the regression-prevention test for the Design Director bug +// where 7-of-7 sub-agents went down the Docker path on SaaS. +func TestProvisionWorkspaceAuto_RoutesToCPWhenSet(t *testing.T) { + mock := setupTestDB(t) + mock.MatchExpectationsInOrder(false) + + // provisionWorkspaceCP runs in the goroutine and will hit: + // secrets SELECTs + UPDATE workspace as failed (because we make + // CP Start return an error to short-circuit the rest of the path). + mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM global_secrets`). + WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"})) + mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM workspace_secrets`). + WithArgs(sqlmock.AnyArg()). + WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"})) + mock.ExpectExec(`UPDATE workspaces SET status =`). + WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + + rec := &trackingCPProv{startErr: errors.New("simulated CP rejection")} + bcast := &concurrentSafeBroadcaster{} + h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir()) + h.SetCPProvisioner(rec) + + wsID := "ws-routes-to-cp-0123456789abcdef" + ok := h.provisionWorkspaceAuto(wsID, "", nil, models.CreateWorkspacePayload{ + Name: "test", Tier: 1, Runtime: "claude-code", + }) + if !ok { + t.Fatalf("expected provisionWorkspaceAuto to return true with CP wired") + } + + // Wait for the goroutine to land in cpProv.Start (or give up). + deadline := time.Now().Add(2 * time.Second) + for { + if len(rec.startedSnapshot()) > 0 { + break + } + if time.Now().After(deadline) { + t.Fatalf("timed out waiting for cpProv.Start; recorded=%v", rec.startedSnapshot()) + } + time.Sleep(20 * time.Millisecond) + } + + got := rec.startedSnapshot() + if len(got) != 1 || got[0] != wsID { + t.Errorf("expected cpProv.Start invoked once with %q, got %v", wsID, got) + } +} + +// TestTeamExpand_UsesAutoNotDirectDockerPath — source-level guard: if +// a future refactor reintroduces a hardcoded `h.wh.provisionWorkspace` +// call in team.go, this fails. Pre-fix the hardcoded call was the bug. +// +// Substring match on the source rather than AST because the failure +// shape is "wrong function name" — a plain text gate suffices. +// Per `feedback_behavior_based_ast_gates.md` we'd usually pin the +// behavior, but the behavior here ("calls dispatcher, not dispatcher's +// docker leg") is awkward to assert without standing up the entire +// Expand stack — the auto test above covers the dispatcher behavior; +// this test is the cheap source-level seatbelt for the call site. +func TestTeamExpand_UsesAutoNotDirectDockerPath(t *testing.T) { + wd, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + src, err := os.ReadFile(filepath.Join(wd, "team.go")) + if err != nil { + t.Fatalf("read team.go: %v", err) + } + if bytes.Contains(src, []byte("h.wh.provisionWorkspace(")) { + t.Errorf("team.go calls h.wh.provisionWorkspace directly — must use h.wh.provisionWorkspaceAuto so SaaS tenants route to CP. " + + "Pre-2026-05-04 the direct call sent every team child down the Docker path on SaaS, " + + "creating workspace rows with no EC2 instance.") + } + if !bytes.Contains(src, []byte("h.wh.provisionWorkspaceAuto(")) { + t.Errorf("team.go must call h.wh.provisionWorkspaceAuto for child provisioning — current code does not") + } +}