chore(workspace-server): remove TeamHandler.Expand bulk-create handler

Every workspace can have children via the regular CreateWorkspace flow
with parent_id set, so a separate handler that bulk-creates from
config.yaml's sub_workspaces (and was non-idempotent — calling it twice
duplicated the team) earned its way out. "Team" is just the state of
having children; expanding/collapsing is purely a canvas-side visual
action that toggles the `collapsed` column via PATCH.

The non-idempotency directly caused tenant-hongming's vCPU starvation:
72 distinct child workspaces accumulated in 4 days, ~14 leaked EC2s
(50 of 64 vCPU consumed by stale teams), every Canvas tabs E2E retry
flaking on RunInstances VcpuLimitExceeded.

What stays:
- TeamHandler.Collapse — still useful; stops + removes children via
  StopWorkspaceAuto. Reachable from the canvas Collapse Team button.
  (Note: that button currently calls PATCH /workspaces/:id, not the
  Collapse endpoint — that's a separate reachability question for
  later.)
- findTemplateDirByName helper — kept in team.go pending a relocate
  decision; no in-package consumers after Expand.
- The four other paths that create child workspaces continue to work
  unchanged: regular POST /workspaces with parent_id, OrgHandler.Import
  (recursive tree), Bundle import, scripts.

What goes:
- POST /workspaces/:id/expand route (router.go)
- TeamHandler.Expand method (team.go: ~130 lines)
- 4 TestTeamExpand_* sqlmock tests (team_test.go)
- TestTeamExpand_UsesAutoNotDirectDockerPath AST gate
  (workspace_provision_auto_test.go) — pinned a code path that no
  longer exists; the generic TestNoCallSiteCallsDirectProvisionerExceptAuto
  gate still covers the architectural intent for any future caller.

Follow-up PRs:
- canvas/ContextMenu.tsx: drop the "Expand to Team" right-click button
  + handleExpand callback; users create children via the regular
  New Workspace dialog with the parent picker (already supported)
- OrgHandler.Import idempotency (skip-if-exists OR replace_if_exists)
  — same bug class as the deleted Expand, but on the bulk-tree path
- One-off cleanup script for tenant-hongming's 72 stale workspaces

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-05-05 02:39:13 -07:00
parent 118b8e47ad
commit ec1f21922c
4 changed files with 26 additions and 339 deletions

View File

@ -1,7 +1,6 @@
package handlers
import (
"database/sql"
"encoding/json"
"log"
"net/http"
@ -12,165 +11,26 @@ import (
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
"gopkg.in/yaml.v3"
)
// TeamHandler delegates child-workspace provisioning to wh so child
// workspaces go through the same prepare/mint/preflight pipeline that
// every other provision path uses. Pre-fix (issue #2367): Expand
// directly invoked h.provisioner.Start which skipped mintWorkspaceSecrets,
// leaving every team-expanded child with NULL platform_inbound_secret +
// NULL auth_token — same drift class as the SaaS bug fixed in #2366.
// TeamHandler now hosts only Collapse — the visual "expand" action is
// canvas-side and creating children goes through the regular
// WorkspaceHandler.Create path with parent_id set, like any other
// workspace. Every workspace can have children; "team" is just the
// state of having children. The old Expand handler bulk-created
// children by reading sub_workspaces from a parent's config and was
// non-idempotent — calling it N times leaked N×children EC2s, which
// is how tenant-hongming accumulated 72 stale workspaces.
type TeamHandler struct {
broadcaster *events.Broadcaster
wh *WorkspaceHandler
platformURL string
configsDir string
wh *WorkspaceHandler
b *events.Broadcaster
}
// NewTeamHandler constructs a TeamHandler. Backend selection (Docker vs
// CP) goes through h.wh.StopWorkspaceAuto + h.wh.provisionWorkspaceAuto;
// no per-handler provisioner field is needed here.
// NewTeamHandler constructs a TeamHandler. wh is used by Collapse to
// route StopWorkspaceAuto through the backend dispatcher.
func NewTeamHandler(b *events.Broadcaster, wh *WorkspaceHandler, platformURL, configsDir string) *TeamHandler {
return &TeamHandler{
broadcaster: b,
wh: wh,
platformURL: platformURL,
configsDir: configsDir,
}
}
// Expand handles POST /workspaces/:id/expand
// Reads sub_workspaces from the workspace's config and provisions child workspaces.
func (h *TeamHandler) Expand(c *gin.Context) {
parentID := c.Param("id")
ctx := c.Request.Context()
// Verify workspace exists and is online
var name string
var tier int
var status string
err := db.DB.QueryRowContext(ctx,
`SELECT name, tier, status FROM workspaces WHERE id = $1`, parentID,
).Scan(&name, &tier, &status)
if err == sql.ErrNoRows {
c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"})
return
}
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "lookup failed"})
return
}
// Find the workspace's config to get sub_workspaces
templateDir := findTemplateDirByName(h.configsDir, name)
if templateDir == "" {
c.JSON(http.StatusBadRequest, gin.H{"error": "no config found for workspace"})
return
}
configData, err := os.ReadFile(filepath.Join(templateDir, "config.yaml"))
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read config"})
return
}
var config struct {
SubWorkspaces []struct {
Config string `yaml:"config"`
Name string `yaml:"name"`
Role string `yaml:"role"`
} `yaml:"sub_workspaces"`
}
if err := yaml.Unmarshal(configData, &config); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to parse config"})
return
}
if len(config.SubWorkspaces) == 0 {
c.JSON(http.StatusBadRequest, gin.H{"error": "workspace has no sub_workspaces defined in config"})
return
}
// Create child workspaces
children := make([]map[string]interface{}, 0)
for _, sub := range config.SubWorkspaces {
childID := uuid.New().String()
childName := sub.Name
if childName == "" {
childName = sub.Config
}
_, err := db.DB.ExecContext(ctx, `
INSERT INTO workspaces (id, name, role, tier, status, parent_id)
VALUES ($1, $2, $3, $4, 'provisioning', $5)
`, childID, childName, nilStr(sub.Role), tier, parentID)
if err != nil {
log.Printf("Expand: failed to create child %s: %v", childName, err)
continue
}
// Insert canvas layout (offset from parent)
if _, err := db.DB.ExecContext(ctx, `
INSERT INTO canvas_layouts (workspace_id, x, y) VALUES ($1, $2, $3)
`, childID, 0, 0); err != nil {
log.Printf("Team expand: failed to insert layout for child %s: %v", childID, err)
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISIONING", childID, map[string]interface{}{
"name": childName,
"tier": tier,
"parent_id": parentID,
})
// Delegate child-workspace provisioning to the shared
// provision pipeline. Issue #2367: previously Expand called
// h.provisioner.Start directly, bypassing mintWorkspaceSecrets
// and every other preflight (secrets, env mutators, identity
// injection, missing-env). That left every child with NULL
// platform_inbound_secret and never-issued auth_token. Now
// children go through the same provisionWorkspaceAuto path as
// Create/Restart, so adding a future provision-time step
// automatically covers Expand too.
//
// 2026-05-04 follow-up: switched from provisionWorkspace
// (hardcoded Docker) to provisionWorkspaceAuto (picks CP for
// SaaS, Docker for self-hosted). Pre-fix, deploying a team on
// a SaaS tenant created child rows but never an EC2 instance —
// the 600s sweeper logged the misleading "container started
// but never called /registry/register". Templates only own
// shape (config/prompts/files/plugins/runtime); the platform
// owns where it runs.
if h.wh != nil && sub.Config != "" {
templatePath := filepath.Join(h.configsDir, sub.Config)
if _, err := os.Stat(templatePath); err == nil {
parent := parentID // copy for closure
h.wh.provisionWorkspaceAuto(childID, templatePath, nil, models.CreateWorkspacePayload{
Name: childName,
Role: sub.Role,
Tier: tier,
ParentID: &parent,
})
}
}
children = append(children, map[string]interface{}{
"id": childID,
"name": childName,
"role": sub.Role,
})
}
// Mark parent as expanded
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_EXPANDED", parentID, map[string]interface{}{
"children": children,
})
c.JSON(http.StatusOK, gin.H{
"status": "expanded",
"children": children,
})
return &TeamHandler{wh: wh, b: b}
}
// Collapse handles POST /workspaces/:id/collapse
@ -215,12 +75,12 @@ func (h *TeamHandler) Collapse(c *gin.Context) {
log.Printf("Team collapse: failed to delete layout for %s: %v", childID, err)
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_REMOVED", childID, map[string]interface{}{})
h.b.RecordAndBroadcast(ctx, "WORKSPACE_REMOVED", childID, map[string]interface{}{})
removed = append(removed, childName)
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_COLLAPSED", parentID, map[string]interface{}{
h.b.RecordAndBroadcast(ctx, "WORKSPACE_COLLAPSED", parentID, map[string]interface{}{
"removed_children": removed,
})
@ -230,13 +90,12 @@ func (h *TeamHandler) Collapse(c *gin.Context) {
})
}
// nilStr adapts a string for a SQL bind parameter: the empty string maps
// to nil (stored as NULL), any other value passes through unchanged.
func nilStr(s string) interface{} {
	var out interface{}
	if s != "" {
		out = s
	}
	return out
}
// findTemplateDirByName resolves a workspace name to its template
// directory. Kept here because callers outside this package may use
// it, even though the in-package consumer (Expand) is gone.
//
// TODO: relocate alongside the templates handler if no other callers
// surface, or delete entirely after a deprecation cycle.
func findTemplateDirByName(configsDir, name string) string {
normalized := normalizeName(name)
@ -265,7 +124,6 @@ func findTemplateDirByName(configsDir, name string) string {
if json.Unmarshal(data, &cfg) == nil && cfg.Name == name {
return filepath.Join(configsDir, e.Name())
}
// Try yaml unmarshal too
if yaml.Unmarshal(data, &cfg) == nil && cfg.Name == name {
return filepath.Join(configsDir, e.Name())
}

View File

@ -1,7 +1,6 @@
package handlers
import (
"bytes"
"encoding/json"
"net/http"
"net/http/httptest"
@ -13,21 +12,6 @@ import (
"github.com/gin-gonic/gin"
)
// makeTeamConfigDir creates a temporary configs directory with a named
// subdirectory containing a config.yaml file.
func makeTeamConfigDir(t *testing.T, workspaceName string, yamlContent string) string {
t.Helper()
dir := t.TempDir()
subDir := filepath.Join(dir, workspaceName)
if err := os.MkdirAll(subDir, 0755); err != nil {
t.Fatalf("failed to create config dir: %v", err)
}
if err := os.WriteFile(filepath.Join(subDir, "config.yaml"), []byte(yamlContent), 0644); err != nil {
t.Fatalf("failed to write config.yaml: %v", err)
}
return dir
}
// ---------- TeamHandler: Collapse ----------
func TestTeamCollapse_NoChildren(t *testing.T) {
@ -116,134 +100,6 @@ func TestTeamCollapse_WithChildren(t *testing.T) {
t.Errorf("expected 2 removed children, got %v", resp["removed"])
}
}
// ---------- TeamHandler: Expand ----------
// TestTeamExpand_WorkspaceNotFound drives Expand's DB-lookup failure path.
// NOTE(review): despite the name, the mock returns a generic error
// (sqlmock.ErrCancelled), so this pins the 500 "lookup failed" branch —
// the sql.ErrNoRows → 404 "workspace not found" branch is NOT covered here.
func TestTeamExpand_WorkspaceNotFound(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
// configsDir "/tmp/configs" is never consulted: the handler returns
// before any template lookup when the SELECT errors.
handler := NewTeamHandler(newTestBroadcaster(), NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()), "http://localhost:8080", "/tmp/configs")
mock.ExpectQuery("SELECT name, tier, status FROM workspaces WHERE id").
WithArgs("ws-missing").
WillReturnError(sqlmock.ErrCancelled)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-missing"}}
c.Request = httptest.NewRequest("POST", "/", nil)
handler.Expand(c)
if w.Code != http.StatusInternalServerError {
t.Errorf("expected 500, got %d", w.Code)
}
}
// TestTeamExpand_NoConfigFound: the workspace row exists ("UnknownAgent",
// online) but the configs directory is an empty t.TempDir(), so
// findTemplateDirByName resolves nothing and Expand must answer 400
// ("no config found for workspace").
func TestTeamExpand_NoConfigFound(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewTeamHandler(newTestBroadcaster(), NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()), "http://localhost:8080", t.TempDir())
mock.ExpectQuery("SELECT name, tier, status FROM workspaces WHERE id").
WithArgs("ws-1").
WillReturnRows(sqlmock.NewRows([]string{"name", "tier", "status"}).
AddRow("UnknownAgent", 1, "online"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
c.Request = httptest.NewRequest("POST", "/", nil)
handler.Expand(c)
if w.Code != http.StatusBadRequest {
t.Errorf("expected 400, got %d: %s", w.Code, w.Body.String())
}
}
// TestTeamExpand_EmptySubWorkspaces: a matching config exists but declares
// `sub_workspaces: []`, so Expand must reject with 400 ("workspace has no
// sub_workspaces defined in config") rather than succeed with zero children.
func TestTeamExpand_EmptySubWorkspaces(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
// Template dir name "myagent" matches the workspace name returned by
// the mocked SELECT below, so the config lookup succeeds.
configDir := makeTeamConfigDir(t, "myagent", "name: MyAgent\nsub_workspaces: []\n")
handler := NewTeamHandler(newTestBroadcaster(), NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()), "http://localhost:8080", configDir)
mock.ExpectQuery("SELECT name, tier, status FROM workspaces WHERE id").
WithArgs("ws-1").
WillReturnRows(sqlmock.NewRows([]string{"name", "tier", "status"}).
AddRow("myagent", 1, "online"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
c.Request = httptest.NewRequest("POST", "/", nil)
handler.Expand(c)
if w.Code != http.StatusBadRequest {
t.Errorf("expected 400 (no sub_workspaces), got %d: %s", w.Code, w.Body.String())
}
}
// TestTeamExpand_WithSubWorkspaces: happy path. Two sub_workspaces in the
// config produce two child INSERTs (workspaces + canvas_layouts +
// structure_events each), one WORKSPACE_EXPANDED broadcast, a 200, and a
// two-element "children" array in the response.
//
// No provisioner expectations are needed: neither sub entry carries a
// `config:` key, so Expand's `sub.Config != ""` guard skips the
// provisionWorkspaceAuto call entirely (the WorkspaceHandler is also
// constructed with a nil provisioner).
func TestTeamExpand_WithSubWorkspaces(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
// NOTE(review): local named `yaml` would shadow a yaml package import
// if this file ever gains one — consider renaming to yamlContent.
yaml := `name: TeamLead
sub_workspaces:
- name: Worker-A
role: data-analyst
- name: Worker-B
role: code-reviewer
`
configDir := makeTeamConfigDir(t, "teamlead", yaml)
handler := NewTeamHandler(broadcaster, NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()), "http://localhost:8080", configDir)
mock.ExpectQuery("SELECT name, tier, status FROM workspaces WHERE id").
WithArgs("ws-lead").
WillReturnRows(sqlmock.NewRows([]string{"name", "tier", "status"}).
AddRow("teamlead", 2, "online"))
// INSERT for Worker-A
mock.ExpectExec("INSERT INTO workspaces").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectExec("INSERT INTO canvas_layouts").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectExec("INSERT INTO structure_events").
WillReturnResult(sqlmock.NewResult(0, 1))
// INSERT for Worker-B
mock.ExpectExec("INSERT INTO workspaces").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectExec("INSERT INTO canvas_layouts").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectExec("INSERT INTO structure_events").
WillReturnResult(sqlmock.NewResult(0, 1))
// WORKSPACE_EXPANDED broadcast
mock.ExpectExec("INSERT INTO structure_events").
WillReturnResult(sqlmock.NewResult(0, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-lead"}}
c.Request = httptest.NewRequest("POST", "/", bytes.NewBufferString(""))
handler.Expand(c)
if w.Code != http.StatusOK {
t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
}
var resp map[string]interface{}
json.Unmarshal(w.Body.Bytes(), &resp)
children, ok := resp["children"].([]interface{})
if !ok || len(children) != 2 {
t.Errorf("expected 2 children, got %v", resp["children"])
}
}
// ---------- findTemplateDirByName helper ----------
func TestFindTemplateDirByName_DirectMatch(t *testing.T) {

View File

@ -172,36 +172,6 @@ func TestProvisionWorkspaceAuto_RoutesToCPWhenSet(t *testing.T) {
}
}
// TestTeamExpand_UsesAutoNotDirectDockerPath — source-level guard: if
// a future refactor reintroduces a hardcoded `h.wh.provisionWorkspace`
// call in team.go, this fails. Pre-fix the hardcoded call was the bug.
//
// Substring match on the source rather than AST because the failure
// shape is "wrong function name" — a plain text gate suffices.
// Per `feedback_behavior_based_ast_gates.md` we'd usually pin the
// behavior, but the behavior here ("calls dispatcher, not dispatcher's
// docker leg") is awkward to assert without standing up the entire
// Expand stack — the auto test above covers the dispatcher behavior;
// this test is the cheap source-level seatbelt for the call site.
func TestTeamExpand_UsesAutoNotDirectDockerPath(t *testing.T) {
wd, err := os.Getwd()
if err != nil {
t.Fatalf("getwd: %v", err)
}
src, err := os.ReadFile(filepath.Join(wd, "team.go"))
if err != nil {
t.Fatalf("read team.go: %v", err)
}
if bytes.Contains(src, []byte("h.wh.provisionWorkspace(")) {
t.Errorf("team.go calls h.wh.provisionWorkspace directly — must use h.wh.provisionWorkspaceAuto so SaaS tenants route to CP. " +
"Pre-2026-05-04 the direct call sent every team child down the Docker path on SaaS, " +
"creating workspace rows with no EC2 instance.")
}
if !bytes.Contains(src, []byte("h.wh.provisionWorkspaceAuto(")) {
t.Errorf("team.go must call h.wh.provisionWorkspaceAuto for child provisioning — current code does not")
}
}
// TestNoCallSiteCallsDirectProvisionerExceptAuto — generic source-level
// gate covering ANY future caller, not just team.go and org_import.go.
//

View File

@ -242,9 +242,12 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
// entire platform. Gated behind AdminAuth (issue #180).
r.GET("/approvals/pending", middleware.AdminAuth(db.DB), apph.ListAll)
// Team Expansion
// Team handlers — Collapse only. The bulk-Expand path is gone:
// every workspace can have children via the regular CreateWorkspace
// flow with parent_id set, so a separate handler that bulk-creates
// from sub_workspaces (and was non-idempotent — calling it twice
// duplicated the team) earned its way out.
teamh := handlers.NewTeamHandler(broadcaster, wh, platformURL, configsDir)
wsAuth.POST("/expand", teamh.Expand)
wsAuth.POST("/collapse", teamh.Collapse)
// Agents