Merge pull request #2817 from Molecule-AI/staging

staging → main: auto-promote 856c967
molecule-ai[bot] 2026-05-04 19:21:07 -07:00 committed by GitHub
commit 9a835ef631
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 234 additions and 47 deletions

View File

@@ -302,7 +302,7 @@ jobs:
   -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
   -H "Authorization: Bearer $ADMIN_TOKEN" \
   -H "Content-Type: application/json" \
-  -d "{\"confirm\":\"$slug\"}" >/tmp/canary-cleanup.code 2>/dev/null
+  -d "{\"confirm\":\"$slug\"}" >/tmp/canary-cleanup.code
 set -e
 code=$(cat /tmp/canary-cleanup.code 2>/dev/null || echo "000")
 if [ "$code" = "200" ] || [ "$code" = "204" ]; then

View File

@@ -199,7 +199,7 @@ jobs:
   -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
   -H "Authorization: Bearer $ADMIN_TOKEN" \
   -H "Content-Type: application/json" \
-  -d "{\"confirm\":\"$slug\"}" >/tmp/canvas-cleanup.code 2>/dev/null
+  -d "{\"confirm\":\"$slug\"}" >/tmp/canvas-cleanup.code
 set -e
 code=$(cat /tmp/canvas-cleanup.code 2>/dev/null || echo "000")
 if [ "$code" = "200" ] || [ "$code" = "204" ]; then

View File

@@ -166,7 +166,7 @@ jobs:
   -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
   -H "Authorization: Bearer $ADMIN_TOKEN" \
   -H "Content-Type: application/json" \
-  -d "{\"confirm\":\"$slug\"}" >/tmp/external-cleanup.code 2>/dev/null
+  -d "{\"confirm\":\"$slug\"}" >/tmp/external-cleanup.code
 set -e
 code=$(cat /tmp/external-cleanup.code 2>/dev/null || echo "000")
 if [ "$code" = "200" ] || [ "$code" = "204" ]; then

View File

@@ -231,7 +231,7 @@ jobs:
   -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
   -H "Authorization: Bearer $ADMIN_TOKEN" \
   -H "Content-Type: application/json" \
-  -d "{\"confirm\":\"$slug\"}" >/tmp/saas-cleanup.code 2>/dev/null
+  -d "{\"confirm\":\"$slug\"}" >/tmp/saas-cleanup.code
 set -e
 code=$(cat /tmp/saas-cleanup.code 2>/dev/null || echo "000")
 if [ "$code" = "200" ] || [ "$code" = "204" ]; then

View File

@@ -155,7 +155,7 @@ jobs:
   -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
   -H "Authorization: Bearer $ADMIN_TOKEN" \
   -H "Content-Type: application/json" \
-  -d "{\"confirm\":\"$slug\"}" >/tmp/sanity-cleanup.code 2>/dev/null
+  -d "{\"confirm\":\"$slug\"}" >/tmp/sanity-cleanup.code
 set -e
 code=$(cat /tmp/sanity-cleanup.code 2>/dev/null || echo "000")
 if [ "$code" = "200" ] || [ "$code" = "204" ]; then

View File

@@ -200,8 +200,11 @@ jobs:
   -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
   -H "Content-Type: application/json" \
   -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
-  -d "$BODY" >"$HTTP_CODE_FILE" 2>/dev/null
+  -d "$BODY" >"$HTTP_CODE_FILE"
 set -e
+# Stderr from curl (e.g. dial errors with -sS) goes to the runner
+# log so operators can see WHY a connection failed. Stdout is
+# captured to $HTTP_CODE_FILE because that's where -w writes.
 HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
 [ -z "$HTTP_CODE" ] && HTTP_CODE="000"

View File

@@ -160,8 +160,10 @@ jobs:
   -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \
   -H "Content-Type: application/json" \
   -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
-  -d "$BODY" >"$HTTP_CODE_FILE" 2>/dev/null
+  -d "$BODY" >"$HTTP_CODE_FILE"
 set -e
+# Stderr from curl (-sS shows dial errors etc.) goes to the
+# runner log so operators can see WHY a connection failed.
 HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
 [ -z "$HTTP_CODE" ] && HTTP_CODE="000"

View File

@@ -167,8 +167,9 @@ jobs:
   -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
   -H "Authorization: Bearer $ADMIN_TOKEN" \
   -H "Content-Type: application/json" \
-  -d "{\"confirm\":\"$slug\"}" >/tmp/del_code 2>/dev/null
+  -d "{\"confirm\":\"$slug\"}" >/tmp/del_code
 set -e
+# Stderr from curl (-sS shows dial errors etc.) goes to runner log.
 http_code=$(cat /tmp/del_code 2>/dev/null || echo "000")
 if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
   deleted=$((deleted+1))

View File

@@ -163,7 +163,7 @@ func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspace
 	if wsRuntime == "external" {
 		return false
 	}
-	if h.provisioner == nil && h.cpProv == nil {
+	if !h.HasProvisioner() {
 		return false
 	}

View File

@@ -175,8 +175,18 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
 		h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_ONLINE", id, map[string]interface{}{
 			"name": ws.Name, "external": true,
 		})
-	} else if h.provisioner != nil {
-		// Provision container
+	} else if h.workspace.HasProvisioner() {
+		// Provision container — either backend (CP for SaaS, local Docker
+		// for self-hosted) is fine. Pre-2026-05-05 this gate was
+		// `h.provisioner != nil`, which only checked the Docker pointer
+		// and silently dropped every workspace on a SaaS tenant: the prep
+		// block was skipped, no Auto call ever fired, and the row sat in
+		// 'provisioning' until the 600s sweeper marked it failed with the
+		// misleading "container started but never called /registry/register"
+		// (incident: hongming tenant org-import 2026-05-05 01:14, 7-of-7
+		// claude-code workspaces stuck). Routing to the right backend
+		// happens inside provisionWorkspaceAuto — this gate just decides
+		// whether to do prep at all.
 		payload := models.CreateWorkspacePayload{
 			Name: ws.Name, Tier: tier, Runtime: runtime, Model: model,
 			WorkspaceDir: ws.WorkspaceDir,
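
For reference, the two deployment shapes this gate must treat identically, wired the way this PR's tests do it (a minimal sketch; `bcast`, `cpClient`, `dockerProv`, `cpURL`, and `dataDir` are illustrative stand-ins, not names from this diff):

	// SaaS tenant: control-plane provisioner wired, local Docker intentionally nil.
	saas := NewWorkspaceHandler(bcast, nil, cpURL, dataDir)
	saas.SetCPProvisioner(cpClient) // saas.HasProvisioner() == true

	// Self-hosted: local Docker wired, cpProv never set.
	selfHosted := NewWorkspaceHandler(bcast, dockerProv, cpURL, dataDir) // HasProvisioner() == true

	// Misconfigured: neither backend wired; the gate short-circuits and skips prep.
	bare := NewWorkspaceHandler(bcast, nil, cpURL, dataDir) // HasProvisioner() == false

The pre-fix gate (`h.provisioner != nil`) was true only for the second shape.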

View File

@@ -112,21 +112,43 @@ func (h *WorkspaceHandler) SetCPProvisioner(cp provisioner.CPProvisionerAPI) {
 	h.cpProv = cp
 }
 
+// HasProvisioner reports whether either backend (CP or local Docker) is
+// wired. Callers that gate prep-work on "do we have something that can
+// provision a container?" should use this rather than direct field access
+// to either provisioner — those individual checks miss the SaaS path
+// (cpProv set, provisioner nil) or the self-hosted path (provisioner set,
+// cpProv nil) symmetrically. Org-import + future bulk paths gate their
+// template/config/secret prep on this so the work isn't wasted on
+// deployments where no backend is available.
+func (h *WorkspaceHandler) HasProvisioner() bool {
+	return h.cpProv != nil || h.provisioner != nil
+}
 // provisionWorkspaceAuto picks the backend (CP for SaaS, local Docker
 // for self-hosted) and starts provisioning in a goroutine. Returns true
-// when a backend was kicked off, false when neither is wired (caller
-// owns the persist-config + mark-failed surface in that case).
+// when a backend was kicked off, false when neither is wired.
 //
-// Centralized so every caller — Create, TeamHandler.Expand, future
-// paths — gets the same routing. Pre-2026-05-04 TeamHandler.Expand
-// hardcoded provisionWorkspace (Docker) and silently broke the
-// "deploy a team on SaaS" flow: child workspace rows were created with
-// no EC2 instance, the runtime never ran, and the 600s sweeper logged
-// the misleading "container started but never called /registry/register".
+// Single source of truth for "start provisioning a workspace" across
+// every caller (Create, OrgHandler.createWorkspaceTree, TeamHandler.Expand,
+// future paths). Centralized routing here means callers don't repeat
+// the "Docker vs CP" decision and can't drift on it.
+//
+// Self-marks-failed on the no-backend path: pre-2026-05-05 the false
+// return was silent, and any caller that forgot to handle it (TeamHandler
+// pre-#2367, OrgHandler.createWorkspaceTree pre-this-fix) silently
+// dropped workspaces — they sat in 'provisioning' for 10 min until the
+// sweeper marked them failed with the misleading "container started but
+// never called /registry/register" message. Marking failed inside Auto
+// closes that class: even if a future caller bypasses HasProvisioner
+// gating or ignores the bool return, the workspace ends in a clean
+// failed state with an actionable error message.
 //
 // Architectural principle: templates own runtime/config/prompts/files/
 // plugins; the platform owns where it runs. Anything that picks
-// between CP and local Docker belongs in this one helper.
+// between CP and local Docker belongs in this one helper. Anything
+// post-routing-but-pre-Start (mint secrets, render template, etc.)
+// lives in prepareProvisionContext (shared by both per-backend
+// goroutines).
 func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool {
 	if h.cpProv != nil {
 		go h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
@@ -136,6 +158,15 @@ func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath stri
 		go h.provisionWorkspace(workspaceID, templatePath, configFiles, payload)
 		return true
 	}
 
+	// No backend wired — mark failed so the workspace doesn't linger in
+	// 'provisioning' for the full 10-minute sweep window. 10s is enough
+	// for the broadcast + single UPDATE inside markProvisionFailed.
+	log.Printf("provisionWorkspaceAuto: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID)
+	failCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	h.markProvisionFailed(failCtx, workspaceID,
+		"no provisioning backend available — workspace requires either a Docker daemon (self-hosted) or control-plane provisioner (SaaS)",
+		nil)
 	return false
 }
@@ -565,29 +596,22 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
 	}
 
 	// Auto-provision — pick backend: control plane (SaaS) or Docker (self-hosted).
-	// Routing is centralized in provisionWorkspaceAuto so every caller
-	// (Create, TeamHandler.Expand, future paths) gets the same backend
-	// selection. Pre-2026-05-04 the team-deploy path hardcoded the
-	// Docker route, so on a SaaS tenant 7-of-7 sub-agents were created
-	// as DB rows but had no EC2 — symptom: "container started but never
-	// called /registry/register" + diagnose returns "docker client not
-	// configured". Centralizing here closes that drift class.
+	// Routing AND the no-backend mark-failed path are both inside
+	// provisionWorkspaceAuto (single source of truth). The Create-specific
+	// extra is the workspace_config UPSERT below: when no backend is
+	// wired, Auto marks the row failed but doesn't persist the bare
+	// runtime/model/tier as JSON — the Config tab needs that to render
+	// even on failed workspaces, so Create owns this Create-only side
+	// effect rather than coupling Auto to a UI concern.
 	if !h.provisionWorkspaceAuto(id, templatePath, configFiles, payload) {
-		// No Docker available (SaaS tenant). Persist basic config as JSON
-		// so the Config tab shows the correct runtime/model/name. Then mark
-		// the workspace as failed with a clear message.
 		cfgJSON := fmt.Sprintf(`{"name":%q,"runtime":%q,"tier":%d,"template":%q}`,
 			payload.Name, payload.Runtime, payload.Tier, payload.Template)
-		db.DB.ExecContext(ctx, `
+		if _, err := db.DB.ExecContext(ctx, `
 			INSERT INTO workspace_config (workspace_id, data) VALUES ($1, $2::jsonb)
 			ON CONFLICT (workspace_id) DO UPDATE SET data = $2::jsonb
-		`, id, cfgJSON)
-		db.DB.ExecContext(ctx,
-			`UPDATE workspaces SET status = $1, last_sample_error = 'Docker not available — workspace containers require a Docker daemon or external provisioning.', updated_at = now() WHERE id = $2`, models.StatusFailed, id)
-		h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", id, map[string]interface{}{
-			"error": "Docker not available on this platform instance",
-		})
-		log.Printf("Create: no Docker daemon — workspace %s config persisted, marked failed", id)
+		`, id, cfgJSON); err != nil {
+			log.Printf("Create: workspace_config persist failed for %s: %v", id, err)
+		}
 	}
 
 	c.JSON(http.StatusCreated, gin.H{
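
markProvisionFailed itself is outside this diff. From the broadcast-plus-single-UPDATE comment in provisionWorkspaceAuto, the removed Create fallback just above, and the test's three-argument ExpectExec below, its shape is roughly the following; a hedged sketch, where the exact column list and the type of the trailing details argument are assumptions:

	func (h *WorkspaceHandler) markProvisionFailed(ctx context.Context, id, msg string, details map[string]interface{}) {
		// One UPDATE plus one broadcast; this is why the 10s context above suffices.
		if _, err := db.DB.ExecContext(ctx,
			`UPDATE workspaces SET status = $1, last_sample_error = $2, updated_at = now() WHERE id = $3`,
			models.StatusFailed, msg, id); err != nil {
			log.Printf("markProvisionFailed: update failed for %s: %v", id, err)
		}
		h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", id, map[string]interface{}{"error": msg})
	}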

View File

@@ -67,12 +67,27 @@ func (r *trackingCPProv) startedSnapshot() []string {
 	return out
 }
-// TestProvisionWorkspaceAuto_NoBackendReturnsFalse — when neither
-// cpProv nor provisioner is wired, the dispatcher returns false so the
-// caller knows it must own the persist + mark-failed path. Pre-fix,
-// TeamHandler had no equivalent fallback at all and silently dropped
-// children on the floor.
-func TestProvisionWorkspaceAuto_NoBackendReturnsFalse(t *testing.T) {
+// TestProvisionWorkspaceAuto_NoBackendMarksFailed — when neither cpProv
+// nor provisioner is wired, the dispatcher must:
+//  1. Return false (so the caller can do its own extra cleanup if
+//     needed — Create persists workspace_config for the Config tab).
+//  2. Mark the workspace failed via markProvisionFailed (defense in
+//     depth: if a future caller bypasses the bool return, the workspace
+//     still doesn't sit stuck in 'provisioning' for 10 min until the
+//     sweeper fires).
+//
+// Pre-2026-05-05 the false return was silent and TeamHandler /
+// OrgHandler.createWorkspaceTree dropped workspaces on the floor when
+// they ignored it. This test pins the new contract that Auto owns the
+// failed-mark on no-backend.
+func TestProvisionWorkspaceAuto_NoBackendMarksFailed(t *testing.T) {
 	mock := setupTestDB(t)
+	mock.MatchExpectationsInOrder(false)
+	// markProvisionFailed does a single UPDATE workspaces ... SET status='failed'.
+	mock.ExpectExec(`UPDATE workspaces SET status =`).
+		WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
 	bcast := &concurrentSafeBroadcaster{}
 	h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir())
 	// Do NOT call SetCPProvisioner — both backends nil.
@@ -83,6 +98,9 @@ func TestProvisionWorkspaceAuto_NoBackendReturnsFalse(t *testing.T) {
 	if ok {
 		t.Fatalf("expected provisionWorkspaceAuto to return false with no backend wired")
 	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("expected markProvisionFailed UPDATE to fire on no-backend path: %v", err)
+	}
 }
 
 // TestProvisionWorkspaceAuto_RoutesToCPWhenSet — when cpProv is set
@@ -285,3 +303,132 @@ func TestOrgImport_UsesAutoNotDirectDockerPath(t *testing.T) {
 		t.Errorf("org_import.go must call h.workspace.provisionWorkspaceAuto for child provisioning — current code does not")
 	}
 }
+
+// TestHasProvisioner_TrueOnCPOnly — SaaS tenants run with cpProv set and
+// the local Docker provisioner nil. HasProvisioner must report true so
+// gate-y callers (org-import prep block) don't skip provisioning.
+//
+// Pre-2026-05-05 the org-import gate checked `h.provisioner != nil`
+// directly — false on SaaS — and the entire provisioning prep block was
+// skipped. The Auto call inside the block was unreachable; PR #2798's
+// "route through Auto" fix didn't help because the gate fired earlier.
+// Symptom: 7-workspace org-import on hongming sat in 'provisioning' for
+// the full 10-minute sweep window.
+func TestHasProvisioner_TrueOnCPOnly(t *testing.T) {
+	bcast := &concurrentSafeBroadcaster{}
+	h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir())
+	h.SetCPProvisioner(&trackingCPProv{})
+	if !h.HasProvisioner() {
+		t.Errorf("HasProvisioner() == false with cpProv wired (Docker nil) — every gate that uses this would skip provisioning on SaaS, reproducing the hongming 7-workspace stuck-in-provisioning incident from 2026-05-05")
+	}
+}
+
+// TestHasProvisioner_TrueOnDockerOnly — self-hosted operators run with
+// the local Docker provisioner wired and cpProv nil. HasProvisioner must
+// report true.
+func TestHasProvisioner_TrueOnDockerOnly(t *testing.T) {
+	bcast := &concurrentSafeBroadcaster{}
+	// NewWorkspaceHandler guards the typed-nil-interface trap (workspace.go
+	// docstring) — pass a real *Provisioner stub via the test fixture
+	// rather than a nil pointer cast to the interface.
+	h := NewWorkspaceHandler(bcast, &provisioner.Provisioner{}, "http://localhost:8080", t.TempDir())
+	if !h.HasProvisioner() {
+		t.Errorf("HasProvisioner() == false with Docker wired (cpProv nil) — would break self-hosted operators")
+	}
+}
+
+// TestHasProvisioner_FalseWhenNeitherWired — misconfigured deployment
+// with neither backend reachable. HasProvisioner must report false so
+// the org-import prep block is skipped (no point doing template/secret
+// prep work when nothing can run the resulting container).
+func TestHasProvisioner_FalseWhenNeitherWired(t *testing.T) {
+	bcast := &concurrentSafeBroadcaster{}
+	h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir())
+	if h.HasProvisioner() {
+		t.Errorf("HasProvisioner() == true with no backend wired — gate should short-circuit and not waste prep cycles")
+	}
+}
+
+// TestNoBareBothNilCheck — source-level pin: any code that wants to ask
+// "is no backend wired?" must use !HasProvisioner(), not the verbose
+// `h.provisioner == nil && h.cpProv == nil` shape. Two reasons:
+//
+//  1. Single source of truth — when a third backend lands (k8s,
+//     containerd, whatever), HasProvisioner gets the new field added in
+//     one place. Bare both-nil checks each need to be hunted down.
+//  2. Symmetry — easier to read `!h.HasProvisioner()` and know the
+//     intent than to mentally evaluate `nil && nil`.
+//
+// Allowed exception: workspace.go's HasProvisioner() definition itself.
+// Test files are also exempt — assertions on internal field state are
+// fine.
+func TestNoBareBothNilCheck(t *testing.T) {
+	wd, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("getwd: %v", err)
+	}
+	entries, err := os.ReadDir(wd)
+	if err != nil {
+		t.Fatalf("readdir: %v", err)
+	}
+	bareShapes := []string{
+		"h.provisioner == nil && h.cpProv == nil",
+		"h.cpProv == nil && h.provisioner == nil",
+	}
+	for _, entry := range entries {
+		name := entry.Name()
+		if filepath.Ext(name) != ".go" {
+			continue
+		}
+		// Allow tests (legitimate field-state assertions).
+		if len(name) > len("_test.go") &&
+			name[len(name)-len("_test.go"):] == "_test.go" {
+			continue
+		}
+		// workspace.go houses HasProvisioner's definition + can reference
+		// the fields directly — but with the !HasProvisioner() refactor
+		// it shouldn't contain the bare both-nil shape any more.
+		src, err := os.ReadFile(filepath.Join(wd, name))
+		if err != nil {
+			t.Fatalf("read %s: %v", name, err)
+		}
+		for _, needle := range bareShapes {
+			if bytes.Contains(src, []byte(needle)) {
+				t.Errorf("%s contains bare `%s` — must use `!h.HasProvisioner()` for SSOT.", name, needle)
+			}
+		}
+	}
+}
+
+// TestOrgImportGate_UsesHasProvisionerNotBareField — source-level pin
+// for the org-import gate. Pre-fix the gate read `h.provisioner != nil`,
+// which checked only the Docker pointer and silently dropped every
+// SaaS workspace. The fix routes through HasProvisioner so both
+// backends count.
+//
+// Substring match because the failure shape is "wrong field" — a plain
+// text gate suffices, same rationale as TestTeamExpand_UsesAutoNotDirectDockerPath
+// above.
+func TestOrgImportGate_UsesHasProvisionerNotBareField(t *testing.T) {
+	wd, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("getwd: %v", err)
+	}
+	src, err := os.ReadFile(filepath.Join(wd, "org_import.go"))
+	if err != nil {
+		t.Fatalf("read org_import.go: %v", err)
+	}
+	// The provisioning gate is the `else if ...` clause that follows the
+	// `if ws.External {` external-workspace branch. If org_import.go
+	// reintroduces a bare `h.provisioner` check there, every SaaS tenant
+	// silently drops org-imported workspaces again. Auto's nil check is
+	// the right routing layer; the gate just decides whether to do prep
+	// work at all, and HasProvisioner is the symmetric question.
+	if bytes.Contains(src, []byte("} else if h.provisioner != nil {")) {
+		t.Errorf("org_import.go gates the provisioning prep block on `h.provisioner != nil` (bare Docker check) — must use `h.workspace.HasProvisioner()` so SaaS tenants (cpProv set, provisioner nil) reach the Auto call. " +
+			"Repro: 2026-05-05 hongming org-import incident — 7 claude-code workspaces stuck in 'provisioning' for 10 min because the gate skipped the entire block on SaaS, hiding the Auto call PR #2798 introduced.")
+	}
+	if !bytes.Contains(src, []byte("h.workspace.HasProvisioner()")) {
+		t.Errorf("org_import.go must call h.workspace.HasProvisioner() in the provisioning gate — current code does not")
+	}
+}
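
The three wiring cases above could equally be pinned by a single table-driven test. A sketch under the same fixtures (not part of this PR; it assumes NewWorkspaceHandler returns *WorkspaceHandler):

	func TestHasProvisioner_Table(t *testing.T) {
		cases := []struct {
			name               string
			wireDocker, wireCP bool
			want               bool
		}{
			{"cp-only (SaaS)", false, true, true},
			{"docker-only (self-hosted)", true, false, true},
			{"neither (misconfigured)", false, false, false},
		}
		for _, tc := range cases {
			t.Run(tc.name, func(t *testing.T) {
				var h *WorkspaceHandler
				if tc.wireDocker {
					// Real stub, never a nil *Provisioner cast to the interface.
					h = NewWorkspaceHandler(&concurrentSafeBroadcaster{}, &provisioner.Provisioner{}, "http://localhost:8080", t.TempDir())
				} else {
					h = NewWorkspaceHandler(&concurrentSafeBroadcaster{}, nil, "http://localhost:8080", t.TempDir())
				}
				if tc.wireCP {
					h.SetCPProvisioner(&trackingCPProv{})
				}
				if got := h.HasProvisioner(); got != tc.want {
					t.Errorf("HasProvisioner() = %v, want %v", got, tc.want)
				}
			})
		}
	}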

View File

@@ -115,7 +115,7 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
 	// available — previously only `provisioner` was checked, which broke
 	// restart entirely on every SaaS tenant (the workspace EC2 couldn't
 	// be terminated + relaunched, the endpoint 503'd on every try).
-	if h.provisioner == nil && h.cpProv == nil {
+	if !h.HasProvisioner() {
 		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "provisioner not available"})
 		return
 	}
@@ -360,7 +360,7 @@ func (h *WorkspaceHandler) RestartByID(workspaceID string) {
 	// reactive auto-restart on every SaaS tenant (where the local Docker
 	// provisioner is intentionally nil). The runRestartCycle below now
 	// branches on which one is set for the Stop call.
-	if h.provisioner == nil && h.cpProv == nil {
+	if !h.HasProvisioner() {
 		return
 	}
 	coalesceRestart(workspaceID, func() { h.runRestartCycle(workspaceID) })
@@ -657,7 +657,7 @@ func (h *WorkspaceHandler) Resume(c *gin.Context) {
 	// Accept either provisioner (Docker self-hosted OR CP SaaS). See the
 	// same guard in Restart above for context — Resume previously 503'd
 	// on every SaaS tenant.
-	if h.provisioner == nil && h.cpProv == nil {
+	if !h.HasProvisioner() {
 		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "provisioner not available"})
 		return
 	}