forked from molecule-ai/molecule-core
refactor(handlers): extract dispatchers from workspace.go (#2800 partial)
workspace.go was 950 lines after the dispatcher work in PRs #2811 + #2824 + #2843 + #2846 + #2847 + #2848 + #2850. This extracts the 6 SoT dispatcher helpers into a new workspace_dispatchers.go so the file is the architectural unit it deserves to be (one place for "how do we route a workspace lifecycle verb to a backend?"). Moved (no body changes — pure cut + paste with imports): - HasProvisioner (gate accessor) - provisionWorkspaceAuto (async provision) - provisionWorkspaceAutoSync (sync provision, runRestartCycle's path) - StopWorkspaceAuto (stop dispatcher) - RestartWorkspaceAuto (restart wrapper) - RestartWorkspaceAutoOpts (restart with resetClaudeSession) workspace.go shrinks from 950 → 735 lines and now holds: - WorkspaceHandler struct + constructor - SetCPProvisioner / SetEnvMutators - Create / List / Get / scanWorkspaceRow - HTTP handler glue workspace_dispatchers.go is 255 lines and holds the dispatcher trio + sync variant + gate accessor + a header docblock summarizing the history (PRs that added each helper) and the source-level pin tests that gate against drift. Source-level pin tests updated: - TestNoCallSiteCallsDirectProvisionerExceptAuto: workspace_dispatchers.go added to allowlist (the dispatcher IS the place that calls per-backend bodies directly). - TestNoCallSiteCallsBareStop: same. - TestNoBareBothNilCheck / TestOrgImportGate_UsesHasProvisionerNotBareField: no change — they were source-pinning specific files, not all callers. Build clean, vet clean, full test suite passes (1742 / 0 in workspace, all Go test packages green). Out of scope (#2800 has more): - workspace_provision.go (869 lines) split into Docker + CP halves — files would still be 400+ each, marginal value. Defer until a third backend lands and the symmetry breaks. - Splitting Create / List / Get into per-handler files — they're short and tightly coupled to the struct; keep co-located. Closes #2800 partial. 
A follow-up issue will be filed if/when workspace.go or workspace_provision.go grows past 800 lines again. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d21ac991c1
commit
024ef260db
@ -112,222 +112,6 @@ func (h *WorkspaceHandler) SetCPProvisioner(cp provisioner.CPProvisionerAPI) {
|
||||
h.cpProv = cp
|
||||
}
|
||||
|
||||
// HasProvisioner reports whether either backend (CP or local Docker) is
|
||||
// wired. Callers that gate prep-work on "do we have something that can
|
||||
// provision a container?" should use this rather than direct field access
|
||||
// to either provisioner — those individual checks miss the SaaS path
|
||||
// (cpProv set, provisioner nil) or the self-hosted path (provisioner set,
|
||||
// cpProv nil) symmetrically. Org-import + future bulk paths gate their
|
||||
// template/config/secret prep on this so the work isn't wasted on
|
||||
// deployments where no backend is available.
|
||||
func (h *WorkspaceHandler) HasProvisioner() bool {
|
||||
return h.cpProv != nil || h.provisioner != nil
|
||||
}
|
||||
|
||||
// provisionWorkspaceAuto picks the backend (CP for SaaS, local Docker
|
||||
// for self-hosted) and starts provisioning in a goroutine. Returns true
|
||||
// when a backend was kicked off, false when neither is wired.
|
||||
//
|
||||
// Single source of truth for "start provisioning a workspace" across
|
||||
// every caller (Create, OrgHandler.createWorkspaceTree, TeamHandler.Expand,
|
||||
// future paths). Centralized routing here means callers don't repeat
|
||||
// the "Docker vs CP" decision and can't drift on it.
|
||||
//
|
||||
// Self-marks-failed on the no-backend path: pre-2026-05-05 the false
|
||||
// return was silent, and any caller that forgot to handle it (TeamHandler
|
||||
// pre-#2367, OrgHandler.createWorkspaceTree pre-this-fix) silently
|
||||
// dropped workspaces — they sat in 'provisioning' for 10 min until the
|
||||
// sweeper marked them failed with the misleading "container started but
|
||||
// never called /registry/register" message. Marking failed inside Auto
|
||||
// closes that class: even if a future caller bypasses HasProvisioner
|
||||
// gating or ignores the bool return, the workspace ends in a clean
|
||||
// failed state with an actionable error message.
|
||||
//
|
||||
// Architectural principle: templates own runtime/config/prompts/files/
|
||||
// plugins; the platform owns where it runs. Anything that picks
|
||||
// between CP and local Docker belongs in this one helper. Anything
|
||||
// post-routing-but-pre-Start (mint secrets, render template, etc.)
|
||||
// lives in prepareProvisionContext (shared by both per-backend
|
||||
// goroutines).
|
||||
func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool {
|
||||
if h.cpProv != nil {
|
||||
go h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
|
||||
return true
|
||||
}
|
||||
if h.provisioner != nil {
|
||||
go h.provisionWorkspace(workspaceID, templatePath, configFiles, payload)
|
||||
return true
|
||||
}
|
||||
// No backend wired — mark failed so the workspace doesn't linger in
|
||||
// 'provisioning' for the full 10-minute sweep window. 10s is enough
|
||||
// for the broadcast + single UPDATE inside markProvisionFailed.
|
||||
log.Printf("provisionWorkspaceAuto: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID)
|
||||
failCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
h.markProvisionFailed(failCtx, workspaceID,
|
||||
"no provisioning backend available — workspace requires either a Docker daemon (self-hosted) or control-plane provisioner (SaaS)",
|
||||
nil)
|
||||
return false
|
||||
}
|
||||
|
||||
// provisionWorkspaceAutoSync is the synchronous variant of
|
||||
// provisionWorkspaceAuto — it BLOCKS in the current goroutine until the
|
||||
// per-backend provision body returns, instead of spawning a goroutine.
|
||||
//
|
||||
// Used by callers that need to coordinate stop+provision as a pair and
|
||||
// can't return until provision is done — today that's runRestartCycle
|
||||
// (auto-restart cycle's pending-flag loop relies on synchronous return
|
||||
// to know when it's safe to start the next cycle without racing the
|
||||
// in-flight provision goroutine on the next iteration's Stop call).
|
||||
//
|
||||
// Backend selection + no-backend fallback are identical to
|
||||
// provisionWorkspaceAuto. The only difference is the goroutine wrapper.
|
||||
// Keep these two helpers in sync — when one grows a new arm (third
|
||||
// backend, retry semantics), the other should too.
|
||||
func (h *WorkspaceHandler) provisionWorkspaceAutoSync(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool {
|
||||
if h.cpProv != nil {
|
||||
h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
|
||||
return true
|
||||
}
|
||||
if h.provisioner != nil {
|
||||
h.provisionWorkspace(workspaceID, templatePath, configFiles, payload)
|
||||
return true
|
||||
}
|
||||
log.Printf("provisionWorkspaceAutoSync: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID)
|
||||
failCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
h.markProvisionFailed(failCtx, workspaceID,
|
||||
"no provisioning backend available — workspace requires either a Docker daemon (self-hosted) or control-plane provisioner (SaaS)",
|
||||
nil)
|
||||
return false
|
||||
}
|
||||
|
||||
// StopWorkspaceAuto picks the backend (CP for SaaS, local Docker for
|
||||
// self-hosted) and stops the workspace synchronously. Returns nil when
|
||||
// neither backend is wired (a workspace nobody is running can't be
|
||||
// stopped — that's a no-op, not an error).
|
||||
//
|
||||
// Single source of truth for "stop a workspace" — symmetric with
|
||||
// provisionWorkspaceAuto. Pre-2026-05-05 the stop side had no Auto
|
||||
// dispatcher and every caller wrote `if h.provisioner != nil { Stop }`,
|
||||
// which silently leaked EC2s on SaaS:
|
||||
// - team.go:208 (Collapse) — issue #2813
|
||||
// - workspace_crud.go:432 (stopAndRemove during Delete) — issue #2814
|
||||
//
|
||||
// Both bugs reproduced for ~6 months. The pattern is the same drift
|
||||
// class as the org-import provision bug closed by PR #2811.
|
||||
//
|
||||
// Why CP wins when both are wired (matching provisionWorkspaceAuto):
|
||||
// production runs exactly one backend at a time — a SaaS tenant has
|
||||
// cpProv set + provisioner nil; a self-hosted operator has provisioner
|
||||
// set + cpProv nil. The "both set" case only arises in test fixtures,
|
||||
// and the CP-wins ordering matches how Auto picks for provisioning so
|
||||
// the test stubs stay on a single side.
|
||||
//
|
||||
// Volume cleanup (workspace_crud.go) stays Docker-only — CP-managed
|
||||
// workspaces have no volumes to clean. Callers that need that extra
|
||||
// step keep their `if h.provisioner != nil { RemoveVolume(...) }`
|
||||
// gate AFTER calling StopWorkspaceAuto. The abstraction here is "stop
|
||||
// the running workload," not "tear down all state."
|
||||
func (h *WorkspaceHandler) StopWorkspaceAuto(ctx context.Context, workspaceID string) error {
|
||||
if h.cpProv != nil {
|
||||
return h.cpProv.Stop(ctx, workspaceID)
|
||||
}
|
||||
if h.provisioner != nil {
|
||||
return h.provisioner.Stop(ctx, workspaceID)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// RestartWorkspaceAuto stops the running workload (with retry semantics
|
||||
// tuned for the restart hot path) then starts provisioning again, in a
|
||||
// detached goroutine. Returns true when a backend was kicked off, false
|
||||
// when neither is wired (caller owns the persist + mark-failed surface
|
||||
// in that case — symmetric with provisionWorkspaceAuto's bool return).
|
||||
//
|
||||
// Single source of truth for "restart a workspace" — third in the
|
||||
// dispatcher trio alongside provisionWorkspaceAuto and StopWorkspaceAuto.
|
||||
// Phase 1 of #2799 introduces this helper + migrates one caller; the
|
||||
// remaining workspace_restart.go sites (Restart HTTP handler goroutine,
|
||||
// Resume handler, Pause loop) follow in Phase 2/3 because they need
|
||||
// async-context reasoning beyond a fire-and-return dispatcher.
|
||||
//
|
||||
// Retry on the Stop leg is intentional and distinguishes this from
|
||||
// StopWorkspaceAuto:
|
||||
//
|
||||
// - StopWorkspaceAuto (Stop-on-delete contract): no retry, no-backend
|
||||
// is a silent no-op. Different verb, different stakes — a workspace
|
||||
// nobody is running can't be stopped.
|
||||
//
|
||||
// - RestartWorkspaceAuto: bounded exponential backoff on cpProv.Stop
|
||||
// via cpStopWithRetry. Restart's contract is "make the workspace
|
||||
// alive again" — refusing to reprovision when Stop fails strands
|
||||
// the user with a dead workspace and no recovery path other than
|
||||
// manual canvas intervention. Retry absorbs the transient CP/AWS
|
||||
// hiccups that cause most EC2-leak-adjacent incidents. On final
|
||||
// exhaustion, cpStopWithRetry logs LEAK-SUSPECT and proceeds with
|
||||
// reprovision regardless, bridging to the orphan reconciler.
|
||||
//
|
||||
// Docker provisioner.Stop has no retry — a local container that fails
|
||||
// to stop is a local infrastructure problem (OOM, resource pressure)
|
||||
// and retries won't help; the subsequent provision attempt will surface
|
||||
// the underlying daemon failure.
|
||||
//
|
||||
// Architectural note: this helper encapsulates the stop+reprovision
|
||||
// pair. The "which backend for stop" and "which backend for provision"
|
||||
// decisions live here and stay in sync (CP-stop pairs with CP-provision;
|
||||
// Docker-stop pairs with Docker-provision). Callers that need only the
|
||||
// stop half use StopWorkspaceAuto (delete path) or stopForRestart
|
||||
// (restart-path internal helper) directly.
|
||||
//
|
||||
// Payload requirements: caller MUST construct payload from the live
|
||||
// workspace row (name, runtime, tier, model, workspace_dir, etc.) so
|
||||
// the reprovision comes up with the workspace's actual configuration.
|
||||
// runRestartCycle does this synchronously (line ~538) before delegating
|
||||
// — match that pattern in any new caller.
|
||||
func (h *WorkspaceHandler) RestartWorkspaceAuto(ctx context.Context, workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool {
|
||||
return h.RestartWorkspaceAutoOpts(ctx, workspaceID, templatePath, configFiles, payload, false)
|
||||
}
|
||||
|
||||
// RestartWorkspaceAutoOpts is the variant that carries Docker-only
|
||||
// per-invocation knobs that don't fit on CreateWorkspacePayload. Today
|
||||
// the only such knob is resetClaudeSession (issue #12 — clears the
|
||||
// in-container Claude session before restart so the agent comes up
|
||||
// fresh). CP doesn't have a session-reset concept (each EC2 boots from
|
||||
// a fresh image), so the flag is silently ignored on the CP path.
|
||||
//
|
||||
// Most callers should call RestartWorkspaceAuto (resetClaudeSession=
|
||||
// false). The Restart HTTP handler is the one site that exposes the
|
||||
// flag to operators — it reads ?reset_session=true from the query
|
||||
// string when an operator wants to force a fresh session.
|
||||
func (h *WorkspaceHandler) RestartWorkspaceAutoOpts(ctx context.Context, workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload, resetClaudeSession bool) bool {
|
||||
// Stop leg first. CP-first ordering matches the other dispatchers
|
||||
// (provisionWorkspaceAuto, StopWorkspaceAuto) and the convention
|
||||
// documented in docs/architecture/backends.md.
|
||||
if h.cpProv != nil {
|
||||
h.cpStopWithRetry(ctx, workspaceID, "RestartWorkspaceAuto")
|
||||
// resetClaudeSession is Docker-only — CP has no session state to clear.
|
||||
go h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
|
||||
return true
|
||||
}
|
||||
if h.provisioner != nil {
|
||||
// Docker.Stop has no retry — see docstring rationale.
|
||||
h.provisioner.Stop(ctx, workspaceID)
|
||||
go h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, resetClaudeSession)
|
||||
return true
|
||||
}
|
||||
// No backend wired — same shape as provisionWorkspaceAuto's no-backend
|
||||
// arm. Mark the workspace failed so the user sees a meaningful state
|
||||
// rather than a hang. 10s context lets markProvisionFailed broadcast
|
||||
// + UPDATE; the original ctx may already be cancelled.
|
||||
log.Printf("RestartWorkspaceAuto: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID)
|
||||
failCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
h.markProvisionFailed(failCtx, workspaceID,
|
||||
"no provisioning backend available — workspace requires either a Docker daemon (self-hosted) or control-plane provisioner (SaaS)",
|
||||
nil)
|
||||
return false
|
||||
}
|
||||
|
||||
// SetEnvMutators wires a provisionhook.Registry into the handler. Plugins
|
||||
// living in separate repos register on the same Registry instance during
|
||||
|
||||
255
workspace-server/internal/handlers/workspace_dispatchers.go
Normal file
255
workspace-server/internal/handlers/workspace_dispatchers.go
Normal file
@ -0,0 +1,255 @@
|
||||
package handlers
|
||||
|
||||
// workspace_dispatchers.go — Single-source-of-truth dispatchers for the
|
||||
// workspace lifecycle verbs (Create / Stop / Restart). Each helper picks
|
||||
// the right backend (Docker for self-hosted, CP for SaaS) and either
|
||||
// runs the per-backend body in a goroutine or synchronously, depending
|
||||
// on caller need.
|
||||
//
|
||||
// The dispatchers are the architectural boundary between handler code
|
||||
// (HTTP / orchestration) and per-backend implementations
|
||||
// (workspace_provision.go for Docker + CP). Source-level pin tests in
|
||||
// workspace_provision_auto_test.go enforce that handlers route through
|
||||
// these helpers rather than calling the per-backend bodies directly —
|
||||
// see TestNoCallSiteCallsDirectProvisionerExceptAuto, TestNoCallSiteCallsBareStop,
|
||||
// TestNoBareBothNilCheck, TestOrgImportGate_UsesHasProvisionerNotBareField.
|
||||
//
|
||||
// Architectural docs: docs/architecture/backends.md.
|
||||
//
|
||||
// History:
|
||||
// - PR #2811 introduced provisionWorkspaceAuto + HasProvisioner gate
|
||||
// (closed the org-import SaaS-skip silent-drop bug class).
|
||||
// - PR #2824 added StopWorkspaceAuto (closed the team-collapse +
|
||||
// workspace-delete EC2-leak class — issues #2813, #2814).
|
||||
// - PR #2843 + #2846 + #2847 + #2848 added RestartWorkspaceAuto +
|
||||
// RestartWorkspaceAutoOpts + provisionWorkspaceAutoSync and
|
||||
// migrated the four workspace_restart.go dispatch sites.
|
||||
// - This file extracts the helpers from workspace.go so the dispatcher
|
||||
// trio + sync variant + gate accessor are visually co-located,
|
||||
// making it easier for the next contributor to find and add a new
|
||||
// lifecycle verb without inlining dispatch logic.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
|
||||
)
|
||||
|
||||
// HasProvisioner reports whether either backend (CP or local Docker) is
|
||||
// wired. Callers that gate prep-work on "do we have something that can
|
||||
// provision a container?" should use this rather than direct field access
|
||||
// to either provisioner — those individual checks miss the SaaS path
|
||||
// (cpProv set, provisioner nil) or the self-hosted path (provisioner set,
|
||||
// cpProv nil) symmetrically. Org-import + future bulk paths gate their
|
||||
// template/config/secret prep on this so the work isn't wasted on
|
||||
// deployments where no backend is available.
|
||||
func (h *WorkspaceHandler) HasProvisioner() bool {
|
||||
return h.cpProv != nil || h.provisioner != nil
|
||||
}
|
||||
|
||||
// provisionWorkspaceAuto picks the backend (CP for SaaS, local Docker
|
||||
// for self-hosted) and starts provisioning in a goroutine. Returns true
|
||||
// when a backend was kicked off, false when neither is wired.
|
||||
//
|
||||
// Single source of truth for "start provisioning a workspace" across
|
||||
// every caller (Create, OrgHandler.createWorkspaceTree, TeamHandler.Expand,
|
||||
// future paths). Centralized routing here means callers don't repeat
|
||||
// the "Docker vs CP" decision and can't drift on it.
|
||||
//
|
||||
// Self-marks-failed on the no-backend path: pre-2026-05-05 the false
|
||||
// return was silent, and any caller that forgot to handle it (TeamHandler
|
||||
// pre-#2367, OrgHandler.createWorkspaceTree pre-this-fix) silently
|
||||
// dropped workspaces — they sat in 'provisioning' for 10 min until the
|
||||
// sweeper marked them failed with the misleading "container started but
|
||||
// never called /registry/register" message. Marking failed inside Auto
|
||||
// closes that class: even if a future caller bypasses HasProvisioner
|
||||
// gating or ignores the bool return, the workspace ends in a clean
|
||||
// failed state with an actionable error message.
|
||||
//
|
||||
// Architectural principle: templates own runtime/config/prompts/files/
|
||||
// plugins; the platform owns where it runs. Anything that picks
|
||||
// between CP and local Docker belongs in this one helper. Anything
|
||||
// post-routing-but-pre-Start (mint secrets, render template, etc.)
|
||||
// lives in prepareProvisionContext (shared by both per-backend
|
||||
// goroutines).
|
||||
func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool {
|
||||
if h.cpProv != nil {
|
||||
go h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
|
||||
return true
|
||||
}
|
||||
if h.provisioner != nil {
|
||||
go h.provisionWorkspace(workspaceID, templatePath, configFiles, payload)
|
||||
return true
|
||||
}
|
||||
// No backend wired — mark failed so the workspace doesn't linger in
|
||||
// 'provisioning' for the full 10-minute sweep window. 10s is enough
|
||||
// for the broadcast + single UPDATE inside markProvisionFailed.
|
||||
log.Printf("provisionWorkspaceAuto: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID)
|
||||
failCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
h.markProvisionFailed(failCtx, workspaceID,
|
||||
"no provisioning backend available — workspace requires either a Docker daemon (self-hosted) or control-plane provisioner (SaaS)",
|
||||
nil)
|
||||
return false
|
||||
}
|
||||
|
||||
// provisionWorkspaceAutoSync is the synchronous variant of
|
||||
// provisionWorkspaceAuto — it BLOCKS in the current goroutine until the
|
||||
// per-backend provision body returns, instead of spawning a goroutine.
|
||||
//
|
||||
// Used by callers that need to coordinate stop+provision as a pair and
|
||||
// can't return until provision is done — today that's runRestartCycle
|
||||
// (auto-restart cycle's pending-flag loop relies on synchronous return
|
||||
// to know when it's safe to start the next cycle without racing the
|
||||
// in-flight provision goroutine on the next iteration's Stop call).
|
||||
//
|
||||
// Backend selection + no-backend fallback are identical to
|
||||
// provisionWorkspaceAuto. The only difference is the goroutine wrapper.
|
||||
// Keep these two helpers in sync — when one grows a new arm (third
|
||||
// backend, retry semantics), the other should too.
|
||||
func (h *WorkspaceHandler) provisionWorkspaceAutoSync(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool {
|
||||
if h.cpProv != nil {
|
||||
h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
|
||||
return true
|
||||
}
|
||||
if h.provisioner != nil {
|
||||
h.provisionWorkspace(workspaceID, templatePath, configFiles, payload)
|
||||
return true
|
||||
}
|
||||
log.Printf("provisionWorkspaceAutoSync: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID)
|
||||
failCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
h.markProvisionFailed(failCtx, workspaceID,
|
||||
"no provisioning backend available — workspace requires either a Docker daemon (self-hosted) or control-plane provisioner (SaaS)",
|
||||
nil)
|
||||
return false
|
||||
}
|
||||
|
||||
// StopWorkspaceAuto picks the backend (CP for SaaS, local Docker for
|
||||
// self-hosted) and stops the workspace synchronously. Returns nil when
|
||||
// neither backend is wired (a workspace nobody is running can't be
|
||||
// stopped — that's a no-op, not an error).
|
||||
//
|
||||
// Single source of truth for "stop a workspace" — symmetric with
|
||||
// provisionWorkspaceAuto. Pre-2026-05-05 the stop side had no Auto
|
||||
// dispatcher and every caller wrote `if h.provisioner != nil { Stop }`,
|
||||
// which silently leaked EC2s on SaaS:
|
||||
// - team.go:208 (Collapse) — issue #2813
|
||||
// - workspace_crud.go:432 (stopAndRemove during Delete) — issue #2814
|
||||
//
|
||||
// Both bugs reproduced for ~6 months. The pattern is the same drift
|
||||
// class as the org-import provision bug closed by PR #2811.
|
||||
//
|
||||
// Why CP wins when both are wired (matching provisionWorkspaceAuto):
|
||||
// production runs exactly one backend at a time — a SaaS tenant has
|
||||
// cpProv set + provisioner nil; a self-hosted operator has provisioner
|
||||
// set + cpProv nil. The "both set" case only arises in test fixtures,
|
||||
// and the CP-wins ordering matches how Auto picks for provisioning so
|
||||
// the test stubs stay on a single side.
|
||||
//
|
||||
// Volume cleanup (workspace_crud.go) stays Docker-only — CP-managed
|
||||
// workspaces have no volumes to clean. Callers that need that extra
|
||||
// step keep their `if h.provisioner != nil { RemoveVolume(...) }`
|
||||
// gate AFTER calling StopWorkspaceAuto. The abstraction here is "stop
|
||||
// the running workload," not "tear down all state."
|
||||
func (h *WorkspaceHandler) StopWorkspaceAuto(ctx context.Context, workspaceID string) error {
|
||||
if h.cpProv != nil {
|
||||
return h.cpProv.Stop(ctx, workspaceID)
|
||||
}
|
||||
if h.provisioner != nil {
|
||||
return h.provisioner.Stop(ctx, workspaceID)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// RestartWorkspaceAuto stops the running workload (with retry semantics
|
||||
// tuned for the restart hot path) then starts provisioning again, in a
|
||||
// detached goroutine. Returns true when a backend was kicked off, false
|
||||
// when neither is wired (caller owns the persist + mark-failed surface
|
||||
// in that case — symmetric with provisionWorkspaceAuto's bool return).
|
||||
//
|
||||
// Single source of truth for "restart a workspace" — third in the
|
||||
// dispatcher trio alongside provisionWorkspaceAuto and StopWorkspaceAuto.
|
||||
// Phase 1 of #2799 introduces this helper + migrates one caller; the
|
||||
// remaining workspace_restart.go sites (Restart HTTP handler goroutine,
|
||||
// Resume handler, Pause loop) follow in Phase 2/3 because they need
|
||||
// async-context reasoning beyond a fire-and-return dispatcher.
|
||||
//
|
||||
// Retry on the Stop leg is intentional and distinguishes this from
|
||||
// StopWorkspaceAuto:
|
||||
//
|
||||
// - StopWorkspaceAuto (Stop-on-delete contract): no retry, no-backend
|
||||
// is a silent no-op. Different verb, different stakes — a workspace
|
||||
// nobody is running can't be stopped.
|
||||
//
|
||||
// - RestartWorkspaceAuto: bounded exponential backoff on cpProv.Stop
|
||||
// via cpStopWithRetry. Restart's contract is "make the workspace
|
||||
// alive again" — refusing to reprovision when Stop fails strands
|
||||
// the user with a dead workspace and no recovery path other than
|
||||
// manual canvas intervention. Retry absorbs the transient CP/AWS
|
||||
// hiccups that cause most EC2-leak-adjacent incidents. On final
|
||||
// exhaustion, cpStopWithRetry logs LEAK-SUSPECT and proceeds with
|
||||
// reprovision regardless, bridging to the orphan reconciler.
|
||||
//
|
||||
// Docker provisioner.Stop has no retry — a local container that fails
|
||||
// to stop is a local infrastructure problem (OOM, resource pressure)
|
||||
// and retries won't help; the subsequent provision attempt will surface
|
||||
// the underlying daemon failure.
|
||||
//
|
||||
// Architectural note: this helper encapsulates the stop+reprovision
|
||||
// pair. The "which backend for stop" and "which backend for provision"
|
||||
// decisions live here and stay in sync (CP-stop pairs with CP-provision;
|
||||
// Docker-stop pairs with Docker-provision). Callers that need only the
|
||||
// stop half use StopWorkspaceAuto (delete path) or stopForRestart
|
||||
// (restart-path internal helper) directly.
|
||||
//
|
||||
// Payload requirements: caller MUST construct payload from the live
|
||||
// workspace row (name, runtime, tier, model, workspace_dir, etc.) so
|
||||
// the reprovision comes up with the workspace's actual configuration.
|
||||
// runRestartCycle does this synchronously (line ~538) before delegating
|
||||
// — match that pattern in any new caller.
|
||||
func (h *WorkspaceHandler) RestartWorkspaceAuto(ctx context.Context, workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool {
|
||||
return h.RestartWorkspaceAutoOpts(ctx, workspaceID, templatePath, configFiles, payload, false)
|
||||
}
|
||||
|
||||
// RestartWorkspaceAutoOpts is the variant that carries Docker-only
|
||||
// per-invocation knobs that don't fit on CreateWorkspacePayload. Today
|
||||
// the only such knob is resetClaudeSession (issue #12 — clears the
|
||||
// in-container Claude session before restart so the agent comes up
|
||||
// fresh). CP doesn't have a session-reset concept (each EC2 boots from
|
||||
// a fresh image), so the flag is silently ignored on the CP path.
|
||||
//
|
||||
// Most callers should call RestartWorkspaceAuto (resetClaudeSession=
|
||||
// false). The Restart HTTP handler is the one site that exposes the
|
||||
// flag to operators — it reads ?reset_session=true from the query
|
||||
// string when an operator wants to force a fresh session.
|
||||
func (h *WorkspaceHandler) RestartWorkspaceAutoOpts(ctx context.Context, workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload, resetClaudeSession bool) bool {
|
||||
// Stop leg first. CP-first ordering matches the other dispatchers
|
||||
// (provisionWorkspaceAuto, StopWorkspaceAuto) and the convention
|
||||
// documented in docs/architecture/backends.md.
|
||||
if h.cpProv != nil {
|
||||
h.cpStopWithRetry(ctx, workspaceID, "RestartWorkspaceAuto")
|
||||
// resetClaudeSession is Docker-only — CP has no session state to clear.
|
||||
go h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
|
||||
return true
|
||||
}
|
||||
if h.provisioner != nil {
|
||||
// Docker.Stop has no retry — see docstring rationale.
|
||||
h.provisioner.Stop(ctx, workspaceID)
|
||||
go h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, resetClaudeSession)
|
||||
return true
|
||||
}
|
||||
// No backend wired — same shape as provisionWorkspaceAuto's no-backend
|
||||
// arm. Mark the workspace failed so the user sees a meaningful state
|
||||
// rather than a hang. 10s context lets markProvisionFailed broadcast
|
||||
// + UPDATE; the original ctx may already be cancelled.
|
||||
log.Printf("RestartWorkspaceAuto: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID)
|
||||
failCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
h.markProvisionFailed(failCtx, workspaceID,
|
||||
"no provisioning backend available — workspace requires either a Docker daemon (self-hosted) or control-plane provisioner (SaaS)",
|
||||
nil)
|
||||
return false
|
||||
}
|
||||
@ -234,9 +234,14 @@ func TestNoCallSiteCallsDirectProvisionerExceptAuto(t *testing.T) {
|
||||
".provisionWorkspaceCP(",
|
||||
}
|
||||
allowedFiles := map[string]bool{
|
||||
// workspace.go DEFINES the methods + the Auto dispatcher; it's
|
||||
// allowed to reference them directly.
|
||||
// workspace.go holds the WorkspaceHandler struct + constructor.
|
||||
"workspace.go": true,
|
||||
// workspace_dispatchers.go IS the Auto dispatcher — calls the
|
||||
// per-backend bodies directly via `go h.provisionWorkspaceCP(...)`
|
||||
// / `go h.provisionWorkspace(...)`. The whole point of this gate
|
||||
// is "every OTHER caller routes through the dispatcher; the
|
||||
// dispatcher itself routes through the per-backend body".
|
||||
"workspace_dispatchers.go": true,
|
||||
// workspace_provision.go DEFINES the bodies of the direct
|
||||
// methods (and the Auto-internal call from CP-mode itself).
|
||||
"workspace_provision.go": true,
|
||||
@ -571,10 +576,11 @@ func TestNoCallSiteCallsBareStop(t *testing.T) {
|
||||
".cpProv.Stop(",
|
||||
}
|
||||
allowedFiles := map[string]bool{
|
||||
"workspace.go": true,
|
||||
"workspace_provision.go": true,
|
||||
"workspace_restart.go": true,
|
||||
"container_files.go": true,
|
||||
"workspace.go": true,
|
||||
"workspace_dispatchers.go": true,
|
||||
"workspace_provision.go": true,
|
||||
"workspace_restart.go": true,
|
||||
"container_files.go": true,
|
||||
}
|
||||
for _, entry := range entries {
|
||||
name := entry.Name()
|
||||
|
||||
Loading…
Reference in New Issue
Block a user