Merge pull request #2431 from Molecule-AI/fix/restart-async-stop
Move /restart Stop into the async goroutine
This commit is contained in:
commit
0c51df989b
@ -143,18 +143,6 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stop existing container / terminate existing EC2. Pick the matching
|
|
||||||
// stop path. CPProvisioner.Stop calls DELETE /cp/workspaces/:id to
|
|
||||||
// terminate the workspace EC2; the subsequent provision call launches
|
|
||||||
// a fresh one with the latest secrets + config.
|
|
||||||
if h.provisioner != nil {
|
|
||||||
h.provisioner.Stop(ctx, id)
|
|
||||||
} else if h.cpProv != nil {
|
|
||||||
if err := h.cpProv.Stop(ctx, id); err != nil {
|
|
||||||
log.Printf("Restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", id, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reset to provisioning
|
// Reset to provisioning
|
||||||
db.DB.ExecContext(ctx,
|
db.DB.ExecContext(ctx,
|
||||||
`UPDATE workspaces SET status = $1, url = '', updated_at = now() WHERE id = $2`, models.StatusProvisioning, id)
|
`UPDATE workspaces SET status = $1, url = '', updated_at = now() WHERE id = $2`, models.StatusProvisioning, id)
|
||||||
@ -214,11 +202,33 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
|
|||||||
// Dispatch to the correct provisioner. provisionWorkspaceOpts is the
|
// Dispatch to the correct provisioner. provisionWorkspaceOpts is the
|
||||||
// Docker path; provisionWorkspaceCP is the SaaS path. The Create
|
// Docker path; provisionWorkspaceCP is the SaaS path. The Create
|
||||||
// handler already branches this way; Restart now mirrors it.
|
// handler already branches this way; Restart now mirrors it.
|
||||||
if h.cpProv != nil {
|
//
|
||||||
go h.provisionWorkspaceCP(id, templatePath, configFiles, payload)
|
// Stop runs inside this goroutine — NOT before the response — because
|
||||||
} else {
|
// CPProvisioner.Stop is synchronous DELETE /cp/workspaces/:id →
|
||||||
go h.provisionWorkspaceOpts(id, templatePath, configFiles, payload, resetClaudeSession)
|
// CP → AWS EC2 terminate, which can exceed the canvas's 15s default
|
||||||
}
|
// HTTP timeout when the platform has just redeployed (every tenant's
|
||||||
|
// CP request queues at once). Pre-fix the user saw a misleading
|
||||||
|
// "signal timed out" error on the canvas even though the restart
|
||||||
|
// actually succeeded — caught 2026-04-30 on hongmingwang hermes
|
||||||
|
// workspace 32993ee7-…cb9d75d112a5 right after the heartbeat-fix
|
||||||
|
// platform redeploy. Use context.Background() to detach from the
|
||||||
|
// request lifecycle so an aborted client connection doesn't cancel
|
||||||
|
// the in-flight Stop/provision pair.
|
||||||
|
go func() {
|
||||||
|
bgCtx := context.Background()
|
||||||
|
if h.provisioner != nil {
|
||||||
|
h.provisioner.Stop(bgCtx, id)
|
||||||
|
} else if h.cpProv != nil {
|
||||||
|
if err := h.cpProv.Stop(bgCtx, id); err != nil {
|
||||||
|
log.Printf("Restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", id, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if h.cpProv != nil {
|
||||||
|
h.provisionWorkspaceCP(id, templatePath, configFiles, payload)
|
||||||
|
} else {
|
||||||
|
h.provisionWorkspaceOpts(id, templatePath, configFiles, payload, resetClaudeSession)
|
||||||
|
}
|
||||||
|
}()
|
||||||
go h.sendRestartContext(id, restartData)
|
go h.sendRestartContext(id, restartData)
|
||||||
|
|
||||||
c.JSON(http.StatusOK, gin.H{"status": "provisioning", "config_dir": configLabel, "reset_session": resetClaudeSession})
|
c.JSON(http.StatusOK, gin.H{"status": "provisioning", "config_dir": configLabel, "reset_session": resetClaudeSession})
|
||||||
|
|||||||
@ -0,0 +1,128 @@
|
|||||||
|
package handlers
|
||||||
|
|
||||||
|
// workspace_restart_async_test.go — behavior-based AST gate that pins
|
||||||
|
// the invariant introduced 2026-04-30 in fix/restart-async-stop:
|
||||||
|
//
|
||||||
|
// The /workspaces/:id/restart handler MUST NOT call provisioner.Stop
|
||||||
|
// or cpProv.Stop synchronously before returning the HTTP response.
|
||||||
|
//
|
||||||
|
// Why: CPProvisioner.Stop is `DELETE /cp/workspaces/:id?instance_id=...`
|
||||||
|
// → CP → AWS EC2 terminate. Right after a platform-wide redeploy (every
|
||||||
|
// tenant queues a CP request at once) this can exceed the canvas's 15s
|
||||||
|
// HTTP timeout, surfacing as a misleading "signal timed out" red banner
|
||||||
|
// even though the async re-provision goroutine continues and the restart
|
||||||
|
// actually succeeds. Caught on hongmingwang hermes workspace
|
||||||
|
// 32993ee7-…cb9d75d112a5 right after the heartbeat-fix redeploy.
|
||||||
|
//
|
||||||
|
// The fix moves Stop into the same goroutine as provisionWorkspaceCP /
|
||||||
|
// provisionWorkspaceOpts so the handler returns ~immediately (only the
|
||||||
|
// metadata lookup + DB UPDATE remain on the response path). This gate
|
||||||
|
// catches anyone re-introducing the pre-fix shape — easy to do because
|
||||||
|
// "stop, then provision" reads as natural ordering at the handler top
|
||||||
|
// level.
|
||||||
|
//
|
||||||
|
// Behavior-based: matches `<recv>.{provisioner,cpProv}.Stop(...)` calls
|
||||||
|
// regardless of variable names; only the AST shape matters. A future
|
||||||
|
// rename of either field is still pinned, same family as the
|
||||||
|
// callsProvisionStart gate in workspace_provision_shared_test.go.
|
||||||
|
|
||||||
|
import (
|
||||||
|
"go/ast"
|
||||||
|
"go/parser"
|
||||||
|
"go/token"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestRestart_StopRunsInsideGoroutine asserts the /restart handler does
|
||||||
|
// not call provisioner.Stop / cpProv.Stop at the top level of its body.
|
||||||
|
// Any such call must be nested inside a *ast.FuncLit (the `go func() {
|
||||||
|
// ... }()` block) so the HTTP response can return before Stop completes.
|
||||||
|
func TestRestart_StopRunsInsideGoroutine(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
fset := token.NewFileSet()
|
||||||
|
f, err := parser.ParseFile(fset, filepath.Join(".", "workspace_restart.go"), nil, 0)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("parse workspace_restart.go: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var restartFn *ast.FuncDecl
|
||||||
|
for _, decl := range f.Decls {
|
||||||
|
fn, ok := decl.(*ast.FuncDecl)
|
||||||
|
if !ok || fn.Body == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if fn.Name.Name == "Restart" && fn.Recv != nil {
|
||||||
|
restartFn = fn
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if restartFn == nil {
|
||||||
|
t.Fatal("Restart method not found in workspace_restart.go — did the handler get renamed?")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Walk the function body. For every call to <recv>.<provField>.Stop,
|
||||||
|
// record whether it appears at the top level (any ancestor frame is
|
||||||
|
// the Restart body) or inside a FuncLit (i.e. inside `go func() { ... }`).
|
||||||
|
type violation struct {
|
||||||
|
line int
|
||||||
|
field string
|
||||||
|
}
|
||||||
|
var topLevelStops []violation
|
||||||
|
|
||||||
|
var inspect func(node ast.Node, insideFuncLit bool)
|
||||||
|
inspect = func(node ast.Node, insideFuncLit bool) {
|
||||||
|
if node == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
switch n := node.(type) {
|
||||||
|
case *ast.FuncLit:
|
||||||
|
// Descend into the func literal body, but mark all
|
||||||
|
// children as inside-FuncLit. This is the goroutine
|
||||||
|
// boundary we want.
|
||||||
|
ast.Inspect(n, func(child ast.Node) bool {
|
||||||
|
if child == n {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
inspect(child, true)
|
||||||
|
return false
|
||||||
|
})
|
||||||
|
return
|
||||||
|
case *ast.CallExpr:
|
||||||
|
if sel, ok := n.Fun.(*ast.SelectorExpr); ok && sel.Sel.Name == "Stop" {
|
||||||
|
if inner, ok := sel.X.(*ast.SelectorExpr); ok {
|
||||||
|
switch inner.Sel.Name {
|
||||||
|
case "provisioner", "cpProv":
|
||||||
|
if !insideFuncLit {
|
||||||
|
topLevelStops = append(topLevelStops, violation{
|
||||||
|
line: fset.Position(n.Pos()).Line,
|
||||||
|
field: inner.Sel.Name,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Continue walking children of non-FuncLit nodes.
|
||||||
|
ast.Inspect(node, func(child ast.Node) bool {
|
||||||
|
if child == node {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
inspect(child, insideFuncLit)
|
||||||
|
return false
|
||||||
|
})
|
||||||
|
}
|
||||||
|
inspect(restartFn.Body, false)
|
||||||
|
|
||||||
|
for _, v := range topLevelStops {
|
||||||
|
t.Errorf(
|
||||||
|
"workspace_restart.go:%d Restart calls h.%s.Stop synchronously at the top level. "+
|
||||||
|
"Stop must run inside the `go func() { ... }()` goroutine that wraps "+
|
||||||
|
"provisionWorkspaceCP/provisionWorkspaceOpts — otherwise the canvas's 15s "+
|
||||||
|
"HTTP timeout fires before the response, surfacing 'signal timed out' even "+
|
||||||
|
"when the restart actually succeeds. See fix/restart-async-stop (2026-04-30).",
|
||||||
|
v.line, v.field,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue
Block a user