Merge pull request #2431 from Molecule-AI/fix/restart-async-stop

Move /restart Stop into the async goroutine
This commit is contained in:
Hongming Wang 2026-05-01 02:38:29 +00:00 committed by GitHub
commit 0c51df989b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 155 additions and 17 deletions

View File

@ -143,18 +143,6 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
} }
} }
// Stop existing container / terminate existing EC2. Pick the matching
// stop path. CPProvisioner.Stop calls DELETE /cp/workspaces/:id to
// terminate the workspace EC2; the subsequent provision call launches
// a fresh one with the latest secrets + config.
if h.provisioner != nil {
h.provisioner.Stop(ctx, id)
} else if h.cpProv != nil {
if err := h.cpProv.Stop(ctx, id); err != nil {
log.Printf("Restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", id, err)
}
}
// Reset to provisioning // Reset to provisioning
db.DB.ExecContext(ctx, db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = $1, url = '', updated_at = now() WHERE id = $2`, models.StatusProvisioning, id) `UPDATE workspaces SET status = $1, url = '', updated_at = now() WHERE id = $2`, models.StatusProvisioning, id)
@ -214,11 +202,33 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
// Dispatch to the correct provisioner. provisionWorkspaceOpts is the // Dispatch to the correct provisioner. provisionWorkspaceOpts is the
// Docker path; provisionWorkspaceCP is the SaaS path. The Create // Docker path; provisionWorkspaceCP is the SaaS path. The Create
// handler already branches this way; Restart now mirrors it. // handler already branches this way; Restart now mirrors it.
if h.cpProv != nil { //
go h.provisionWorkspaceCP(id, templatePath, configFiles, payload) // Stop runs inside this goroutine — NOT before the response — because
} else { // CPProvisioner.Stop is synchronous DELETE /cp/workspaces/:id →
go h.provisionWorkspaceOpts(id, templatePath, configFiles, payload, resetClaudeSession) // CP → AWS EC2 terminate, which can exceed the canvas's 15s default
} // HTTP timeout when the platform has just redeployed (every tenant's
// CP request queues at once). Pre-fix the user saw a misleading
// "signal timed out" error on the canvas even though the restart
// actually succeeded — caught 2026-04-30 on hongmingwang hermes
// workspace 32993ee7-…cb9d75d112a5 right after the heartbeat-fix
// platform redeploy. Use context.Background() to detach from the
// request lifecycle so an aborted client connection doesn't cancel
// the in-flight Stop/provision pair.
go func() {
bgCtx := context.Background()
if h.provisioner != nil {
h.provisioner.Stop(bgCtx, id)
} else if h.cpProv != nil {
if err := h.cpProv.Stop(bgCtx, id); err != nil {
log.Printf("Restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", id, err)
}
}
if h.cpProv != nil {
h.provisionWorkspaceCP(id, templatePath, configFiles, payload)
} else {
h.provisionWorkspaceOpts(id, templatePath, configFiles, payload, resetClaudeSession)
}
}()
go h.sendRestartContext(id, restartData) go h.sendRestartContext(id, restartData)
c.JSON(http.StatusOK, gin.H{"status": "provisioning", "config_dir": configLabel, "reset_session": resetClaudeSession}) c.JSON(http.StatusOK, gin.H{"status": "provisioning", "config_dir": configLabel, "reset_session": resetClaudeSession})

View File

@ -0,0 +1,128 @@
package handlers
// workspace_restart_async_test.go — behavior-based AST gate that pins
// the invariant introduced 2026-04-30 in fix/restart-async-stop:
//
// The /workspaces/:id/restart handler MUST NOT call provisioner.Stop
// or cpProv.Stop synchronously before returning the HTTP response.
//
// Why: CPProvisioner.Stop is `DELETE /cp/workspaces/:id?instance_id=...`
// → CP → AWS EC2 terminate. Right after a platform-wide redeploy (every
// tenant queues a CP request at once) this can exceed the canvas's 15s
// HTTP timeout, surfacing as a misleading "signal timed out" red banner
// even though the async re-provision goroutine continues and the restart
// actually succeeds. Caught on hongmingwang hermes workspace
// 32993ee7-…cb9d75d112a5 right after the heartbeat-fix redeploy.
//
// The fix moves Stop into the same goroutine as provisionWorkspaceCP /
// provisionWorkspaceOpts so the handler returns ~immediately (only the
// metadata lookup + DB UPDATE remain on the response path). This gate
// catches anyone re-introducing the pre-fix shape — easy to do because
// "stop, then provision" reads as natural ordering at the handler top
// level.
//
// Behavior-based: matches `<recv>.{provisioner,cpProv}.Stop(...)` calls
// regardless of variable names; only the AST shape matters. A future
// rename of either field is still pinned, same family as the
// callsProvisionStart gate in workspace_provision_shared_test.go.
import (
"go/ast"
"go/parser"
"go/token"
"path/filepath"
"testing"
)
// TestRestart_StopRunsInsideGoroutine asserts the /restart handler does
// not call provisioner.Stop / cpProv.Stop at the top level of its body.
// Any such call must be nested inside a *ast.FuncLit (the `go func() {
// ... }()` block) so the HTTP response can return before Stop completes.
func TestRestart_StopRunsInsideGoroutine(t *testing.T) {
t.Parallel()
fset := token.NewFileSet()
f, err := parser.ParseFile(fset, filepath.Join(".", "workspace_restart.go"), nil, 0)
if err != nil {
t.Fatalf("parse workspace_restart.go: %v", err)
}
var restartFn *ast.FuncDecl
for _, decl := range f.Decls {
fn, ok := decl.(*ast.FuncDecl)
if !ok || fn.Body == nil {
continue
}
if fn.Name.Name == "Restart" && fn.Recv != nil {
restartFn = fn
break
}
}
if restartFn == nil {
t.Fatal("Restart method not found in workspace_restart.go — did the handler get renamed?")
}
// Walk the function body. For every call to <recv>.<provField>.Stop,
// record whether it appears at the top level (any ancestor frame is
// the Restart body) or inside a FuncLit (i.e. inside `go func() { ... }`).
type violation struct {
line int
field string
}
var topLevelStops []violation
var inspect func(node ast.Node, insideFuncLit bool)
inspect = func(node ast.Node, insideFuncLit bool) {
if node == nil {
return
}
switch n := node.(type) {
case *ast.FuncLit:
// Descend into the func literal body, but mark all
// children as inside-FuncLit. This is the goroutine
// boundary we want.
ast.Inspect(n, func(child ast.Node) bool {
if child == n {
return true
}
inspect(child, true)
return false
})
return
case *ast.CallExpr:
if sel, ok := n.Fun.(*ast.SelectorExpr); ok && sel.Sel.Name == "Stop" {
if inner, ok := sel.X.(*ast.SelectorExpr); ok {
switch inner.Sel.Name {
case "provisioner", "cpProv":
if !insideFuncLit {
topLevelStops = append(topLevelStops, violation{
line: fset.Position(n.Pos()).Line,
field: inner.Sel.Name,
})
}
}
}
}
}
// Continue walking children of non-FuncLit nodes.
ast.Inspect(node, func(child ast.Node) bool {
if child == node {
return true
}
inspect(child, insideFuncLit)
return false
})
}
inspect(restartFn.Body, false)
for _, v := range topLevelStops {
t.Errorf(
"workspace_restart.go:%d Restart calls h.%s.Stop synchronously at the top level. "+
"Stop must run inside the `go func() { ... }()` goroutine that wraps "+
"provisionWorkspaceCP/provisionWorkspaceOpts — otherwise the canvas's 15s "+
"HTTP timeout fires before the response, surfacing 'signal timed out' even "+
"when the restart actually succeeds. See fix/restart-async-stop (2026-04-30).",
v.line, v.field,
)
}
}