Merge pull request #2432 from Molecule-AI/staging
staging → main: auto-promote 0c51df9
This commit is contained in:
commit
ed29ad0d2a
@ -143,18 +143,6 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// Stop existing container / terminate existing EC2. Pick the matching
|
||||
// stop path. CPProvisioner.Stop calls DELETE /cp/workspaces/:id to
|
||||
// terminate the workspace EC2; the subsequent provision call launches
|
||||
// a fresh one with the latest secrets + config.
|
||||
if h.provisioner != nil {
|
||||
h.provisioner.Stop(ctx, id)
|
||||
} else if h.cpProv != nil {
|
||||
if err := h.cpProv.Stop(ctx, id); err != nil {
|
||||
log.Printf("Restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", id, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Reset to provisioning
|
||||
db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = $1, url = '', updated_at = now() WHERE id = $2`, models.StatusProvisioning, id)
|
||||
@ -214,11 +202,33 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
|
||||
// Dispatch to the correct provisioner. provisionWorkspaceOpts is the
|
||||
// Docker path; provisionWorkspaceCP is the SaaS path. The Create
|
||||
// handler already branches this way; Restart now mirrors it.
|
||||
if h.cpProv != nil {
|
||||
go h.provisionWorkspaceCP(id, templatePath, configFiles, payload)
|
||||
} else {
|
||||
go h.provisionWorkspaceOpts(id, templatePath, configFiles, payload, resetClaudeSession)
|
||||
}
|
||||
//
|
||||
// Stop runs inside this goroutine — NOT before the response — because
|
||||
// CPProvisioner.Stop is synchronous DELETE /cp/workspaces/:id →
|
||||
// CP → AWS EC2 terminate, which can exceed the canvas's 15s default
|
||||
// HTTP timeout when the platform has just redeployed (every tenant's
|
||||
// CP request queues at once). Pre-fix the user saw a misleading
|
||||
// "signal timed out" error on the canvas even though the restart
|
||||
// actually succeeded — caught 2026-04-30 on hongmingwang hermes
|
||||
// workspace 32993ee7-…cb9d75d112a5 right after the heartbeat-fix
|
||||
// platform redeploy. Use context.Background() to detach from the
|
||||
// request lifecycle so an aborted client connection doesn't cancel
|
||||
// the in-flight Stop/provision pair.
|
||||
go func() {
|
||||
bgCtx := context.Background()
|
||||
if h.provisioner != nil {
|
||||
h.provisioner.Stop(bgCtx, id)
|
||||
} else if h.cpProv != nil {
|
||||
if err := h.cpProv.Stop(bgCtx, id); err != nil {
|
||||
log.Printf("Restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", id, err)
|
||||
}
|
||||
}
|
||||
if h.cpProv != nil {
|
||||
h.provisionWorkspaceCP(id, templatePath, configFiles, payload)
|
||||
} else {
|
||||
h.provisionWorkspaceOpts(id, templatePath, configFiles, payload, resetClaudeSession)
|
||||
}
|
||||
}()
|
||||
go h.sendRestartContext(id, restartData)
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"status": "provisioning", "config_dir": configLabel, "reset_session": resetClaudeSession})
|
||||
|
||||
@ -0,0 +1,128 @@
|
||||
package handlers
|
||||
|
||||
// workspace_restart_async_test.go — behavior-based AST gate that pins
|
||||
// the invariant introduced 2026-04-30 in fix/restart-async-stop:
|
||||
//
|
||||
// The /workspaces/:id/restart handler MUST NOT call provisioner.Stop
|
||||
// or cpProv.Stop synchronously before returning the HTTP response.
|
||||
//
|
||||
// Why: CPProvisioner.Stop is `DELETE /cp/workspaces/:id?instance_id=...`
|
||||
// → CP → AWS EC2 terminate. Right after a platform-wide redeploy (every
|
||||
// tenant queues a CP request at once) this can exceed the canvas's 15s
|
||||
// HTTP timeout, surfacing as a misleading "signal timed out" red banner
|
||||
// even though the async re-provision goroutine continues and the restart
|
||||
// actually succeeds. Caught on hongmingwang hermes workspace
|
||||
// 32993ee7-…cb9d75d112a5 right after the heartbeat-fix redeploy.
|
||||
//
|
||||
// The fix moves Stop into the same goroutine as provisionWorkspaceCP /
|
||||
// provisionWorkspaceOpts so the handler returns ~immediately (only the
|
||||
// metadata lookup + DB UPDATE remain on the response path). This gate
|
||||
// catches anyone re-introducing the pre-fix shape — easy to do because
|
||||
// "stop, then provision" reads as natural ordering at the handler top
|
||||
// level.
|
||||
//
|
||||
// Behavior-based: matches `<recv>.{provisioner,cpProv}.Stop(...)` calls
|
||||
// regardless of variable names; only the AST shape matters. A future
|
||||
// rename of either field is still pinned, same family as the
|
||||
// callsProvisionStart gate in workspace_provision_shared_test.go.
|
||||
|
||||
import (
|
||||
"go/ast"
|
||||
"go/parser"
|
||||
"go/token"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestRestart_StopRunsInsideGoroutine asserts the /restart handler does
|
||||
// not call provisioner.Stop / cpProv.Stop at the top level of its body.
|
||||
// Any such call must be nested inside a *ast.FuncLit (the `go func() {
|
||||
// ... }()` block) so the HTTP response can return before Stop completes.
|
||||
func TestRestart_StopRunsInsideGoroutine(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
fset := token.NewFileSet()
|
||||
f, err := parser.ParseFile(fset, filepath.Join(".", "workspace_restart.go"), nil, 0)
|
||||
if err != nil {
|
||||
t.Fatalf("parse workspace_restart.go: %v", err)
|
||||
}
|
||||
|
||||
var restartFn *ast.FuncDecl
|
||||
for _, decl := range f.Decls {
|
||||
fn, ok := decl.(*ast.FuncDecl)
|
||||
if !ok || fn.Body == nil {
|
||||
continue
|
||||
}
|
||||
if fn.Name.Name == "Restart" && fn.Recv != nil {
|
||||
restartFn = fn
|
||||
break
|
||||
}
|
||||
}
|
||||
if restartFn == nil {
|
||||
t.Fatal("Restart method not found in workspace_restart.go — did the handler get renamed?")
|
||||
}
|
||||
|
||||
// Walk the function body. For every call to <recv>.<provField>.Stop,
|
||||
// record whether it appears at the top level (any ancestor frame is
|
||||
// the Restart body) or inside a FuncLit (i.e. inside `go func() { ... }`).
|
||||
type violation struct {
|
||||
line int
|
||||
field string
|
||||
}
|
||||
var topLevelStops []violation
|
||||
|
||||
var inspect func(node ast.Node, insideFuncLit bool)
|
||||
inspect = func(node ast.Node, insideFuncLit bool) {
|
||||
if node == nil {
|
||||
return
|
||||
}
|
||||
switch n := node.(type) {
|
||||
case *ast.FuncLit:
|
||||
// Descend into the func literal body, but mark all
|
||||
// children as inside-FuncLit. This is the goroutine
|
||||
// boundary we want.
|
||||
ast.Inspect(n, func(child ast.Node) bool {
|
||||
if child == n {
|
||||
return true
|
||||
}
|
||||
inspect(child, true)
|
||||
return false
|
||||
})
|
||||
return
|
||||
case *ast.CallExpr:
|
||||
if sel, ok := n.Fun.(*ast.SelectorExpr); ok && sel.Sel.Name == "Stop" {
|
||||
if inner, ok := sel.X.(*ast.SelectorExpr); ok {
|
||||
switch inner.Sel.Name {
|
||||
case "provisioner", "cpProv":
|
||||
if !insideFuncLit {
|
||||
topLevelStops = append(topLevelStops, violation{
|
||||
line: fset.Position(n.Pos()).Line,
|
||||
field: inner.Sel.Name,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Continue walking children of non-FuncLit nodes.
|
||||
ast.Inspect(node, func(child ast.Node) bool {
|
||||
if child == node {
|
||||
return true
|
||||
}
|
||||
inspect(child, insideFuncLit)
|
||||
return false
|
||||
})
|
||||
}
|
||||
inspect(restartFn.Body, false)
|
||||
|
||||
for _, v := range topLevelStops {
|
||||
t.Errorf(
|
||||
"workspace_restart.go:%d Restart calls h.%s.Stop synchronously at the top level. "+
|
||||
"Stop must run inside the `go func() { ... }()` goroutine that wraps "+
|
||||
"provisionWorkspaceCP/provisionWorkspaceOpts — otherwise the canvas's 15s "+
|
||||
"HTTP timeout fires before the response, surfacing 'signal timed out' even "+
|
||||
"when the restart actually succeeds. See fix/restart-async-stop (2026-04-30).",
|
||||
v.line, v.field,
|
||||
)
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user