From f6ddcf66ab69eaf50e64a8ae76fe191b120fde04 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 19:35:29 -0700 Subject: [PATCH] Move /restart Stop into the async goroutine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-fix Restart called provisioner.Stop / cpProv.Stop synchronously before returning the HTTP response. CPProvisioner.Stop is DELETE /cp/workspaces/:id → CP → AWS EC2 terminate, which can exceed the canvas's 15s HTTP timeout, especially right after a platform-wide redeploy when every tenant queues a CP request at once. The user sees a misleading "signal timed out" red banner on Save & Restart even though the async re-provision goroutine continues and the workspace ends up online. Caught 2026-04-30 on hongmingwang hermes workspace 32993ee7-…cb9d75d112a5 right after the heartbeat-fix platform redeploy at 02:11Z. The workspace came back online correctly; only the canvas response timed out. Fix moves Stop into the same goroutine as provisionWorkspaceCP / provisionWorkspaceOpts. The handler now responds in <500ms (DB lookup + status UPDATE only). Stop and provision keep their existing ordering inside the goroutine. Uses context.Background() to detach from the request lifecycle so an aborted client connection doesn't cancel the in-flight Stop/provision pair. Pinned by a behavior-based AST gate (workspace_restart_async_test.go): the test parses workspace_restart.go and walks the Restart function body, flagging any .{provisioner,cpProv}.Stop call that isn't nested in a *ast.FuncLit. Same family as callsProvisionStart in workspace_provision_shared_test.go. Verified the gate fails on the pre-fix shape (flags lines 151 and 153 — the original sync Stop calls). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/workspace_restart.go | 44 +++--- .../handlers/workspace_restart_async_test.go | 128 ++++++++++++++++++ 2 files changed, 155 insertions(+), 17 deletions(-) create mode 100644 workspace-server/internal/handlers/workspace_restart_async_test.go diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go index c9cbd774..e5cdd5c5 100644 --- a/workspace-server/internal/handlers/workspace_restart.go +++ b/workspace-server/internal/handlers/workspace_restart.go @@ -143,18 +143,6 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) { } } - // Stop existing container / terminate existing EC2. Pick the matching - // stop path. CPProvisioner.Stop calls DELETE /cp/workspaces/:id to - // terminate the workspace EC2; the subsequent provision call launches - // a fresh one with the latest secrets + config. - if h.provisioner != nil { - h.provisioner.Stop(ctx, id) - } else if h.cpProv != nil { - if err := h.cpProv.Stop(ctx, id); err != nil { - log.Printf("Restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", id, err) - } - } - // Reset to provisioning db.DB.ExecContext(ctx, `UPDATE workspaces SET status = $1, url = '', updated_at = now() WHERE id = $2`, models.StatusProvisioning, id) @@ -214,11 +202,33 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) { // Dispatch to the correct provisioner. provisionWorkspaceOpts is the // Docker path; provisionWorkspaceCP is the SaaS path. The Create // handler already branches this way; Restart now mirrors it. - if h.cpProv != nil { - go h.provisionWorkspaceCP(id, templatePath, configFiles, payload) - } else { - go h.provisionWorkspaceOpts(id, templatePath, configFiles, payload, resetClaudeSession) - } + // + // Stop runs inside this goroutine — NOT before the response — because + // CPProvisioner.Stop is synchronous DELETE /cp/workspaces/:id → + // CP → AWS EC2 terminate, which can exceed the canvas's 15s default + // HTTP timeout when the platform has just redeployed (every tenant's + // CP request queues at once). Pre-fix the user saw a misleading + // "signal timed out" error on the canvas even though the restart + // actually succeeded — caught 2026-04-30 on hongmingwang hermes + // workspace 32993ee7-…cb9d75d112a5 right after the heartbeat-fix + // platform redeploy. Use context.Background() to detach from the + // request lifecycle so an aborted client connection doesn't cancel + // the in-flight Stop/provision pair. + go func() { + bgCtx := context.Background() + if h.provisioner != nil { + h.provisioner.Stop(bgCtx, id) + } else if h.cpProv != nil { + if err := h.cpProv.Stop(bgCtx, id); err != nil { + log.Printf("Restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", id, err) + } + } + if h.cpProv != nil { + h.provisionWorkspaceCP(id, templatePath, configFiles, payload) + } else { + h.provisionWorkspaceOpts(id, templatePath, configFiles, payload, resetClaudeSession) + } + }() go h.sendRestartContext(id, restartData) c.JSON(http.StatusOK, gin.H{"status": "provisioning", "config_dir": configLabel, "reset_session": resetClaudeSession}) diff --git a/workspace-server/internal/handlers/workspace_restart_async_test.go b/workspace-server/internal/handlers/workspace_restart_async_test.go new file mode 100644 index 00000000..ca237c30 --- /dev/null +++ b/workspace-server/internal/handlers/workspace_restart_async_test.go @@ -0,0 +1,128 @@ +package handlers + +// workspace_restart_async_test.go — behavior-based AST gate that pins +// the invariant introduced 2026-04-30 in fix/restart-async-stop: +// +// The /workspaces/:id/restart handler MUST NOT call provisioner.Stop +// or cpProv.Stop synchronously before returning the HTTP response. +// +// Why: CPProvisioner.Stop is `DELETE /cp/workspaces/:id?instance_id=...` +// → CP → AWS EC2 terminate. Right after a platform-wide redeploy (every +// tenant queues a CP request at once) this can exceed the canvas's 15s +// HTTP timeout, surfacing as a misleading "signal timed out" red banner +// even though the async re-provision goroutine continues and the restart +// actually succeeds. Caught on hongmingwang hermes workspace +// 32993ee7-…cb9d75d112a5 right after the heartbeat-fix redeploy. +// +// The fix moves Stop into the same goroutine as provisionWorkspaceCP / +// provisionWorkspaceOpts so the handler returns ~immediately (only the +// metadata lookup + DB UPDATE remain on the response path). This gate +// catches anyone re-introducing the pre-fix shape — easy to do because +// "stop, then provision" reads as natural ordering at the handler top +// level. +// +// Behavior-based: matches `.{provisioner,cpProv}.Stop(...)` calls +// regardless of variable names; only the AST shape matters. A future +// rename of either field is still pinned, same family as the +// callsProvisionStart gate in workspace_provision_shared_test.go. + +import ( + "go/ast" + "go/parser" + "go/token" + "path/filepath" + "testing" +) + +// TestRestart_StopRunsInsideGoroutine asserts the /restart handler does +// not call provisioner.Stop / cpProv.Stop at the top level of its body. +// Any such call must be nested inside a *ast.FuncLit (the `go func() { +// ... }()` block) so the HTTP response can return before Stop completes. +func TestRestart_StopRunsInsideGoroutine(t *testing.T) { + t.Parallel() + + fset := token.NewFileSet() + f, err := parser.ParseFile(fset, filepath.Join(".", "workspace_restart.go"), nil, 0) + if err != nil { + t.Fatalf("parse workspace_restart.go: %v", err) + } + + var restartFn *ast.FuncDecl + for _, decl := range f.Decls { + fn, ok := decl.(*ast.FuncDecl) + if !ok || fn.Body == nil { + continue + } + if fn.Name.Name == "Restart" && fn.Recv != nil { + restartFn = fn + break + } + } + if restartFn == nil { + t.Fatal("Restart method not found in workspace_restart.go — did the handler get renamed?") + } + + // Walk the function body. For every call to ..Stop, + // record whether it appears at the top level (any ancestor frame is + // the Restart body) or inside a FuncLit (i.e. inside `go func() { ... }`). + type violation struct { + line int + field string + } + var topLevelStops []violation + + var inspect func(node ast.Node, insideFuncLit bool) + inspect = func(node ast.Node, insideFuncLit bool) { + if node == nil { + return + } + switch n := node.(type) { + case *ast.FuncLit: + // Descend into the func literal body, but mark all + // children as inside-FuncLit. This is the goroutine + // boundary we want. + ast.Inspect(n, func(child ast.Node) bool { + if child == n { + return true + } + inspect(child, true) + return false + }) + return + case *ast.CallExpr: + if sel, ok := n.Fun.(*ast.SelectorExpr); ok && sel.Sel.Name == "Stop" { + if inner, ok := sel.X.(*ast.SelectorExpr); ok { + switch inner.Sel.Name { + case "provisioner", "cpProv": + if !insideFuncLit { + topLevelStops = append(topLevelStops, violation{ + line: fset.Position(n.Pos()).Line, + field: inner.Sel.Name, + }) + } + } + } + } + } + // Continue walking children of non-FuncLit nodes. + ast.Inspect(node, func(child ast.Node) bool { + if child == node { + return true + } + inspect(child, insideFuncLit) + return false + }) + } + inspect(restartFn.Body, false) + + for _, v := range topLevelStops { + t.Errorf( + "workspace_restart.go:%d Restart calls h.%s.Stop synchronously at the top level. "+ + "Stop must run inside the `go func() { ... }()` goroutine that wraps "+ + "provisionWorkspaceCP/provisionWorkspaceOpts — otherwise the canvas's 15s "+ + "HTTP timeout fires before the response, surfacing 'signal timed out' even "+ + "when the restart actually succeeds. See fix/restart-async-stop (2026-04-30).", + v.line, v.field, + ) + } +}