fix(bundle): markFailed sets last_sample_error + AST gate

Closes the bug class surfaced by Canvas E2E #2632: a workspace ends up status='failed' with last_sample_error=NULL, and operators (or the E2E poll loop) see the useless "Workspace failed: (no last_sample_error)" with no triage signal. Two pieces: 1. **bundle/importer.go markFailed** — the UPDATE was setting only status, leaving last_sample_error NULL. Same incident class as the silent-drop bugs in PRs #2811 + #2824, different code path. markProvisionFailed in workspace_provision_shared.go has set the message column for a long time; this writer drifted the convention. Fix: include last_sample_error in the SET clause + the broadcast. 2. **AST drift gate** (db/workspace_status_failed_message_drift_test.go) — Go AST walk that finds every db.DB.{Exec,Query,QueryRow}Context call whose argument list binds models.StatusFailed and asserts the SQL literal contains last_sample_error. Catches the next caller that drifts the same convention. Verified to FAIL against the bug shape (reverted importer.go temporarily — gate flagged the exact line) and PASS against the fix. Why an AST gate vs a regex: pre-fix attempt with a regex over UPDATE statements flagged status='online' / status='hibernating' / status= 'removed' UPDATEs as false positives. Walking the AST and only flagging calls that pass the StatusFailed constant eliminates that. Out of scope (filed separately if needed): - The Canvas E2E that surfaced the missing message (#2632) is now a required check on staging via PR #2827. Once this fix lands the next staging push should re-run #2632's failing case and produce a meaningful last_sample_error. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 21:08:08 -07:00 · 2026-05-04 21:08:08 -07:00 · 56149f8a24
commit 56149f8a24
parent 43caac911a
2 changed files with 186 additions and 3 deletions
--- a/workspace-server/internal/bundle/importer.go
+++ b/workspace-server/internal/bundle/importer.go
@ -131,11 +131,19 @@ func buildBundleConfigFiles(b *Bundle) map[string][]byte {
 }

 func markFailed(ctx context.Context, wsID string, broadcaster *events.Broadcaster, err error) {
+	// Set last_sample_error along with status so operators (and the
+	// Canvas E2E + GET /workspaces/:id callers) get a non-null reason
+	// in the row. Pre-2026-05-05 this UPDATE only set status, leaving
+	// last_sample_error NULL — Canvas E2E #2632 surfaced the gap with
+	// `Workspace failed: (no last_sample_error)`. Same UPDATE shape as
+	// markProvisionFailed in workspace-server/internal/handlers/
+	// workspace_provision_shared.go.
+	msg := err.Error()
 	db.DB.ExecContext(ctx,
-		`UPDATE workspaces SET status = $1, updated_at = now() WHERE id = $2`,
-		models.StatusFailed, wsID)
+		`UPDATE workspaces SET status = $1, last_sample_error = $2, updated_at = now() WHERE id = $3`,
+		models.StatusFailed, msg, wsID)
 	broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", wsID, map[string]interface{}{
-		"error": err.Error(),
+		"error": msg,
 	})
 }

--- a/workspace-server/internal/db/workspace_status_failed_message_drift_test.go
+++ b/workspace-server/internal/db/workspace_status_failed_message_drift_test.go
@ -0,0 +1,175 @@
+package db_test
+
+// Static drift gate: every UPDATE that sets status to a "failed" value
+// must also set last_sample_error in the same statement. Otherwise the
+// row ends up with status='failed' + last_sample_error=NULL — operators
+// see "workspace failed" with no reason, and the Canvas E2E reports the
+// useless `Workspace failed: (no last_sample_error)` from #2632.
+//
+// Why a static gate: pre-2026-05-05 we had at least two writers
+// (markProvisionFailed in workspace_provision_shared.go set the
+// message; bundle/importer.go's markFailed didn't). The provision-
+// timeout sweep also sets the message. Code review missed the
+// importer drift for ~6 months until the Canvas E2E surfaced it.
+//
+// Rule:
+//   - If a Go string literal in this repo contains both
+//     `UPDATE workspaces` and a clause setting `status` to a value
+//     resembling "failed" — either via a `$N` placeholder later bound
+//     to StatusFailed, or via an inline `'failed'` literal — that same
+//     literal MUST also contain `last_sample_error`.
+//   - Allowed: an UPDATE that only sets status to a non-failed value
+//     (online, hibernating, removed, etc.). Those don't need the
+//     message column, and clearing it would lose forensic context.
+//
+// Caveats:
+//   - The test reads source as text. Multi-line UPDATEs split across
+//     concatenated string fragments will slip past — that's an
+//     accepted limitation for now; the parameterized-write refactor
+//     (#2799) will let us replace this textual gate with a typed-call
+//     gate eventually.
+//   - "last_sample_error" appearing anywhere in the same literal is
+//     enough to satisfy the rule. We don't try to verify the column
+//     receives a non-empty value at runtime — that's the
+//     parameterized-write refactor's territory too.
+
+import (
+	"fmt"
+	"go/ast"
+	"go/parser"
+	"go/token"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+	"testing"
+)
+
+// TestWorkspaceStatusFailed_MustSetLastSampleError uses Go's AST to find
+// every ExecContext call whose argument list includes the
+// `models.StatusFailed` constant. For each such call, the SQL literal
+// (the second argument) must also contain `last_sample_error`. This
+// catches the bug class without false-positive matches on UPDATEs that
+// set status to a non-failed value (online/hibernating/removed/etc.)
+// because those don't pass StatusFailed as an arg.
+func TestWorkspaceStatusFailed_MustSetLastSampleError(t *testing.T) {
+	root := findRepoRoot(t)
+	violations := []string{}
+
+	walkErr := filepath.Walk(filepath.Join(root, "workspace-server", "internal"), func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if info.IsDir() {
+			return nil
+		}
+		if filepath.Ext(path) != ".go" {
+			return nil
+		}
+		if strings.HasSuffix(path, "_test.go") {
+			return nil
+		}
+		fset := token.NewFileSet()
+		f, err := parser.ParseFile(fset, path, nil, parser.SkipObjectResolution)
+		if err != nil {
+			return err
+		}
+		ast.Inspect(f, func(n ast.Node) bool {
+			call, ok := n.(*ast.CallExpr)
+			if !ok {
+				return true
+			}
+			sel, ok := call.Fun.(*ast.SelectorExpr)
+			if !ok {
+				return true
+			}
+			// Match db.DB.ExecContext / db.DB.QueryContext / db.DB.QueryRowContext
+			// — the three SQL execution surfaces this codebase uses.
+			methodName := sel.Sel.Name
+			if methodName != "ExecContext" && methodName != "QueryContext" && methodName != "QueryRowContext" {
+				return true
+			}
+			// Args: 0=ctx, 1=sql-literal, 2..=bind vars.
+			if len(call.Args) < 3 {
+				return true
+			}
+			passesStatusFailed := false
+			for _, a := range call.Args[2:] {
+				if isStatusFailedRef(a) {
+					passesStatusFailed = true
+					break
+				}
+			}
+			if !passesStatusFailed {
+				return true
+			}
+			// SQL literal — usually `*ast.BasicLit` for a single-line
+			// string or a back-tick string. May also be a const ref.
+			sqlText := extractStringLit(call.Args[1])
+			if sqlText == "" {
+				// SQL is a name reference, not a literal — can't check.
+				return true
+			}
+			if strings.Contains(sqlText, "last_sample_error") {
+				return true
+			}
+			// Skip non-UPDATE statements that happen to pass StatusFailed
+			// (e.g. SELECT … WHERE status = $1). The drift target is
+			// specifically writes that mark the row failed.
+			if !regexp.MustCompile(`(?i)\bUPDATE\s+workspaces\b`).MatchString(sqlText) {
+				return true
+			}
+			rel, _ := filepath.Rel(root, path)
+			pos := fset.Position(call.Pos())
+			snippet := strings.TrimSpace(sqlText)
+			if len(snippet) > 120 {
+				snippet = snippet[:120] + "..."
+			}
+			violations = append(violations,
+				fmt.Sprintf("%s:%d: %s", rel, pos.Line, snippet))
+			return true
+		})
+		return nil
+	})
+	if walkErr != nil {
+		t.Fatalf("walk: %v", walkErr)
+	}
+
+	if len(violations) > 0 {
+		t.Errorf("UPDATE workspaces SET status = ... binds models.StatusFailed but the SQL literal does not write last_sample_error — every code path that marks a workspace failed must also write the reason, or operators see `Workspace failed: (no last_sample_error)` (incident: Canvas E2E #2632). Add `, last_sample_error = $N` to the SET clause.\n\nViolations:\n  - %s",
+			strings.Join(violations, "\n  - "))
+	}
+}
+
+// isStatusFailedRef returns true if expr resolves to models.StatusFailed
+// (selector StatusFailed off the models package). Catches both
+// `models.StatusFailed` directly and `models.StatusFailed.String()`
+// style usages — anything that names the constant.
+func isStatusFailedRef(expr ast.Expr) bool {
+	if sel, ok := expr.(*ast.SelectorExpr); ok {
+		if sel.Sel.Name == "StatusFailed" {
+			return true
+		}
+	}
+	return false
+}
+
+// extractStringLit returns the unquoted contents of a string literal
+// expression, or "" if expr is not a literal we can read statically
+// (e.g. concatenation, function-call argument, named const reference).
+func extractStringLit(expr ast.Expr) string {
+	lit, ok := expr.(*ast.BasicLit)
+	if !ok || lit.Kind != token.STRING {
+		return ""
+	}
+	val := lit.Value
+	if len(val) >= 2 {
+		first, last := val[0], val[len(val)-1]
+		if (first == '`' && last == '`') || (first == '"' && last == '"') {
+			return val[1 : len(val)-1]
+		}
+	}
+	return val
+}
+
+