fix(bundle): markFailed sets last_sample_error + AST gate
Closes the bug class surfaced by Canvas E2E #2632: a workspace ends up status='failed' with last_sample_error=NULL, and operators (or the E2E poll loop) see the useless "Workspace failed: (no last_sample_error)" with no triage signal. Two pieces: 1. **bundle/importer.go markFailed** — the UPDATE was setting only status, leaving last_sample_error NULL. Same incident class as the silent-drop bugs in PRs #2811 + #2824, different code path. markProvisionFailed in workspace_provision_shared.go has set the message column for a long time; this writer drifted the convention. Fix: include last_sample_error in the SET clause + the broadcast. 2. **AST drift gate** (db/workspace_status_failed_message_drift_test.go) — Go AST walk that finds every db.DB.{Exec,Query,QueryRow}Context call whose argument list binds models.StatusFailed and asserts the SQL literal contains last_sample_error. Catches the next caller that drifts the same convention. Verified to FAIL against the bug shape (reverted importer.go temporarily — gate flagged the exact line) and PASS against the fix. Why an AST gate vs a regex: pre-fix attempt with a regex over UPDATE statements flagged status='online' / status='hibernating' / status= 'removed' UPDATEs as false positives. Walking the AST and only flagging calls that pass the StatusFailed constant eliminates that. Out of scope (filed separately if needed): - The Canvas E2E that surfaced the missing message (#2632) is now a required check on staging via PR #2827. Once this fix lands the next staging push should re-run #2632's failing case and produce a meaningful last_sample_error. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
43caac911a
commit
56149f8a24
@ -131,11 +131,19 @@ func buildBundleConfigFiles(b *Bundle) map[string][]byte {
|
||||
}
|
||||
|
||||
func markFailed(ctx context.Context, wsID string, broadcaster *events.Broadcaster, err error) {
|
||||
// Set last_sample_error along with status so operators (and the
|
||||
// Canvas E2E + GET /workspaces/:id callers) get a non-null reason
|
||||
// in the row. Pre-2026-05-05 this UPDATE only set status, leaving
|
||||
// last_sample_error NULL — Canvas E2E #2632 surfaced the gap with
|
||||
// `Workspace failed: (no last_sample_error)`. Same UPDATE shape as
|
||||
// markProvisionFailed in workspace-server/internal/handlers/
|
||||
// workspace_provision_shared.go.
|
||||
msg := err.Error()
|
||||
db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = $1, updated_at = now() WHERE id = $2`,
|
||||
models.StatusFailed, wsID)
|
||||
`UPDATE workspaces SET status = $1, last_sample_error = $2, updated_at = now() WHERE id = $3`,
|
||||
models.StatusFailed, msg, wsID)
|
||||
broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", wsID, map[string]interface{}{
|
||||
"error": err.Error(),
|
||||
"error": msg,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,175 @@
|
||||
package db_test
|
||||
|
||||
// Static drift gate: every UPDATE that sets status to a "failed" value
|
||||
// must also set last_sample_error in the same statement. Otherwise the
|
||||
// row ends up with status='failed' + last_sample_error=NULL — operators
|
||||
// see "workspace failed" with no reason, and the Canvas E2E reports the
|
||||
// useless `Workspace failed: (no last_sample_error)` from #2632.
|
||||
//
|
||||
// Why a static gate: pre-2026-05-05 we had at least two writers
|
||||
// (markProvisionFailed in workspace_provision_shared.go set the
|
||||
// message; bundle/importer.go's markFailed didn't). The provision-
|
||||
// timeout sweep also sets the message. Code review missed the
|
||||
// importer drift for ~6 months until the Canvas E2E surfaced it.
|
||||
//
|
||||
// Rule:
|
||||
// - If a Go string literal in this repo contains both
|
||||
// `UPDATE workspaces` and a clause setting `status` to a value
|
||||
// resembling "failed" — either via a `$N` placeholder later bound
|
||||
// to StatusFailed, or via an inline `'failed'` literal — that same
|
||||
// literal MUST also contain `last_sample_error`.
|
||||
// - Allowed: an UPDATE that only sets status to a non-failed value
|
||||
// (online, hibernating, removed, etc.). Those don't need the
|
||||
// message column, and clearing it would lose forensic context.
|
||||
//
|
||||
// Caveats:
|
||||
// - The test reads source as text. Multi-line UPDATEs split across
|
||||
// concatenated string fragments will slip past — that's an
|
||||
// accepted limitation for now; the parameterized-write refactor
|
||||
// (#2799) will let us replace this textual gate with a typed-call
|
||||
// gate eventually.
|
||||
// - "last_sample_error" appearing anywhere in the same literal is
|
||||
// enough to satisfy the rule. We don't try to verify the column
|
||||
// receives a non-empty value at runtime — that's the
|
||||
// parameterized-write refactor's territory too.
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"go/ast"
|
||||
"go/parser"
|
||||
"go/token"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestWorkspaceStatusFailed_MustSetLastSampleError uses Go's AST to find
|
||||
// every ExecContext call whose argument list includes the
|
||||
// `models.StatusFailed` constant. For each such call, the SQL literal
|
||||
// (the second argument) must also contain `last_sample_error`. This
|
||||
// catches the bug class without false-positive matches on UPDATEs that
|
||||
// set status to a non-failed value (online/hibernating/removed/etc.)
|
||||
// because those don't pass StatusFailed as an arg.
|
||||
func TestWorkspaceStatusFailed_MustSetLastSampleError(t *testing.T) {
|
||||
root := findRepoRoot(t)
|
||||
violations := []string{}
|
||||
|
||||
walkErr := filepath.Walk(filepath.Join(root, "workspace-server", "internal"), func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.IsDir() {
|
||||
return nil
|
||||
}
|
||||
if filepath.Ext(path) != ".go" {
|
||||
return nil
|
||||
}
|
||||
if strings.HasSuffix(path, "_test.go") {
|
||||
return nil
|
||||
}
|
||||
fset := token.NewFileSet()
|
||||
f, err := parser.ParseFile(fset, path, nil, parser.SkipObjectResolution)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
ast.Inspect(f, func(n ast.Node) bool {
|
||||
call, ok := n.(*ast.CallExpr)
|
||||
if !ok {
|
||||
return true
|
||||
}
|
||||
sel, ok := call.Fun.(*ast.SelectorExpr)
|
||||
if !ok {
|
||||
return true
|
||||
}
|
||||
// Match db.DB.ExecContext / db.DB.QueryContext / db.DB.QueryRowContext
|
||||
// — the three SQL execution surfaces this codebase uses.
|
||||
methodName := sel.Sel.Name
|
||||
if methodName != "ExecContext" && methodName != "QueryContext" && methodName != "QueryRowContext" {
|
||||
return true
|
||||
}
|
||||
// Args: 0=ctx, 1=sql-literal, 2..=bind vars.
|
||||
if len(call.Args) < 3 {
|
||||
return true
|
||||
}
|
||||
passesStatusFailed := false
|
||||
for _, a := range call.Args[2:] {
|
||||
if isStatusFailedRef(a) {
|
||||
passesStatusFailed = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !passesStatusFailed {
|
||||
return true
|
||||
}
|
||||
// SQL literal — usually `*ast.BasicLit` for a single-line
|
||||
// string or a back-tick string. May also be a const ref.
|
||||
sqlText := extractStringLit(call.Args[1])
|
||||
if sqlText == "" {
|
||||
// SQL is a name reference, not a literal — can't check.
|
||||
return true
|
||||
}
|
||||
if strings.Contains(sqlText, "last_sample_error") {
|
||||
return true
|
||||
}
|
||||
// Skip non-UPDATE statements that happen to pass StatusFailed
|
||||
// (e.g. SELECT … WHERE status = $1). The drift target is
|
||||
// specifically writes that mark the row failed.
|
||||
if !regexp.MustCompile(`(?i)\bUPDATE\s+workspaces\b`).MatchString(sqlText) {
|
||||
return true
|
||||
}
|
||||
rel, _ := filepath.Rel(root, path)
|
||||
pos := fset.Position(call.Pos())
|
||||
snippet := strings.TrimSpace(sqlText)
|
||||
if len(snippet) > 120 {
|
||||
snippet = snippet[:120] + "..."
|
||||
}
|
||||
violations = append(violations,
|
||||
fmt.Sprintf("%s:%d: %s", rel, pos.Line, snippet))
|
||||
return true
|
||||
})
|
||||
return nil
|
||||
})
|
||||
if walkErr != nil {
|
||||
t.Fatalf("walk: %v", walkErr)
|
||||
}
|
||||
|
||||
if len(violations) > 0 {
|
||||
t.Errorf("UPDATE workspaces SET status = ... binds models.StatusFailed but the SQL literal does not write last_sample_error — every code path that marks a workspace failed must also write the reason, or operators see `Workspace failed: (no last_sample_error)` (incident: Canvas E2E #2632). Add `, last_sample_error = $N` to the SET clause.\n\nViolations:\n - %s",
|
||||
strings.Join(violations, "\n - "))
|
||||
}
|
||||
}
|
||||
|
||||
// isStatusFailedRef returns true if expr resolves to models.StatusFailed
|
||||
// (selector StatusFailed off the models package). Catches both
|
||||
// `models.StatusFailed` directly and `models.StatusFailed.String()`
|
||||
// style usages — anything that names the constant.
|
||||
func isStatusFailedRef(expr ast.Expr) bool {
|
||||
if sel, ok := expr.(*ast.SelectorExpr); ok {
|
||||
if sel.Sel.Name == "StatusFailed" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// extractStringLit returns the unquoted contents of a string literal
|
||||
// expression, or "" if expr is not a literal we can read statically
|
||||
// (e.g. concatenation, function-call argument, named const reference).
|
||||
func extractStringLit(expr ast.Expr) string {
|
||||
lit, ok := expr.(*ast.BasicLit)
|
||||
if !ok || lit.Kind != token.STRING {
|
||||
return ""
|
||||
}
|
||||
val := lit.Value
|
||||
if len(val) >= 2 {
|
||||
first, last := val[0], val[len(val)-1]
|
||||
if (first == '`' && last == '`') || (first == '"' && last == '"') {
|
||||
return val[1 : len(val)-1]
|
||||
}
|
||||
}
|
||||
return val
|
||||
}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user