diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index cdf60d90..a4c37c37 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -6,7 +6,9 @@ import ( "log" "os" "path/filepath" + "runtime/debug" "strings" + "time" "github.com/Molecule-AI/molecule-monorepo/platform/internal/crypto" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" @@ -15,6 +17,37 @@ import ( "github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth" ) +// logProvisionPanic is the deferred recover at the top of every provision +// goroutine. Without it, a panic inside provisionWorkspaceOpts / +// provisionWorkspaceCP propagates up the goroutine stack and crashes the +// whole workspace-server process — taking every other tenant workspace +// down with it. With it, the panic is logged with a stack trace, the +// workspace gets persistently marked failed (so the canvas surfaces +// something instead of leaving it stuck in provisioning until the 10-min +// sweeper fires), and the rest of the process keeps serving. +// +// Issue #2486 added this after the symmetric class — silent goroutine +// exit, no log, no failure mark — was observed in prod. Even if the +// root cause turns out not to be a panic, surfacing the panic class +// closes one branch of "what could have happened" cleanly. +func logProvisionPanic(workspaceID, mode string) { + r := recover() + if r == nil { + return + } + log.Printf("Provisioner: PANIC during provision goroutine for %s (mode=%s): %v\nstack:\n%s", + workspaceID, mode, r, debug.Stack()) + // Best-effort mark-failed via a fresh context — the provision goroutine's + // ctx may have been the one panicking. 10s is enough for a single UPDATE. + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if _, err := db.DB.ExecContext(ctx, + `UPDATE workspaces SET status='failed', last_sample_error=$2, updated_at=now() WHERE id=$1`, + workspaceID, fmt.Sprintf("provision panic: %v", r)); err != nil { + log.Printf("Provisioner: failed to persist panic-failure for %s: %v", workspaceID, err) + } +} + // provisionWorkspace handles async container deployment with timeout. func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) { h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, false) @@ -25,6 +58,14 @@ func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string, // that should NOT be persisted on CreateWorkspacePayload because they're // request-scoped flags. func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload, resetClaudeSession bool) { + // Entry log — distinguishes "goroutine never started" from "started but + // exited via an unlogged path" when debugging stuck-in-provisioning + // rows. Issue #2486: 7 claude-code workspaces stuck in provisioning had + // neither a prepare-failed nor start-failed nor success log line, so an + // operator couldn't tell whether the goroutine ran at all. + log.Printf("Provisioner: goroutine entered for %s (runtime=%s, mode=docker)", workspaceID, payload.Runtime) + defer logProvisionPanic(workspaceID, "docker") + ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout) defer cancel() @@ -640,6 +681,14 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s // share so the next mint added can't be silently forgotten on one // side. func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) { + // Entry log + panic recovery — see provisionWorkspaceOpts for rationale. + // Issue #2486: 7 claude-code workspaces stuck in provisioning produced + // none of the four documented exit-path log lines, leaving operators + // unable to distinguish "goroutine never started" from "started but + // returned via an unlogged path." + log.Printf("CPProvisioner: goroutine entered for %s (runtime=%s, mode=cp)", workspaceID, payload.Runtime) + defer logProvisionPanic(workspaceID, "cp") + ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout) defer cancel() diff --git a/workspace-server/internal/handlers/workspace_provision_panic_test.go b/workspace-server/internal/handlers/workspace_provision_panic_test.go new file mode 100644 index 00000000..0a287fb3 --- /dev/null +++ b/workspace-server/internal/handlers/workspace_provision_panic_test.go @@ -0,0 +1,129 @@ +package handlers + +import ( + "bytes" + "database/sql" + "log" + "strings" + "testing" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" +) + +// Pin the issue #2486 contract: a panic inside the provision goroutine must +// (1) not propagate (the deferred recover swallows it), (2) log the panic +// with a stack trace so an operator can see what blew up, and (3) mark the +// workspace `failed` so the canvas surfaces the failure instead of the row +// sitting in `provisioning` until the 10-min sweeper. + +func TestLogProvisionPanic_NoOpWhenNoPanic(t *testing.T) { + // Sanity: the deferred recover must be silent when nothing panicked. + // Otherwise every successful provision would emit a spurious panic log. + var buf bytes.Buffer + log.SetOutput(&buf) + defer log.SetOutput(log.Writer()) + + func() { + defer logProvisionPanic("ws-no-panic", "cp") + // no panic + }() + + if buf.Len() != 0 { + t.Fatalf("expected no log output when no panic, got: %q", buf.String()) + } +} + +func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) { + // Wire a sqlmock so logProvisionPanic's UPDATE has somewhere to land + // without needing a real Postgres. The mock asserts the SQL shape + + // args so a future refactor of the persist call doesn't silently + // stop marking the row failed. + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer mockDB.Close() + + prevDB := db.DB + db.DB = mockDB + defer func() { db.DB = prevDB }() + + mock.ExpectExec(`UPDATE workspaces SET status='failed'`). + WithArgs("ws-panic", sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + + var buf bytes.Buffer + log.SetOutput(&buf) + defer log.SetOutput(log.Writer()) + + // Exercise: a function that defers logProvisionPanic + then panics. + // The recover MUST swallow the panic — if it propagates, the test + // process crashes and the panic message bubbles up as a Go test + // failure rather than the assertion below. + didNotPanic := true + func() { + defer func() { + // If logProvisionPanic re-raised, this catches it for the + // test. We assert below that it did NOT re-raise. + if r := recover(); r != nil { + didNotPanic = false + } + }() + defer logProvisionPanic("ws-panic", "cp") + panic("simulated provision panic for #2486 regression") + }() + + if !didNotPanic { + t.Fatal("logProvisionPanic re-raised the panic — the recover() arm did not swallow it") + } + + logged := buf.String() + if !strings.Contains(logged, "PANIC during provision goroutine for ws-panic") { + t.Errorf("missing panic-class log line; got: %q", logged) + } + if !strings.Contains(logged, "simulated provision panic for #2486 regression") { + t.Errorf("panic value not logged; got: %q", logged) + } + if !strings.Contains(logged, "stack:") { + t.Errorf("missing stack trace marker; got: %q", logged) + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("sql expectations: %v — UPDATE workspaces … status='failed' was not issued", err) + } +} + +func TestLogProvisionPanic_PersistFailureLogged(t *testing.T) { + // Defense-in-depth: if the panic-mark UPDATE itself fails, log it + // rather than swallow silently. Otherwise an operator sees the + // panic-class log line but no persistent-failure row, leaving the + // workspace in `provisioning` with a misleading "we recovered" log. + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer mockDB.Close() + + prevDB := db.DB + db.DB = mockDB + defer func() { db.DB = prevDB }() + + mock.ExpectExec(`UPDATE workspaces SET status='failed'`). + WithArgs("ws-panic-persist-fail", sqlmock.AnyArg()). + WillReturnError(sql.ErrConnDone) + + var buf bytes.Buffer + log.SetOutput(&buf) + defer log.SetOutput(log.Writer()) + + func() { + defer logProvisionPanic("ws-panic-persist-fail", "docker") + panic("simulated panic with DB unavailable") + }() + + logged := buf.String() + if !strings.Contains(logged, "failed to persist panic-failure for ws-panic-persist-fail") { + t.Errorf("expected persist-failure log line; got: %q", logged) + } +}