forked from molecule-ai/molecule-core
fix(provision): entry log + panic recovery on workspace provision goroutines
Issue #2486: 7 claude-code workspaces stuck in provisioning produced NONE of the four documented exit-path log lines in provisionWorkspaceCP — neither prepare-failed, nor start-failed, nor persist-instance-id-failed, nor success. Operators couldn't tell whether the goroutine ran at all. Add an entry log at the top of provisionWorkspaceOpts + provisionWorkspaceCP so a missing entry distinguishes "goroutine never started" from "started but exited via an unlogged path." Add logProvisionPanic at the same defer site so a panic inside either provisioner doesn't (a) crash the whole workspace-server process, taking every other tenant workspace with it, and (b) silently leave the row in `provisioning` until the 10-min sweeper fires. The recover persists status='failed' with a sanitized panic-class message via a fresh 10s context (the goroutine's own ctx may have been the one panicking). Tests pin three contracts: - no-op when no panic (otherwise every successful provision emits a spurious log line) - recovers + persists failed status on panic, with stack trace - defense-in-depth: if the persist itself fails, log it instead of leaving the operator with a recovered-panic log but no row Regression-injected by neutering the recover() body — all three tests fail until the recover + UPDATE path is restored. This is observability + resilience only, not a root-cause fix for #2486. The actual silent-drop class still needs reproduction once the tenant is on a build that includes this entry log. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
88b156a3bc
commit
46daae1ffb
@ -6,7 +6,9 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime/debug"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/crypto"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
@ -15,6 +17,37 @@ import (
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
|
||||
)
|
||||
|
||||
// logProvisionPanic is the deferred recover at the top of every provision
|
||||
// goroutine. Without it, a panic inside provisionWorkspaceOpts /
|
||||
// provisionWorkspaceCP propagates up the goroutine stack and crashes the
|
||||
// whole workspace-server process — taking every other tenant workspace
|
||||
// down with it. With it, the panic is logged with a stack trace, the
|
||||
// workspace gets persistently marked failed (so the canvas surfaces
|
||||
// something instead of leaving it stuck in provisioning until the 10-min
|
||||
// sweeper fires), and the rest of the process keeps serving.
|
||||
//
|
||||
// Issue #2486 added this after the symmetric class — silent goroutine
|
||||
// exit, no log, no failure mark — was observed in prod. Even if the
|
||||
// root cause turns out not to be a panic, surfacing the panic class
|
||||
// closes one branch of "what could have happened" cleanly.
|
||||
func logProvisionPanic(workspaceID, mode string) {
|
||||
r := recover()
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
log.Printf("Provisioner: PANIC during provision goroutine for %s (mode=%s): %v\nstack:\n%s",
|
||||
workspaceID, mode, r, debug.Stack())
|
||||
// Best-effort mark-failed via a fresh context — the provision goroutine's
|
||||
// ctx may have been the one panicking. 10s is enough for a single UPDATE.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
if _, err := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status='failed', last_sample_error=$2, updated_at=now() WHERE id=$1`,
|
||||
workspaceID, fmt.Sprintf("provision panic: %v", r)); err != nil {
|
||||
log.Printf("Provisioner: failed to persist panic-failure for %s: %v", workspaceID, err)
|
||||
}
|
||||
}
|
||||
|
||||
// provisionWorkspace handles async container deployment with timeout.
|
||||
func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) {
|
||||
h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, false)
|
||||
@ -25,6 +58,14 @@ func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string,
|
||||
// that should NOT be persisted on CreateWorkspacePayload because they're
|
||||
// request-scoped flags.
|
||||
func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload, resetClaudeSession bool) {
|
||||
// Entry log — distinguishes "goroutine never started" from "started but
|
||||
// exited via an unlogged path" when debugging stuck-in-provisioning
|
||||
// rows. Issue #2486: 7 claude-code workspaces stuck in provisioning had
|
||||
// neither a prepare-failed nor start-failed nor success log line, so an
|
||||
// operator couldn't tell whether the goroutine ran at all.
|
||||
log.Printf("Provisioner: goroutine entered for %s (runtime=%s, mode=docker)", workspaceID, payload.Runtime)
|
||||
defer logProvisionPanic(workspaceID, "docker")
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
|
||||
defer cancel()
|
||||
|
||||
@ -640,6 +681,14 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
|
||||
// share so the next mint added can't be silently forgotten on one
|
||||
// side.
|
||||
func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) {
|
||||
// Entry log + panic recovery — see provisionWorkspaceOpts for rationale.
|
||||
// Issue #2486: 7 claude-code workspaces stuck in provisioning produced
|
||||
// none of the four documented exit-path log lines, leaving operators
|
||||
// unable to distinguish "goroutine never started" from "started but
|
||||
// returned via an unlogged path."
|
||||
log.Printf("CPProvisioner: goroutine entered for %s (runtime=%s, mode=cp)", workspaceID, payload.Runtime)
|
||||
defer logProvisionPanic(workspaceID, "cp")
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
|
||||
defer cancel()
|
||||
|
||||
|
||||
@ -0,0 +1,129 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"database/sql"
|
||||
"log"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
)
|
||||
|
||||
// Pin the issue #2486 contract: a panic inside the provision goroutine must
|
||||
// (1) not propagate (the deferred recover swallows it), (2) log the panic
|
||||
// with a stack trace so an operator can see what blew up, and (3) mark the
|
||||
// workspace `failed` so the canvas surfaces the failure instead of the row
|
||||
// sitting in `provisioning` until the 10-min sweeper.
|
||||
|
||||
func TestLogProvisionPanic_NoOpWhenNoPanic(t *testing.T) {
|
||||
// Sanity: the deferred recover must be silent when nothing panicked.
|
||||
// Otherwise every successful provision would emit a spurious panic log.
|
||||
var buf bytes.Buffer
|
||||
log.SetOutput(&buf)
|
||||
defer log.SetOutput(log.Writer())
|
||||
|
||||
func() {
|
||||
defer logProvisionPanic("ws-no-panic", "cp")
|
||||
// no panic
|
||||
}()
|
||||
|
||||
if buf.Len() != 0 {
|
||||
t.Fatalf("expected no log output when no panic, got: %q", buf.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) {
|
||||
// Wire a sqlmock so logProvisionPanic's UPDATE has somewhere to land
|
||||
// without needing a real Postgres. The mock asserts the SQL shape +
|
||||
// args so a future refactor of the persist call doesn't silently
|
||||
// stop marking the row failed.
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatalf("sqlmock.New: %v", err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
|
||||
prevDB := db.DB
|
||||
db.DB = mockDB
|
||||
defer func() { db.DB = prevDB }()
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces SET status='failed'`).
|
||||
WithArgs("ws-panic", sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
var buf bytes.Buffer
|
||||
log.SetOutput(&buf)
|
||||
defer log.SetOutput(log.Writer())
|
||||
|
||||
// Exercise: a function that defers logProvisionPanic + then panics.
|
||||
// The recover MUST swallow the panic — if it propagates, the test
|
||||
// process crashes and the panic message bubbles up as a Go test
|
||||
// failure rather than the assertion below.
|
||||
didNotPanic := true
|
||||
func() {
|
||||
defer func() {
|
||||
// If logProvisionPanic re-raised, this catches it for the
|
||||
// test. We assert below that it did NOT re-raise.
|
||||
if r := recover(); r != nil {
|
||||
didNotPanic = false
|
||||
}
|
||||
}()
|
||||
defer logProvisionPanic("ws-panic", "cp")
|
||||
panic("simulated provision panic for #2486 regression")
|
||||
}()
|
||||
|
||||
if !didNotPanic {
|
||||
t.Fatal("logProvisionPanic re-raised the panic — the recover() arm did not swallow it")
|
||||
}
|
||||
|
||||
logged := buf.String()
|
||||
if !strings.Contains(logged, "PANIC during provision goroutine for ws-panic") {
|
||||
t.Errorf("missing panic-class log line; got: %q", logged)
|
||||
}
|
||||
if !strings.Contains(logged, "simulated provision panic for #2486 regression") {
|
||||
t.Errorf("panic value not logged; got: %q", logged)
|
||||
}
|
||||
if !strings.Contains(logged, "stack:") {
|
||||
t.Errorf("missing stack trace marker; got: %q", logged)
|
||||
}
|
||||
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sql expectations: %v — UPDATE workspaces … status='failed' was not issued", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLogProvisionPanic_PersistFailureLogged(t *testing.T) {
|
||||
// Defense-in-depth: if the panic-mark UPDATE itself fails, log it
|
||||
// rather than swallow silently. Otherwise an operator sees the
|
||||
// panic-class log line but no persistent-failure row, leaving the
|
||||
// workspace in `provisioning` with a misleading "we recovered" log.
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatalf("sqlmock.New: %v", err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
|
||||
prevDB := db.DB
|
||||
db.DB = mockDB
|
||||
defer func() { db.DB = prevDB }()
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces SET status='failed'`).
|
||||
WithArgs("ws-panic-persist-fail", sqlmock.AnyArg()).
|
||||
WillReturnError(sql.ErrConnDone)
|
||||
|
||||
var buf bytes.Buffer
|
||||
log.SetOutput(&buf)
|
||||
defer log.SetOutput(log.Writer())
|
||||
|
||||
func() {
|
||||
defer logProvisionPanic("ws-panic-persist-fail", "docker")
|
||||
panic("simulated panic with DB unavailable")
|
||||
}()
|
||||
|
||||
logged := buf.String()
|
||||
if !strings.Contains(logged, "failed to persist panic-failure for ws-panic-persist-fail") {
|
||||
t.Errorf("expected persist-failure log line; got: %q", logged)
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user