diff --git a/workspace-server/internal/handlers/org_import.go b/workspace-server/internal/handlers/org_import.go index 8f4d9a07..639c8ba9 100644 --- a/workspace-server/internal/handlers/org_import.go +++ b/workspace-server/internal/handlers/org_import.go @@ -21,6 +21,7 @@ import ( "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" "github.com/Molecule-AI/molecule-monorepo/platform/internal/models" "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provlog" "github.com/Molecule-AI/molecule-monorepo/platform/internal/scheduler" "github.com/google/uuid" ) @@ -96,6 +97,16 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX } if existing { log.Printf("Org import: %q already exists (id=%s) — skipping create+provision, recursing into children for partial-match", ws.Name, existingID) + parentRef := "" + if parentID != nil { + parentRef = *parentID + } + provlog.Event("provision.skip_existing", map[string]any{ + "name": ws.Name, + "existing_id": existingID, + "parent_id": parentRef, + "tier": tier, + }) *results = append(*results, map[string]interface{}{ "id": existingID, "name": ws.Name, diff --git a/workspace-server/internal/handlers/provlog_emit_test.go b/workspace-server/internal/handlers/provlog_emit_test.go new file mode 100644 index 00000000..6681c203 --- /dev/null +++ b/workspace-server/internal/handlers/provlog_emit_test.go @@ -0,0 +1,112 @@ +package handlers + +// provlog_emit_test.go — pins that the structured-logging emit sites +// added for #2867 PR-D actually fire when their boundary is crossed. +// +// These are call-site contract tests, not provlog package tests (those +// live next to the helper). The assertion is "this dispatcher path +// emits this event name" — if a refactor moves the call out of the +// boundary helper, the gate fails. Fields are NOT pinned here on +// purpose; the field set is convenience for ops, not contract for the +// emit point. Pinning fields would block additive evolution of the +// payload (see also feedback_behavior_based_ast_gates.md). + +import ( + "bytes" + "context" + "log" + "strings" + "sync" + "testing" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/models" +) + +// captureProvLog redirects the global logger to a buffer for the test +// duration. provlog.Event uses log.Printf, so this is the only seam. +// Returned mutex protects against concurrent reads from the goroutine +// fired by provisionWorkspaceAuto (the goroutine never returns in +// these tests because Start() is stubbed, but the buffer can still be +// touched by it racing the assertion). +func captureProvLog(t *testing.T) (read func() string) { + t.Helper() + var buf bytes.Buffer + var mu sync.Mutex + prevWriter := log.Writer() + prevFlags := log.Flags() + log.SetFlags(0) + log.SetOutput(&safeWriter{buf: &buf, mu: &mu}) + t.Cleanup(func() { + log.SetOutput(prevWriter) + log.SetFlags(prevFlags) + }) + return func() string { + mu.Lock() + defer mu.Unlock() + return buf.String() + } +} + +// TestProvisionWorkspaceAutoSync_EmitsProvisionStart — sync variant is +// chosen for the assertion path because it returns once the (stubbed) +// Start() has been called, so we know the emit has flushed. The async +// variant would race a goroutine. +func TestProvisionWorkspaceAutoSync_EmitsProvisionStart(t *testing.T) { + read := captureProvLog(t) + h := &WorkspaceHandler{cpProv: &trackingCPProv{}} + // Best-effort: the body will hit DB code under provisionWorkspaceCP + // — we only need the emit at the entry, which fires unconditionally + // before the dispatch. Recovering from any later panic keeps the + // test focused. + defer func() { _ = recover() }() + h.provisionWorkspaceAutoSync("ws-test-1", "tmpl", nil, models.CreateWorkspacePayload{ + Name: "n", Tier: 4, Runtime: "claude-code", + }) + got := read() + if !strings.Contains(got, "evt: provision.start ") { + t.Fatalf("expected provision.start emit, got log:\n%s", got) + } + if !strings.Contains(got, `"workspace_id":"ws-test-1"`) { + t.Errorf("workspace_id not in payload: %s", got) + } + if !strings.Contains(got, `"sync":true`) { + t.Errorf("sync flag not pinned for sync dispatcher: %s", got) + } +} + +// TestStopForRestart_EmitsRestartPreStop — emit fires before the actual +// Stop call, so the trackingCPProv stub doesn't need to be wired for +// real Stop semantics. Backend label "cp" pinned because that's the +// SaaS path; we don't pin "docker" or "none" branches here (separate +// tests would only re-test the trivial branch label switch). +func TestStopForRestart_EmitsRestartPreStop(t *testing.T) { + read := captureProvLog(t) + h := &WorkspaceHandler{cpProv: &trackingCPProv{}} + defer func() { _ = recover() }() + h.stopForRestart(context.Background(), "ws-restart-1") + got := read() + if !strings.Contains(got, "evt: restart.pre_stop ") { + t.Fatalf("expected restart.pre_stop emit, got log:\n%s", got) + } + if !strings.Contains(got, `"workspace_id":"ws-restart-1"`) { + t.Errorf("workspace_id not in payload: %s", got) + } + if !strings.Contains(got, `"backend":"cp"`) { + t.Errorf("backend label missing or wrong: %s", got) + } +} + +// TestStopForRestart_EmitsBackendNoneWhenUnwired — pin the no-backend +// branch so a future refactor that drops the label switch is caught. +// This is the silent-Stop case (workspace_dispatchers.go:StopWorkspaceAuto +// returns nil for unwired backends); the emit ensures the operator can +// still see the boundary in the log. +func TestStopForRestart_EmitsBackendNoneWhenUnwired(t *testing.T) { + read := captureProvLog(t) + h := &WorkspaceHandler{} // both nil + h.stopForRestart(context.Background(), "ws-restart-2") + got := read() + if !strings.Contains(got, `"backend":"none"`) { + t.Fatalf("expected backend=none for unwired handler: %s", got) + } +} diff --git a/workspace-server/internal/handlers/workspace_dispatchers.go b/workspace-server/internal/handlers/workspace_dispatchers.go index 18ede255..3df25877 100644 --- a/workspace-server/internal/handlers/workspace_dispatchers.go +++ b/workspace-server/internal/handlers/workspace_dispatchers.go @@ -35,6 +35,7 @@ import ( "time" "github.com/Molecule-AI/molecule-monorepo/platform/internal/models" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provlog" ) // HasProvisioner reports whether either backend (CP or local Docker) is @@ -101,6 +102,14 @@ func (h *WorkspaceHandler) DefaultTier() int { // lives in prepareProvisionContext (shared by both per-backend // goroutines). func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool { + provlog.Event("provision.start", map[string]any{ + "workspace_id": workspaceID, + "name": payload.Name, + "tier": payload.Tier, + "runtime": payload.Runtime, + "template": payload.Template, + "sync": false, + }) if h.cpProv != nil { go h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) return true @@ -136,6 +145,14 @@ func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath stri // Keep these two helpers in sync — when one grows a new arm (third // backend, retry semantics), the other should too. func (h *WorkspaceHandler) provisionWorkspaceAutoSync(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool { + provlog.Event("provision.start", map[string]any{ + "workspace_id": workspaceID, + "name": payload.Name, + "tier": payload.Tier, + "runtime": payload.Runtime, + "template": payload.Template, + "sync": true, + }) if h.cpProv != nil { h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) return true diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go index 3b3097c4..c5712be5 100644 --- a/workspace-server/internal/handlers/workspace_restart.go +++ b/workspace-server/internal/handlers/workspace_restart.go @@ -12,6 +12,7 @@ import ( "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" "github.com/Molecule-AI/molecule-monorepo/platform/internal/models" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provlog" "github.com/gin-gonic/gin" ) @@ -431,6 +432,16 @@ func coalesceRestart(workspaceID string, cycle func()) { // NPE'd before reaching the reprovision step — which is why every SaaS dead- // agent incident pre-this-fix required manual restart from canvas. func (h *WorkspaceHandler) stopForRestart(ctx context.Context, workspaceID string) { + backend := "none" + if h.provisioner != nil { + backend = "docker" + } else if h.cpProv != nil { + backend = "cp" + } + provlog.Event("restart.pre_stop", map[string]any{ + "workspace_id": workspaceID, + "backend": backend, + }) if h.provisioner != nil { h.provisioner.Stop(ctx, workspaceID) return diff --git a/workspace-server/internal/provisioner/cp_provisioner.go b/workspace-server/internal/provisioner/cp_provisioner.go index edc67d9f..bdc5bff7 100644 --- a/workspace-server/internal/provisioner/cp_provisioner.go +++ b/workspace-server/internal/provisioner/cp_provisioner.go @@ -14,6 +14,7 @@ import ( "time" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provlog" ) // CPProvisionerAPI is the contract WorkspaceHandler uses to talk to the @@ -214,6 +215,13 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, } log.Printf("CP provisioner: workspace %s → EC2 instance %s (%s)", cfg.WorkspaceID, result.InstanceID, result.State) + provlog.Event("provision.ec2_started", map[string]any{ + "workspace_id": cfg.WorkspaceID, + "instance_id": result.InstanceID, + "state": result.State, + "tier": cfg.Tier, + "runtime": cfg.Runtime, + }) return result.InstanceID, nil } @@ -273,6 +281,10 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { return fmt.Errorf("cp provisioner: stop %s: unexpected %d: %s", workspaceID, resp.StatusCode, strings.TrimSpace(string(body))) } + provlog.Event("provision.ec2_stopped", map[string]any{ + "workspace_id": workspaceID, + "instance_id": instanceID, + }) return nil } diff --git a/workspace-server/internal/provlog/provlog.go b/workspace-server/internal/provlog/provlog.go new file mode 100644 index 00000000..4434c238 --- /dev/null +++ b/workspace-server/internal/provlog/provlog.go @@ -0,0 +1,48 @@ +// Package provlog emits structured, single-line JSON log records for +// provisioning-lifecycle boundaries (workspace create, EC2 start/stop, +// restart, idempotency skips). Records share a stable `evt:` prefix and +// JSON payload so a future grep|jq pipeline (or a Loki/Datadog ingest) +// can reconstruct the per-workspace timeline without parsing the +// human-prose log lines that already exist. +// +// Existing log.Printf lines are intentionally NOT replaced — they +// remain the operator-facing message. Event() emits a paired structured +// record alongside, additive only. +// +// Event taxonomy (extend by appending; never rename): +// +// provision.start — workspace row inserted, EC2 about to launch +// provision.skip_existing — idempotency hit, no new EC2 +// provision.ec2_started — RunInstances returned an instance id +// provision.ec2_stopped — TerminateInstances acknowledged +// restart.pre_stop — Restart handler about to call Stop +// +// Required fields per event are documented at each call site. +package provlog + +import ( + "encoding/json" + "log" +) + +// Event writes a single line of the form: +// +// evt: {"k":"v",...} +// +// to the standard logger. JSON encoding errors are silently swallowed — +// a logging helper must never panic the request path. fields may be +// nil; the empty payload `{}` is still useful to mark an event boundary. +func Event(name string, fields map[string]any) { + if fields == nil { + fields = map[string]any{} + } + payload, err := json.Marshal(fields) + if err != nil { + // Fall back to a static payload so the event boundary still + // appears in the log. The marshal error itself is recorded + // on a best-effort basis. + log.Printf("evt: %s {\"_marshal_err\":%q}", name, err.Error()) + return + } + log.Printf("evt: %s %s", name, payload) +} diff --git a/workspace-server/internal/provlog/provlog_test.go b/workspace-server/internal/provlog/provlog_test.go new file mode 100644 index 00000000..7d2f5f5f --- /dev/null +++ b/workspace-server/internal/provlog/provlog_test.go @@ -0,0 +1,97 @@ +package provlog + +import ( + "bytes" + "encoding/json" + "log" + "strings" + "testing" +) + +// captureLog redirects the default logger to a buffer for the duration +// of fn and returns whatever was written. +func captureLog(t *testing.T, fn func()) string { + t.Helper() + var buf bytes.Buffer + prevWriter := log.Writer() + prevFlags := log.Flags() + log.SetOutput(&buf) + log.SetFlags(0) // strip date/time so assertions stay deterministic + t.Cleanup(func() { + log.SetOutput(prevWriter) + log.SetFlags(prevFlags) + }) + fn() + return buf.String() +} + +func TestEvent_EmitsEvtPrefixAndJSONPayload(t *testing.T) { + out := captureLog(t, func() { + Event("provision.start", map[string]any{ + "workspace_id": "ws-123", + "tier": 4, + "runtime": "claude-code", + }) + }) + out = strings.TrimSpace(out) + if !strings.HasPrefix(out, "evt: provision.start ") { + t.Fatalf("expected evt-prefixed line, got %q", out) + } + jsonPart := strings.TrimPrefix(out, "evt: provision.start ") + var got map[string]any + if err := json.Unmarshal([]byte(jsonPart), &got); err != nil { + t.Fatalf("payload not valid JSON: %v (raw=%q)", err, jsonPart) + } + if got["workspace_id"] != "ws-123" { + t.Errorf("workspace_id field lost: %+v", got) + } + // JSON unmarshal turns numbers into float64 — exact-equal compare. + if got["tier"].(float64) != 4 { + t.Errorf("tier field lost: %+v", got) + } + if got["runtime"] != "claude-code" { + t.Errorf("runtime field lost: %+v", got) + } +} + +func TestEvent_NilFieldsEmitsEmptyObject(t *testing.T) { + out := captureLog(t, func() { + Event("restart.pre_stop", nil) + }) + if !strings.Contains(out, "evt: restart.pre_stop {}") { + t.Fatalf("nil fields should emit empty object, got %q", out) + } +} + +func TestEvent_PreservesEventBoundaryOnUnmarshalableValue(t *testing.T) { + // A channel cannot be marshaled by encoding/json — verify we still + // emit the event boundary with a recorded marshal error. This is + // the structural guarantee: the call site never sees a panic, and + // the event name is always present in the log. + out := captureLog(t, func() { + Event("provision.ec2_started", map[string]any{ + "chan": make(chan int), + }) + }) + if !strings.Contains(out, "evt: provision.ec2_started ") { + t.Fatalf("event boundary missing on marshal error: %q", out) + } + if !strings.Contains(out, "_marshal_err") { + t.Fatalf("expected _marshal_err sentinel, got %q", out) + } +} + +func TestEvent_SingleLineOutput(t *testing.T) { + // Log aggregators line-split on \n. A multi-line emit would silently + // fragment the JSON across two records — pin single-line shape. + out := captureLog(t, func() { + Event("provision.skip_existing", map[string]any{ + "existing_id": "ws-abc", + "name": "child-1", + }) + }) + trimmed := strings.TrimRight(out, "\n") + if strings.Contains(trimmed, "\n") { + t.Fatalf("event line must be single-line, got %q", out) + } +}