From e0f9434eafca4bba8cc520223ea13da06da6904d Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 19:58:49 -0700 Subject: [PATCH 01/28] fix(files-eic): silence ssh known-hosts warning that 500'd Hermes config load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GET /workspaces/:id/files/config.yaml on hongming.moleculesai.app's Hermes workspace returned 500 with body: ssh cat: exit status 1 (Warning: Permanently added '[127.0.0.1]:37951' (ED25519) to the list of known hosts.) Root cause: ssh emits the "Permanently added" notice on every fresh tunnel connection, even with UserKnownHostsFile=/dev/null (that prevents persistence, not the warning). It lands on stderr, fooling readFileViaEIC's classifier: if len(out) == 0 && stderr.Len() == 0 { return nil, os.ErrNotExist } return nil, fmt.Errorf("ssh cat: %w (%s)", runErr, ...) stderr was non-empty (the warning), so we returned the wrapped error → 500 from the HTTP layer instead of 404. Fix: add `-o LogLevel=ERROR` to BOTH writeFileViaEIC and readFileViaEIC ssh invocations. Silences info+warning while keeping real auth/tunnel errors visible (those emit at ERROR level). Test: TestSSHArgs_LogLevelErrorBothSites pins the flag in both blocks. Mutation-tested: stripping the flag from one site fails the gate. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/template_files_eic.go | 20 ++++++++++++ .../handlers/template_files_eic_test.go | 32 +++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/workspace-server/internal/handlers/template_files_eic.go b/workspace-server/internal/handlers/template_files_eic.go index da8a7d14..edce34fc 100644 --- a/workspace-server/internal/handlers/template_files_eic.go +++ b/workspace-server/internal/handlers/template_files_eic.go @@ -178,6 +178,16 @@ func writeFileViaEIC(ctx context.Context, instanceID, runtime, relPath string, c "-i", keyPath, "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null", + // LogLevel=ERROR silences the benign "Warning: Permanently + // added '[127.0.0.1]:NNNNN' to known hosts" notice that ssh + // emits on every fresh tunnel connection. Without this, the + // notice lands on stderr and fools readFileViaEIC's "empty + // stdout + empty stderr → file not found" classifier into + // thinking the warning is a real ssh-layer error → 500 + // instead of 404 (Hermes config.yaml load, hongming tenant, + // 2026-05-05 02:38). Real auth/tunnel errors stay visible + // because they're emitted at ERROR level. + "-o", "LogLevel=ERROR", "-o", "ServerAliveInterval=15", "-p", fmt.Sprintf("%d", localPort), fmt.Sprintf("%s@127.0.0.1", osUser), @@ -292,6 +302,16 @@ func readFileViaEIC(ctx context.Context, instanceID, runtime, relPath string) ([ "-i", keyPath, "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null", + // LogLevel=ERROR silences the benign "Warning: Permanently + // added '[127.0.0.1]:NNNNN' to known hosts" notice that ssh + // emits on every fresh tunnel connection. Without this, the + // notice lands on stderr and fools readFileViaEIC's "empty + // stdout + empty stderr → file not found" classifier into + // thinking the warning is a real ssh-layer error → 500 + // instead of 404 (Hermes config.yaml load, hongming tenant, + // 2026-05-05 02:38). Real auth/tunnel errors stay visible + // because they're emitted at ERROR level. + "-o", "LogLevel=ERROR", "-o", "ServerAliveInterval=15", "-p", fmt.Sprintf("%d", localPort), fmt.Sprintf("%s@127.0.0.1", osUser), diff --git a/workspace-server/internal/handlers/template_files_eic_test.go b/workspace-server/internal/handlers/template_files_eic_test.go index 30bd9988..e5bc8a48 100644 --- a/workspace-server/internal/handlers/template_files_eic_test.go +++ b/workspace-server/internal/handlers/template_files_eic_test.go @@ -1,6 +1,8 @@ package handlers import ( + "os" + "regexp" "strings" "testing" ) @@ -66,6 +68,36 @@ func TestResolveWorkspaceFilePath_RejectsTraversal(t *testing.T) { } } +// TestSSHArgs_LogLevelErrorBothSites pins that BOTH ssh invocations +// (writeFileViaEIC + readFileViaEIC) include `-o LogLevel=ERROR`. +// +// Without that flag, ssh emits a "Warning: Permanently added +// '[127.0.0.1]:NNNNN' (ED25519) to the list of known hosts." line on +// every fresh tunnel connection (even with UserKnownHostsFile=/dev/null +// — that prevents persistence, not the warning). The warning lands on +// stderr, which fools readFileViaEIC's "empty stdout + empty stderr → +// file not found" classifier into thinking the warning is a real +// ssh-layer error and returning 500 instead of 404. +// +// Caught 2026-05-05 02:38 on hongming.moleculesai.app: opening Hermes +// workspace's Config tab returned 500 with body +// `ssh cat: exit status 1 (Warning: Permanently added '[127.0.0.1]:37951'…)`. +// +// LogLevel=ERROR silences info+warning while keeping real auth/tunnel +// errors visible. This test reads the source and asserts the flag +// appears at least twice (one per ssh block) — fires if a future edit +// removes it from either site. +func TestSSHArgs_LogLevelErrorBothSites(t *testing.T) { + src, err := os.ReadFile("template_files_eic.go") + if err != nil { + t.Fatalf("read source: %v", err) + } + matches := regexp.MustCompile(`"-o", "LogLevel=ERROR"`).FindAllIndex(src, -1) + if len(matches) < 2 { + t.Errorf("expected LogLevel=ERROR in BOTH ssh blocks (write + read); found %d occurrences", len(matches)) + } +} + // TestShellQuote — the sole piece of variable data in the remote ssh // command is the absolute path. It's already built from a map + Clean() // so traversal is impossible, but we still single-quote as defence-in- From c0bfd19b9e3eebd36ec021548c9cc093e1b13fe9 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 19:58:56 -0700 Subject: [PATCH 02/28] feat(external-templates): codex tab uses plain pip install for bridge daemon `codex-channel-molecule` 0.1.0 is now on PyPI, so operators no longer need the `git+https://...` URL workaround. Verified: `pip install codex-channel-molecule` from a clean venv installs the wheel and the `codex-channel-molecule --help` console script runs. PyPI: https://pypi.org/project/codex-channel-molecule/0.1.0/ Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace-server/internal/handlers/external_connection.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workspace-server/internal/handlers/external_connection.go b/workspace-server/internal/handlers/external_connection.go index 249e74a7..a1b94d4c 100644 --- a/workspace-server/internal/handlers/external_connection.go +++ b/workspace-server/internal/handlers/external_connection.go @@ -314,7 +314,7 @@ const externalCodexTemplate = `# Codex external setup — outbound tools (MCP) + # 1. Install codex CLI, the workspace runtime, and the bridge daemon: npm install -g @openai/codex@^0.57 pip install molecule-ai-workspace-runtime -pip install 'git+https://github.com/Molecule-AI/codex-channel-molecule.git' +pip install codex-channel-molecule # 2. Wire the molecule MCP server into codex's config.toml — this is # the OUTBOUND path (codex calls list_peers / delegate_task / From 11c9ed2a46d26f2310260859ba68a64816a2acdc Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 20:00:23 -0700 Subject: [PATCH 03/28] =?UTF-8?q?fix(provision):=20StopWorkspaceAuto=20mir?= =?UTF-8?q?ror=20=E2=80=94=20close=20SaaS=20EC2-leak=20class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #2813 (team-collapse) and #2814 (workspace delete). Two leaks, one class. Both call sites had the same shape pre-fix: if h.provisioner != nil { h.provisioner.Stop(ctx, wsID) } On SaaS where h.provisioner (Docker) is nil and h.cpProv is set, that gate evaluates false and the EC2 keeps running. Workspace gets marked removed in DB; EC2 lives on until the orphan sweeper catches it. Same drift class as PR #2811's org-import provision bug — a Docker- only check on what should be a both-backend operation. Confirmed in production: PR #2811's verification step deleted a test workspace and the EC2 stayed running until I terminated it manually. Fix: WorkspaceHandler.StopWorkspaceAuto(ctx, wsID) — symmetric mirror of provisionWorkspaceAuto. CP first, Docker second, no-op when neither is wired (a workspace nobody is running can't be stopped — that's a no-op, not a failure, distinct from provision's mark-failed contract). Three call-site changes: - team.go:208 (Collapse) → h.wh.StopWorkspaceAuto(ctx, childID) - workspace_crud.go:432 (stopAndRemove) → h.StopWorkspaceAuto(...); RemoveVolume stays Docker-only behind an explicit gate since CP-managed workspaces have no host-bind volumes - TeamHandler.provisioner field + NewTeamHandler's *Provisioner param removed as dead code (Stop was the only call site) Volume cleanup separation is intentional: the abstraction is "stop the running workload," not "tear down all state." Callers that need volume cleanup keep their `if h.provisioner != nil { RemoveVolume }` gate AFTER the Stop call. Tests: - TestStopWorkspaceAuto_RoutesToCPWhenSet — SaaS path - TestStopWorkspaceAuto_RoutesToDockerWhenOnlyDocker — self-hosted - TestStopWorkspaceAuto_NoBackendIsNoOp — pins the contract distinction from provisionWorkspaceAuto's mark-failed - TestNoCallSiteCallsBareStop — source-level pin against `.provisioner.Stop(` / `.cpProv.Stop(` outside the dispatcher, per-backend bodies, restart helper, and the Docker-daemon-direct short-lived-container path. Strips Go comments before substring match so archaeology in code comments doesn't trip the gate. - Verified: pin FAILS against the buggy shape (workspace_crud.go reversion); team.go reversion compile-fails because the field is gone — even stronger than the test. Out of scope (tracked under #2799): - workspace_restart.go's manual if-cpProv-else dispatch with retry semantics tuned for the restart hot path. Functionally equivalent + wraps cpStopWithRetry, so it's not the bug class this PR closes. Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace-server/internal/handlers/team.go | 29 ++- .../internal/handlers/team_test.go | 12 +- .../internal/handlers/workspace.go | 37 ++++ .../internal/handlers/workspace_crud.go | 37 ++-- .../handlers/workspace_provision_auto_test.go | 208 +++++++++++++++++- workspace-server/internal/router/router.go | 2 +- 6 files changed, 288 insertions(+), 37 deletions(-) diff --git a/workspace-server/internal/handlers/team.go b/workspace-server/internal/handlers/team.go index c4a481f9..0ac5aac4 100644 --- a/workspace-server/internal/handlers/team.go +++ b/workspace-server/internal/handlers/team.go @@ -11,7 +11,6 @@ import ( "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" "github.com/Molecule-AI/molecule-monorepo/platform/internal/events" "github.com/Molecule-AI/molecule-monorepo/platform/internal/models" - "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" "github.com/gin-gonic/gin" "github.com/google/uuid" "gopkg.in/yaml.v3" @@ -25,28 +24,21 @@ import ( // NULL auth_token — same drift class as the SaaS bug fixed in #2366. type TeamHandler struct { broadcaster *events.Broadcaster - // provisioner is interface-typed (#2369) for the same reason as - // WorkspaceHandler.provisioner — Stop is the only call site here - // and it's on the LocalProvisionerAPI surface, so widening is free - // and symmetric with WorkspaceHandler. - provisioner provisioner.LocalProvisionerAPI wh *WorkspaceHandler platformURL string configsDir string } -func NewTeamHandler(b *events.Broadcaster, p *provisioner.Provisioner, wh *WorkspaceHandler, platformURL, configsDir string) *TeamHandler { - h := &TeamHandler{ +// NewTeamHandler constructs a TeamHandler. Backend selection (Docker vs +// CP) goes through h.wh.StopWorkspaceAuto + h.wh.provisionWorkspaceAuto; +// no per-handler provisioner field is needed here. +func NewTeamHandler(b *events.Broadcaster, wh *WorkspaceHandler, platformURL, configsDir string) *TeamHandler { + return &TeamHandler{ broadcaster: b, wh: wh, platformURL: platformURL, configsDir: configsDir, } - // Avoid the typed-nil interface trap (see NewWorkspaceHandler note). - if p != nil { - h.provisioner = p - } - return h } // Expand handles POST /workspaces/:id/expand @@ -203,9 +195,14 @@ func (h *TeamHandler) Collapse(c *gin.Context) { continue } - // Stop container if provisioner available - if h.provisioner != nil { - h.provisioner.Stop(ctx, childID) + // Stop the workload via the backend dispatcher (CP for SaaS, + // Docker for self-hosted). Pre-2026-05-05 this was + // `if h.provisioner != nil { h.provisioner.Stop(...) }`, which + // silently skipped on every SaaS tenant — child EC2s kept running + // after team-collapse until the orphan sweeper caught them + // (issue #2813). + if err := h.wh.StopWorkspaceAuto(ctx, childID); err != nil { + log.Printf("Team collapse: stop %s failed: %v — orphan sweeper will reconcile", childID, err) } // Mark as removed diff --git a/workspace-server/internal/handlers/team_test.go b/workspace-server/internal/handlers/team_test.go index e909308d..1967ee1f 100644 --- a/workspace-server/internal/handlers/team_test.go +++ b/workspace-server/internal/handlers/team_test.go @@ -34,7 +34,7 @@ func TestTeamCollapse_NoChildren(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) broadcaster := newTestBroadcaster() - handler := NewTeamHandler(broadcaster, nil, nil, "http://localhost:8080", "/tmp/configs") + handler := NewTeamHandler(broadcaster, NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()), "http://localhost:8080", "/tmp/configs") // No children mock.ExpectQuery("SELECT id, name FROM workspaces WHERE parent_id"). @@ -66,7 +66,7 @@ func TestTeamCollapse_WithChildren(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) broadcaster := newTestBroadcaster() - handler := NewTeamHandler(broadcaster, nil, nil, "http://localhost:8080", "/tmp/configs") + handler := NewTeamHandler(broadcaster, NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()), "http://localhost:8080", "/tmp/configs") // Two children mock.ExpectQuery("SELECT id, name FROM workspaces WHERE parent_id"). @@ -122,7 +122,7 @@ func TestTeamCollapse_WithChildren(t *testing.T) { func TestTeamExpand_WorkspaceNotFound(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) - handler := NewTeamHandler(newTestBroadcaster(), nil, nil, "http://localhost:8080", "/tmp/configs") + handler := NewTeamHandler(newTestBroadcaster(), NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()), "http://localhost:8080", "/tmp/configs") mock.ExpectQuery("SELECT name, tier, status FROM workspaces WHERE id"). WithArgs("ws-missing"). @@ -143,7 +143,7 @@ func TestTeamExpand_WorkspaceNotFound(t *testing.T) { func TestTeamExpand_NoConfigFound(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) - handler := NewTeamHandler(newTestBroadcaster(), nil, nil, "http://localhost:8080", t.TempDir()) + handler := NewTeamHandler(newTestBroadcaster(), NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()), "http://localhost:8080", t.TempDir()) mock.ExpectQuery("SELECT name, tier, status FROM workspaces WHERE id"). WithArgs("ws-1"). @@ -167,7 +167,7 @@ func TestTeamExpand_EmptySubWorkspaces(t *testing.T) { setupTestRedis(t) configDir := makeTeamConfigDir(t, "myagent", "name: MyAgent\nsub_workspaces: []\n") - handler := NewTeamHandler(newTestBroadcaster(), nil, nil, "http://localhost:8080", configDir) + handler := NewTeamHandler(newTestBroadcaster(), NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()), "http://localhost:8080", configDir) mock.ExpectQuery("SELECT name, tier, status FROM workspaces WHERE id"). WithArgs("ws-1"). @@ -199,7 +199,7 @@ sub_workspaces: role: code-reviewer ` configDir := makeTeamConfigDir(t, "teamlead", yaml) - handler := NewTeamHandler(broadcaster, nil, nil, "http://localhost:8080", configDir) + handler := NewTeamHandler(broadcaster, NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()), "http://localhost:8080", configDir) mock.ExpectQuery("SELECT name, tier, status FROM workspaces WHERE id"). WithArgs("ws-lead"). diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go index 27051e23..8d25ed6e 100644 --- a/workspace-server/internal/handlers/workspace.go +++ b/workspace-server/internal/handlers/workspace.go @@ -170,6 +170,43 @@ func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath stri return false } +// StopWorkspaceAuto picks the backend (CP for SaaS, local Docker for +// self-hosted) and stops the workspace synchronously. Returns nil when +// neither backend is wired (a workspace nobody is running can't be +// stopped — that's a no-op, not an error). +// +// Single source of truth for "stop a workspace" — symmetric with +// provisionWorkspaceAuto. Pre-2026-05-05 the stop side had no Auto +// dispatcher and every caller wrote `if h.provisioner != nil { Stop }`, +// which silently leaked EC2s on SaaS: +// - team.go:208 (Collapse) — issue #2813 +// - workspace_crud.go:432 (stopAndRemove during Delete) — issue #2814 +// +// Both bugs reproduced for ~6 months. The pattern is the same drift +// class as the org-import provision bug closed by PR #2811. +// +// Why CP wins when both are wired (matching provisionWorkspaceAuto): +// production runs exactly one backend at a time — a SaaS tenant has +// cpProv set + provisioner nil; a self-hosted operator has provisioner +// set + cpProv nil. The "both set" case only arises in test fixtures, +// and the CP-wins ordering matches how Auto picks for provisioning so +// the test stubs stay on a single side. +// +// Volume cleanup (workspace_crud.go) stays Docker-only — CP-managed +// workspaces have no volumes to clean. Callers that need that extra +// step keep their `if h.provisioner != nil { RemoveVolume(...) }` +// gate AFTER calling StopWorkspaceAuto. The abstraction here is "stop +// the running workload," not "tear down all state." +func (h *WorkspaceHandler) StopWorkspaceAuto(ctx context.Context, workspaceID string) error { + if h.cpProv != nil { + return h.cpProv.Stop(ctx, workspaceID) + } + if h.provisioner != nil { + return h.provisioner.Stop(ctx, workspaceID) + } + return nil +} + // SetEnvMutators wires a provisionhook.Registry into the handler. Plugins // living in separate repos register on the same Registry instance during // boot (see cmd/server/main.go) and main.go calls this setter once before diff --git a/workspace-server/internal/handlers/workspace_crud.go b/workspace-server/internal/handlers/workspace_crud.go index f254ea86..4e58804f 100644 --- a/workspace-server/internal/handlers/workspace_crud.go +++ b/workspace-server/internal/handlers/workspace_crud.go @@ -420,22 +420,33 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) { var stopErrs []error stopAndRemove := func(wsID string) { - if h.provisioner == nil { - return - } - // Check Stop's error before attempting RemoveVolume — the - // previous code discarded it and immediately tried the - // volume remove, which always fails with "volume in use" - // when Stop didn't actually kill the container. The orphan - // sweeper (registry/orphan_sweeper.go) catches what we - // skip here on the next reconcile pass. - if err := h.provisioner.Stop(cleanupCtx, wsID); err != nil { - log.Printf("Delete %s container stop failed: %v — leaving volume for orphan sweeper", wsID, err) + // Stop the workload first via the backend dispatcher (CP for + // SaaS, Docker for self-hosted). Pre-2026-05-05 this gate was + // `if h.provisioner == nil { return }` — early-returning on + // every SaaS tenant left the EC2 running with no DB row to + // track it (issue #2814; the comment below claimed "loud-fail + // instead of silent-leak" but the early-return made it the + // silent path on SaaS). + // + // Check Stop's error before any volume cleanup — the previous + // code discarded it and immediately tried RemoveVolume, which + // always fails with "volume in use" when Stop didn't actually + // kill the container. The orphan sweeper + // (registry/orphan_sweeper.go) catches what we skip here on + // the next reconcile pass. + if err := h.StopWorkspaceAuto(cleanupCtx, wsID); err != nil { + log.Printf("Delete %s stop failed: %v — leaving cleanup for orphan sweeper", wsID, err) stopErrs = append(stopErrs, fmt.Errorf("stop %s: %w", wsID, err)) return } - if err := h.provisioner.RemoveVolume(cleanupCtx, wsID); err != nil { - log.Printf("Delete %s volume removal warning: %v", wsID, err) + // Volume cleanup is Docker-only — CP-managed workspaces have + // no host-bind volumes to remove. Skip silently when no Docker + // provisioner is wired (the SaaS path already terminated the + // EC2 above; nothing left to do). + if h.provisioner != nil { + if err := h.provisioner.RemoveVolume(cleanupCtx, wsID); err != nil { + log.Printf("Delete %s volume removal warning: %v", wsID, err) + } } } diff --git a/workspace-server/internal/handlers/workspace_provision_auto_test.go b/workspace-server/internal/handlers/workspace_provision_auto_test.go index 9198e8f6..ff9bdede 100644 --- a/workspace-server/internal/handlers/workspace_provision_auto_test.go +++ b/workspace-server/internal/handlers/workspace_provision_auto_test.go @@ -41,7 +41,9 @@ import ( type trackingCPProv struct { mu sync.Mutex started []string + stopped []string startErr error + stopErr error } func (r *trackingCPProv) Start(_ context.Context, cfg provisioner.WorkspaceConfig) (string, error) { @@ -53,12 +55,25 @@ func (r *trackingCPProv) Start(_ context.Context, cfg provisioner.WorkspaceConfi } return "i-stub-" + cfg.WorkspaceID, nil } -func (r *trackingCPProv) Stop(_ context.Context, _ string) error { return nil } +func (r *trackingCPProv) Stop(_ context.Context, workspaceID string) error { + r.mu.Lock() + r.stopped = append(r.stopped, workspaceID) + r.mu.Unlock() + return r.stopErr +} func (r *trackingCPProv) GetConsoleOutput(_ context.Context, _ string) (string, error) { return "", nil } func (r *trackingCPProv) IsRunning(_ context.Context, _ string) (bool, error) { return true, nil } +func (r *trackingCPProv) stoppedSnapshot() []string { + r.mu.Lock() + defer r.mu.Unlock() + out := make([]string, len(r.stopped)) + copy(out, r.stopped) + return out +} + func (r *trackingCPProv) startedSnapshot() []string { r.mu.Lock() defer r.mu.Unlock() @@ -432,3 +447,194 @@ func TestOrgImportGate_UsesHasProvisionerNotBareField(t *testing.T) { t.Errorf("org_import.go must call h.workspace.HasProvisioner() in the provisioning gate — current code does not") } } + +// TestStopWorkspaceAuto_RoutesToCPWhenSet — symmetric with the +// provision dispatcher test above. SaaS tenants run with cpProv set +// and the local Docker provisioner nil; Auto must route Stop to CP +// (= terminate the EC2). Pre-2026-05-05 the absence of this dispatcher +// meant team-collapse + workspace-delete called h.provisioner.Stop +// directly, no-oping on every SaaS tenant — issue #2813 (collapse) and +// #2814 (delete) both leak EC2s for ~6 months. +func TestStopWorkspaceAuto_RoutesToCPWhenSet(t *testing.T) { + rec := &trackingCPProv{} + bcast := &concurrentSafeBroadcaster{} + h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir()) + h.SetCPProvisioner(rec) + + wsID := "ws-stop-routes-cp" + if err := h.StopWorkspaceAuto(context.Background(), wsID); err != nil { + t.Fatalf("StopWorkspaceAuto returned err with CP wired: %v", err) + } + got := rec.stoppedSnapshot() + if len(got) != 1 || got[0] != wsID { + t.Errorf("expected cpProv.Stop invoked once with %q, got %v", wsID, got) + } +} + +// TestStopWorkspaceAuto_RoutesToDockerWhenOnlyDocker — self-hosted +// operators run with the local Docker provisioner wired and cpProv nil. +// Auto must route to Docker. +// +// Stub-injects a LocalProvisionerAPI via a private constructor pattern +// so we don't need a real Docker daemon. NewWorkspaceHandler's +// constructor takes *provisioner.Provisioner (concrete) so we set the +// interface field directly. +func TestStopWorkspaceAuto_RoutesToDockerWhenOnlyDocker(t *testing.T) { + bcast := &concurrentSafeBroadcaster{} + h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir()) + stub := &stoppingLocalProv{} + h.provisioner = stub + + wsID := "ws-stop-routes-docker" + if err := h.StopWorkspaceAuto(context.Background(), wsID); err != nil { + t.Fatalf("StopWorkspaceAuto returned err with Docker wired: %v", err) + } + if len(stub.stopped) != 1 || stub.stopped[0] != wsID { + t.Errorf("expected Docker provisioner.Stop invoked once with %q, got %v", wsID, stub.stopped) + } +} + +// TestStopWorkspaceAuto_NoBackendIsNoOp — when neither backend is wired +// (misconfigured deployment, or test fixture), StopWorkspaceAuto returns +// nil silently. Distinct from provisionWorkspaceAuto's mark-failed +// behavior: there's no row state to mark "failed to stop" against, and +// the absence of a backend means nothing was running to stop. +func TestStopWorkspaceAuto_NoBackendIsNoOp(t *testing.T) { + bcast := &concurrentSafeBroadcaster{} + h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir()) + // Neither SetCPProvisioner nor a Docker provisioner — both nil. + + if err := h.StopWorkspaceAuto(context.Background(), "ws-noback"); err != nil { + t.Errorf("expected nil error on no-backend stop, got %v", err) + } +} + +// stoppingLocalProv is a minimal LocalProvisionerAPI stub that records +// Stop invocations. Other methods panic — guards against accidental +// use by tests that should be using a different stub. +type stoppingLocalProv struct { + stopped []string +} + +func (s *stoppingLocalProv) Stop(_ context.Context, workspaceID string) error { + s.stopped = append(s.stopped, workspaceID) + return nil +} +func (s *stoppingLocalProv) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) { + panic("stoppingLocalProv: Start not implemented for this test") +} +func (s *stoppingLocalProv) IsRunning(_ context.Context, _ string) (bool, error) { + panic("stoppingLocalProv: IsRunning not implemented for this test") +} +func (s *stoppingLocalProv) ExecRead(_ context.Context, _, _ string) ([]byte, error) { + panic("stoppingLocalProv: ExecRead not implemented for this test") +} +func (s *stoppingLocalProv) RemoveVolume(_ context.Context, _ string) error { + panic("stoppingLocalProv: RemoveVolume not implemented for this test") +} +func (s *stoppingLocalProv) VolumeHasFile(_ context.Context, _, _ string) (bool, error) { + panic("stoppingLocalProv: VolumeHasFile not implemented for this test") +} +func (s *stoppingLocalProv) WriteAuthTokenToVolume(_ context.Context, _, _ string) error { + panic("stoppingLocalProv: WriteAuthTokenToVolume not implemented for this test") +} + +// TestNoCallSiteCallsBareStop — source-level pin against the bug +// pattern that motivated this PR. Any non-test handler that wants to +// "stop the workload" must go through h.X.StopWorkspaceAuto, not bare +// h.X.provisioner.Stop / h.X.cpProv.Stop / h.X.Stop. Pre-2026-05-05 +// team.go and workspace_crud.go both called h.provisioner.Stop directly +// inside `if h.provisioner != nil { ... }` gates — silent no-op on +// SaaS, EC2 leak (#2813, #2814). +// +// Allowed exceptions: +// - workspace.go: defines StopWorkspaceAuto (the dispatcher itself). +// - workspace_provision.go: defines per-backend Start/Stop bodies. +// - workspace_restart.go: pre-dates the dispatchers and uses manual +// if-cpProv-else dispatch with retry semantics tuned for the +// restart hot path. Functionally equivalent + wraps cpStopWithRetry, +// so it's not the bug class this gate targets — but it IS +// architectural duplication, tracked under #2799. +// - container_files.go: drives Docker daemon directly for file-copy +// short-lived containers; no workspace-level Stop semantics. +func TestNoCallSiteCallsBareStop(t *testing.T) { + wd, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + entries, err := os.ReadDir(wd) + if err != nil { + t.Fatalf("readdir: %v", err) + } + bareShapes := []string{ + ".provisioner.Stop(", + ".cpProv.Stop(", + } + allowedFiles := map[string]bool{ + "workspace.go": true, + "workspace_provision.go": true, + "workspace_restart.go": true, + "container_files.go": true, + } + for _, entry := range entries { + name := entry.Name() + if filepath.Ext(name) != ".go" { + continue + } + if len(name) > len("_test.go") && + name[len(name)-len("_test.go"):] == "_test.go" { + continue + } + if allowedFiles[name] { + continue + } + src, err := os.ReadFile(filepath.Join(wd, name)) + if err != nil { + t.Fatalf("read %s: %v", name, err) + } + // Strip line + block comments before substring check — the gate + // targets call expressions in real code, not historical + // references in documentation/comments. Without this, comments + // describing the old buggy shape (kept on purpose for + // archaeology) trip the test. + stripped := stripGoComments(src) + for _, needle := range bareShapes { + if bytes.Contains(stripped, []byte(needle)) { + t.Errorf("%s contains bare `%s` — must go through h.X.StopWorkspaceAuto so SaaS tenants route to CP. "+ + "Pre-2026-05-05 team.go and workspace_crud.go did this and silently leaked EC2s on every SaaS collapse / delete (#2813, #2814).", name, needle) + } + } + } +} + +// stripGoComments removes // line comments and /* */ block comments +// from Go source. Imperfect (doesn't handle comments-inside-strings) +// but adequate for the source-level pin tests in this file — none of +// our gated needles legitimately appear inside string literals in the +// handlers package. +func stripGoComments(src []byte) []byte { + out := make([]byte, 0, len(src)) + for i := 0; i < len(src); i++ { + // Block comment + if i+1 < len(src) && src[i] == '/' && src[i+1] == '*' { + i += 2 + for i+1 < len(src) && !(src[i] == '*' && src[i+1] == '/') { + i++ + } + i++ // skip closing / + continue + } + // Line comment — preserve the newline so line counts stay sane + if i+1 < len(src) && src[i] == '/' && src[i+1] == '/' { + for i < len(src) && src[i] != '\n' { + i++ + } + if i < len(src) { + out = append(out, '\n') + } + continue + } + out = append(out, src[i]) + } + return out +} diff --git a/workspace-server/internal/router/router.go b/workspace-server/internal/router/router.go index 516aa99d..bd542e56 100644 --- a/workspace-server/internal/router/router.go +++ b/workspace-server/internal/router/router.go @@ -230,7 +230,7 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi r.GET("/approvals/pending", middleware.AdminAuth(db.DB), apph.ListAll) // Team Expansion - teamh := handlers.NewTeamHandler(broadcaster, prov, wh, platformURL, configsDir) + teamh := handlers.NewTeamHandler(broadcaster, wh, platformURL, configsDir) wsAuth.POST("/expand", teamh.Expand) wsAuth.POST("/collapse", teamh.Collapse) From beab899501db790968891eecf77414c34b5f4e00 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 20:02:05 -0700 Subject: [PATCH 04/28] feat(ConfigTab): drop Skills/Tools tag inputs, give Prompt Files its own section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User feedback (2026-05-04 conversation): > "Skills and Tools are having their own tab as plugin, and Prompt > Files are in the file system which can be directly edited. Am I > missing something?" > "Tools should be merged into plugin then, and for prompt files... it > should be in another section than in skill& tools" The "Skills & Tools" section in ConfigTab had three TagList inputs: - Skills: managed via the dedicated SkillsTab (per-workspace skill folders) — duplicate UI affordance - Tools: managed via the Plugins tab (install a plugin → its tools become available) — duplicate UI affordance - Prompt Files: load order for system-prompt files — semantically unrelated to skills/tools Drop the Skills + Tools inputs. Move Prompt Files into its own section with explanatory copy that names the auto-loaded files (system-prompt.md, CLAUDE.md, AGENTS.md) and points users at the Files tab for actual editing. Schema fields `config.skills` and `config.tools` are KEPT (load-bearing for runtime skill loading + tool registry); only the inline editor goes away. Operators who need to edit them can still use the Raw YAML toggle. Tests: - New ConfigTab.sections.test.tsx with 4 cases: 1. "Skills & Tools" section title is gone 2. Skills tag input is absent 3. Tools tag input is absent 4. Prompt Files section exists with explanatory copy Sibling ConfigTab tests (hermes, provider) all still pass (20/20). Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/components/tabs/ConfigTab.tsx | 22 ++- .../__tests__/ConfigTab.sections.test.tsx | 125 ++++++++++++++++++ 2 files changed, 143 insertions(+), 4 deletions(-) create mode 100644 canvas/src/components/tabs/__tests__/ConfigTab.sections.test.tsx diff --git a/canvas/src/components/tabs/ConfigTab.tsx b/canvas/src/components/tabs/ConfigTab.tsx index fdb8eb38..9447d09a 100644 --- a/canvas/src/components/tabs/ConfigTab.tsx +++ b/canvas/src/components/tabs/ConfigTab.tsx @@ -886,10 +886,24 @@ export function ConfigTab({ workspaceId }: Props) { )} -
- update("skills", v)} placeholder="e.g. code-review" /> - update("tools", v)} placeholder="e.g. web_search, filesystem" /> - update("prompt_files", v)} placeholder="e.g. system-prompt.md" /> + {/* Skills + Tools used to live here as TagList inputs. They were + redundant with their dedicated tabs: + - Skills → managed via SkillsTab (per-workspace skill folders) + - Tools → managed via the Plugins tab (install/uninstall) + Editing them here only set the config.yaml field; the + actual install/load happened elsewhere. Removed to stop + showing the misnamed list-input affordance. */} + +
+

+ Markdown files that compose this workspace's system prompt. + Loaded in order at boot from the workspace config dir + (e.g. system-prompt.md,{' '} + CLAUDE.md,{' '} + AGENTS.md). Edit the file + contents directly via the Files tab. +

+ update("prompt_files", v)} placeholder="e.g. system-prompt.md" />
diff --git a/canvas/src/components/tabs/__tests__/ConfigTab.sections.test.tsx b/canvas/src/components/tabs/__tests__/ConfigTab.sections.test.tsx new file mode 100644 index 00000000..2e083fb3 --- /dev/null +++ b/canvas/src/components/tabs/__tests__/ConfigTab.sections.test.tsx @@ -0,0 +1,125 @@ +// @vitest-environment jsdom +// +// Regression tests for the ConfigTab section restructure (user feedback +// 2026-05-04: "Skills and Tools are having their own tab as plugin, and +// Prompt Files are in the file system which can be directly edited. Am +// I missing something?" + "Tools should be merged into plugin then, and +// for prompt files... should be in another section than in skill& tools"). +// +// What this pins: +// 1. The "Skills & Tools" section title is gone. +// 2. Editable Skills + Tools tag inputs are gone (managed elsewhere). +// 3. A dedicated "Prompt Files" section exists with explanatory text. +// +// If a future PR re-adds the Skills/Tools tag inputs to ConfigTab, this +// suite catches it. + +import { describe, it, expect, vi, afterEach, beforeEach } from "vitest"; +import { render, screen, cleanup, waitFor, fireEvent } from "@testing-library/react"; +import React from "react"; + +afterEach(cleanup); + +const apiGet = vi.fn(); +vi.mock("@/lib/api", () => ({ + api: { + get: (path: string) => apiGet(path), + patch: vi.fn(), + put: vi.fn(), + post: vi.fn(), + del: vi.fn(), + }, +})); + +const storeUpdateNodeData = vi.fn(); +const storeRestartWorkspace = vi.fn(); +vi.mock("@/store/canvas", () => ({ + useCanvasStore: Object.assign( + (selector: (s: unknown) => unknown) => + selector({ restartWorkspace: storeRestartWorkspace, updateNodeData: storeUpdateNodeData }), + { + getState: () => ({ + restartWorkspace: storeRestartWorkspace, + updateNodeData: storeUpdateNodeData, + }), + }, + ), +})); + +vi.mock("../AgentCardSection", () => ({ + AgentCardSection: () =>
, +})); + +import { ConfigTab } from "../ConfigTab"; + +beforeEach(() => { + apiGet.mockReset(); + apiGet.mockImplementation((path: string) => { + if (path === `/workspaces/ws-test`) { + return Promise.resolve({ runtime: "claude-code" }); + } + if (path === `/workspaces/ws-test/model`) { + return Promise.resolve({ model: "claude-opus-4-7" }); + } + if (path === `/workspaces/ws-test/provider`) { + return Promise.resolve({ provider: "anthropic-oauth", source: "default" }); + } + if (path === `/workspaces/ws-test/files/config.yaml`) { + return Promise.resolve({ content: "name: test\nruntime: claude-code\n" }); + } + if (path === "/templates") { + return Promise.resolve([ + { id: "claude-code", name: "Claude Code", runtime: "claude-code", providers: [] }, + ]); + } + return Promise.reject(new Error(`unmocked api.get: ${path}`)); + }); +}); + +describe("ConfigTab section restructure", () => { + it("does not render a 'Skills & Tools' section title", async () => { + render(); + await waitFor(() => expect(apiGet).toHaveBeenCalled()); + // Section button uses the title as its accessible name; should be absent. + expect(screen.queryByRole("button", { name: /Skills\s*&\s*Tools/i })).toBeNull(); + }); + + it("does not render an editable Skills tag input", async () => { + render(); + await waitFor(() => expect(apiGet).toHaveBeenCalled()); + // TagList renders its label; check no input labelled "Skills" in the form. + // (Skills are managed via the dedicated Skills tab.) + const skillsLabels = screen + .queryAllByText(/^Skills$/) + .filter((el) => el.tagName.toLowerCase() === "label"); + expect(skillsLabels).toHaveLength(0); + }); + + it("does not render an editable Tools tag input", async () => { + render(); + await waitFor(() => expect(apiGet).toHaveBeenCalled()); + // Tools are managed via the Plugins tab — install a plugin → its tools + // become available. No reason to type tool names here. + const toolsLabels = screen + .queryAllByText(/^Tools$/) + .filter((el) => el.tagName.toLowerCase() === "label"); + expect(toolsLabels).toHaveLength(0); + }); + + it("renders a dedicated 'Prompt Files' section with explanatory copy", async () => { + render(); + await waitFor(() => expect(apiGet).toHaveBeenCalled()); + // Section is collapsed by default — find + expand first. + const sectionButton = screen.getByRole("button", { name: /Prompt Files/i }); + expect(sectionButton).toBeTruthy(); + fireEvent.click(sectionButton); + // Explanatory copy mentions system-prompt.md (split across tags + // so use textContent on any element rather than the default text matcher). + await waitFor(() => { + const matches = screen.queryAllByText((_, el) => + (el?.textContent || "").includes("system-prompt.md"), + ); + expect(matches.length).toBeGreaterThan(0); + }); + }); +}); From 7cc1c39c49c97d504b62ebaaff982d75c6925bcd Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 20:21:59 -0700 Subject: [PATCH 05/28] ci: e2e coverage matrix + branch-protection-as-code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #9. Three pieces, all small: 1. **docs/e2e-coverage.md** — source of truth for which E2E suites guard which surfaces. Today three were running but informational only on staging; that's how the org-import silent-drop bug shipped without a test catching it pre-merge. Now the matrix shows what's required where + a follow-up note for the two suites that need an always-emit refactor before they can be required. 2. **tools/branch-protection/apply.sh** — branch protection as code. Lets `staging` and `main` required-checks live in a reviewable shell script instead of UI clicks that get lost between admins. This PR's net change: add `E2E API Smoke Test` and `Canvas tabs E2E` as required on staging. Both already use the always-emit path-filter pattern (no-op step emits SUCCESS when the workflow's paths weren't touched), so making them required can't deadlock unrelated PRs. 3. **branch-protection-drift.yml** — daily cron + drift_check.sh that compares live protection against apply.sh's desired state. Catches out-of-band UI edits before they drift further. Fails the workflow on mismatch; ops re-runs apply.sh or updates the script. Out of scope (filed as follow-ups): - e2e-staging-saas + e2e-staging-external use plain `paths:` filters and never trigger when paths are unchanged. They need refactoring to the always-emit shape (same as e2e-api / e2e-staging-canvas) before they can be required. - main branch protection mirrors staging here; if main wants the E2E SaaS / External added later, do it in apply.sh and rerun. Operator must apply once after merge: bash tools/branch-protection/apply.sh The drift check picks it up from there. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/branch-protection-drift.yml | 42 ++++++ docs/e2e-coverage.md | 58 ++++++++ tools/branch-protection/apply.sh | 133 ++++++++++++++++++ tools/branch-protection/drift_check.sh | 44 ++++++ 4 files changed, 277 insertions(+) create mode 100644 .github/workflows/branch-protection-drift.yml create mode 100644 docs/e2e-coverage.md create mode 100755 tools/branch-protection/apply.sh create mode 100755 tools/branch-protection/drift_check.sh diff --git a/.github/workflows/branch-protection-drift.yml b/.github/workflows/branch-protection-drift.yml new file mode 100644 index 00000000..7b05f09f --- /dev/null +++ b/.github/workflows/branch-protection-drift.yml @@ -0,0 +1,42 @@ +name: branch-protection drift check + +# Catches out-of-band edits to branch protection (UI clicks, manual gh +# api PATCH from a one-off ops session) by comparing live state against +# tools/branch-protection/apply.sh's desired state every day. Fails the +# workflow when they drift; the failure is the signal. +# +# When it fails: re-run apply.sh to put the live state back to the +# script's intent, OR update apply.sh to encode the new intent and +# commit. Either way the script is the source of truth. + +on: + schedule: + # 14:00 UTC daily. Off-hours for most teams; gives a fresh signal + # at the start of every working day. + - cron: '0 14 * * *' + workflow_dispatch: + pull_request: + branches: [staging, main] + paths: + - 'tools/branch-protection/**' + - '.github/workflows/branch-protection-drift.yml' + +permissions: + contents: read + +jobs: + drift: + name: Branch protection drift + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Run drift check + env: + # GH_TOKEN_FOR_ADMIN_API — repo-admin scope, needed for the + # /branches/:b/protection endpoint. Falls back to GITHUB_TOKEN + # for read-only mode (gh api on the protection endpoint + # returns 200 to maintainers, 404 to read-only). Configure at + # Settings → Secrets and variables → Actions. + GH_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADMIN_API || secrets.GITHUB_TOKEN }} + run: bash tools/branch-protection/drift_check.sh diff --git a/docs/e2e-coverage.md b/docs/e2e-coverage.md new file mode 100644 index 00000000..48ca0e19 --- /dev/null +++ b/docs/e2e-coverage.md @@ -0,0 +1,58 @@ +# E2E coverage matrix + +This document is the source of truth for which E2E suites guard which surfaces and which gates are wired up where. Read this before adding a new E2E or moving a check between branches. + +## Suites + +| Workflow file | Job (= required-check name) | What it covers | Cron | +|---|---|---|---| +| `e2e-api.yml` | `E2E API Smoke Test` | A2A handshake, registry/register, /workspaces/:id/a2a forward, structured-event emission. Lightweight enough to run on every PR. | — | +| `e2e-staging-canvas.yml` | `Canvas tabs E2E` | Canvas-tab Playwright UX checks against staging — config tab, secrets tab, agent-card tab, Activity hydration. | weekly Sun 08:00 UTC | +| `e2e-staging-saas.yml` | `E2E Staging SaaS` | Full lifecycle: org creation → workspace provision (CP path) → A2A delegation → status/heartbeat → workspace delete → EC2 termination. The integration test that catches the silent-drop bug class (#2486 / #2811 / #2813 / #2814). | daily 07:00 UTC | +| `e2e-staging-external.yml` | `E2E Staging External Runtime` | External-runtime registration + heartbeat staleness sweep + `/registry/peers` resolution. Validates the OSS-templated workspace path. | daily 07:30 UTC | +| `e2e-staging-sanity.yml` | `Intentional-failure teardown sanity` | Inverted assertion — the run MUST fail. Validates the leak-detection self-check itself; not for general gating. | weekly Mon 06:00 UTC | +| `continuous-synth-e2e.yml` | `Synthetic E2E against staging` | Standing background coverage between PR runs. Catches drift in production-like staging that PR-time E2Es miss. | every 15 min | + +## Required-check status (branch protection) + +| Suite | staging required | main required | +|---|---|---| +| `E2E API Smoke Test` | ✅ this PR | ✅ | +| `Canvas tabs E2E` | ✅ this PR | (see follow-up) | +| `E2E Staging SaaS` | ❌ — needs always-emit refactor | ❌ | +| `E2E Staging External Runtime` | ❌ — needs always-emit refactor | ❌ | +| `Intentional-failure teardown sanity` | ❌ inverted assertion, never required | ❌ | +| `Synthetic E2E against staging` | ❌ cron-only, not a per-PR gate | ❌ | + +## Why the always-emit pattern matters + +Branch protection requires a *check name* to land at SUCCESS for every PR. Workflows with `paths:` filters that exclude a PR never run, so the check name never appears, and the PR sits BLOCKED forever. + +The pattern that supports being required is: + +1. Workflow always triggers on push/PR to the protected branch. +2. A `detect-changes` job uses `dorny/paths-filter` to decide if real work runs. +3. The protected job runs unconditionally and either (a) does real work when paths matched, or (b) emits a no-op SUCCESS step when paths skipped. + +`e2e-api.yml` and `e2e-staging-canvas.yml` already have this shape. `e2e-staging-saas.yml` and `e2e-staging-external.yml` use plain `paths:` filters and need the refactor before they can be required (filed as follow-up). + +## Adding a new E2E suite + +1. Pick a verb: smoke test, full lifecycle, fault-injection, drift detection. Pre-existing suites split along these lines. +2. Use the always-emit shape so the check name can be made required. +3. Add a row to the matrix above. +4. Decide cron cadence based on cost + how fast drift would otherwise be caught. +5. If you want it required, add to the relevant branch protection via `tools/branch-protection/apply.sh` (this PR adds the script). + +## When to break glass — temporarily skip a required E2E + +Don't. If an E2E is intermittently flaky, fix the test or move it out of required. The point of a required check is that it's load-bearing; bypassing one with admin override teaches the next operator the gate is optional. + +If a Production incident requires bypassing, document the override in the incident postmortem with a same-week followup to either fix the test or rip the check out of required. + +## Related issues / PRs + +- #2486 — silent-drop bug class that the SaaS E2E now catches +- PR #2811 — `provisionWorkspaceAuto` consolidation (org-import SaaS gate) +- PR #2824 — `StopWorkspaceAuto` mirror (closes #2813 + #2814) +- Follow-up: refactor `e2e-staging-saas` + `e2e-staging-external` to always-emit (so they can be required) diff --git a/tools/branch-protection/apply.sh b/tools/branch-protection/apply.sh new file mode 100755 index 00000000..928d2829 --- /dev/null +++ b/tools/branch-protection/apply.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +# tools/branch-protection/apply.sh — idempotently apply branch +# protection to molecule-core's `staging` and `main` branches. +# +# Why a script: GitHub's branch protection lives in repo settings, so +# changes are usually clicked through the UI and lost between admins. +# This script makes the config reproducible — diff it against the live +# state, change the file, run it, done. Single source of truth that +# shows up in code review. +# +# Usage: +# tools/branch-protection/apply.sh # apply both branches +# tools/branch-protection/apply.sh --dry-run # show payload only +# tools/branch-protection/apply.sh --branch staging +# +# Requires: gh CLI authenticated as a repo admin. The script uses gh's +# token (no separate PAT needed). + +set -euo pipefail + +REPO="Molecule-AI/molecule-core" +DRY_RUN=0 +ONLY_BRANCH="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --dry-run) DRY_RUN=1; shift ;; + --branch) ONLY_BRANCH="$2"; shift 2 ;; + -h|--help) + echo "Usage: $0 [--dry-run] [--branch ]" + exit 0 + ;; + *) echo "Unknown arg: $1" >&2; exit 1 ;; + esac +done + +# Required-check matrices. Each branch's set is the canonical list of +# check NAMES (from each workflow's job-name). Adding/removing a check +# here is the place to do it. Match docs/e2e-coverage.md. +# +# Why staging gets E2E API + Canvas E2E (this PR's addition): both +# already use the always-emit pattern (path-filter no-ops emit SUCCESS), +# so making them required can't deadlock a PR that doesn't touch their +# paths. The other E2Es (SaaS, External) need a refactor to that +# pattern before they can be required — tracked as follow-up. + +read -r -d '' STAGING_CHECKS <<'EOF' || true +Analyze (go) +Analyze (javascript-typescript) +Analyze (python) +Canvas (Next.js) +Canvas tabs E2E +Detect changes +E2E API Smoke Test +Platform (Go) +Python Lint & Test +Scan diff for credential-shaped strings +Shellcheck (E2E scripts) +EOF + +read -r -d '' MAIN_CHECKS <<'EOF' || true +Analyze (go) +Analyze (javascript-typescript) +Analyze (python) +Canvas (Next.js) +Canvas tabs E2E +Detect changes +E2E API Smoke Test +PR-built wheel + import smoke +Platform (Go) +Python Lint & Test +Scan diff for credential-shaped strings +Shellcheck (E2E scripts) +EOF + +build_payload() { + local checks="$1" + local require_reviews="$2" # true / false + local checks_json + checks_json=$(printf '%s\n' "$checks" | jq -Rs ' + split("\n") + | map(select(length > 0)) + | map({context: ., app_id: -1}) + ') + jq -n \ + --argjson checks "$checks_json" \ + --argjson reviews "$require_reviews" \ + '{ + required_status_checks: { + strict: false, + checks: $checks + }, + enforce_admins: false, + required_pull_request_reviews: ( + if $reviews then + { required_approving_review_count: 1, dismiss_stale_reviews: true } + else null end + ), + restrictions: null, + allow_deletions: false, + allow_force_pushes: false, + block_creations: false, + required_conversation_resolution: true, + required_linear_history: false, + lock_branch: false, + allow_fork_syncing: true + }' +} + +apply_branch() { + local branch="$1" + local checks="$2" + local require_reviews="$3" + local payload + payload=$(build_payload "$checks" "$require_reviews") + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "=== branch: $branch ===" + echo "$payload" | jq . + return + fi + echo "Applying branch protection on $branch..." + printf '%s' "$payload" | gh api -X PUT \ + "repos/$REPO/branches/$branch/protection" \ + --input - + echo "Applied: $branch" +} + +if [[ -z "$ONLY_BRANCH" || "$ONLY_BRANCH" == "staging" ]]; then + apply_branch staging "$STAGING_CHECKS" true +fi +if [[ -z "$ONLY_BRANCH" || "$ONLY_BRANCH" == "main" ]]; then + apply_branch main "$MAIN_CHECKS" true +fi diff --git a/tools/branch-protection/drift_check.sh b/tools/branch-protection/drift_check.sh new file mode 100755 index 00000000..b25e23f0 --- /dev/null +++ b/tools/branch-protection/drift_check.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# tools/branch-protection/drift_check.sh — compare the live branch +# protection on staging + main against what apply.sh would set. Used +# by branch-protection-drift.yml (cron) to catch out-of-band UI edits. +# +# Exit codes: +# 0 — live state matches the script +# 1 — drift detected (output shows the diff) +# 2 — gh API call failed + +set -euo pipefail + +REPO="Molecule-AI/molecule-core" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +EXIT_CODE=0 + +check_branch() { + local branch="$1" + local want + want=$(bash "$SCRIPT_DIR/apply.sh" --dry-run --branch "$branch" 2>&1 | + sed -n '/^{$/,/^}$/p' | + jq -S '.required_status_checks.checks | map(.context) | sort') + local have + if ! have=$(gh api "repos/$REPO/branches/$branch/protection/required_status_checks" 2>/dev/null | + jq -S '.checks | map(.context) | sort'); then + echo "drift_check: FAIL to fetch $branch protection (gh API error)" + return 2 + fi + if [[ "$want" != "$have" ]]; then + echo "=== DRIFT on $branch ===" + echo "want:"; echo "$want" + echo "have:"; echo "$have" + diff <(echo "$want") <(echo "$have") || true + return 1 + fi + echo "OK: $branch matches desired state" +} + +for b in staging main; do + if ! check_branch "$b"; then + EXIT_CODE=1 + fi +done +exit "$EXIT_CODE" From 62fc25757cfa3557a1648f5dc1906f07431440f5 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 20:25:10 -0700 Subject: [PATCH 06/28] docs(backends): document Auto-dispatcher SoT pattern + source-level pins MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #10. The 2026-05-05 hongming silent-drop incident shipped because the backends.md parity matrix didn't enforce a "go through the dispatcher" rule — three handlers (TeamHandler.Expand, OrgHandler.createWorkspaceTree, workspace_crud.go's stopAndRemove) silently bypassed routing on SaaS for ~6 months across two distinct verbs. This doc pass: - Adds a "How to dispatch" section that's the canonical answer to "where do I call Start / Stop / Has from?". Names the three dispatchers (provisionWorkspaceAuto, StopWorkspaceAuto, HasProvisioner), their fallbacks, and the allowed exceptions. - Updates the matrix lifecycle rows so every dispatched operation points at the dispatcher source, not the per-backend bodies. - Adds Org-import + Team-collapse rows so the bulk paths are visible to anyone scanning for parity gaps. - Lists the source-level pins (4 of them) under Enforcement so future contributors see them as load-bearing tests, not noise. - Adds a "When you add a NEW dispatch site" section so the next verb (Pause / Hibernate / Snapshot) lands as a dispatcher mirror, not as another bespoke handler that drifts from the existing two. - Refreshes Last audit to 2026-05-05. No code change; doc-only. The SoT abstractions described here landed in PRs #2811 + #2824. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/architecture/backends.md | 47 ++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/docs/architecture/backends.md b/docs/architecture/backends.md index ce01b247..0d5ba827 100644 --- a/docs/architecture/backends.md +++ b/docs/architecture/backends.md @@ -2,7 +2,7 @@ **Status:** living document — update when you ship a feature that touches one backend. **Owner:** workspace-server + controlplane teams. -**Last audit:** 2026-05-02 (Claude agent, PR #TBD). +**Last audit:** 2026-05-05 (Claude agent — `provisionWorkspaceAuto` / `StopWorkspaceAuto` / `HasProvisioner` SoT pattern landed in PRs #2811 + #2824). ## Why this exists @@ -15,16 +15,39 @@ Every user-visible workspace feature should work on both backends unless it is f This document is the canonical matrix. If you are landing a workspace-facing feature, update the row before you merge. +## How to dispatch (the SoT pattern) + +When a handler needs to start, stop, or check whether-something-can-run a workspace, it MUST go through the centralized dispatcher on `WorkspaceHandler`: + +| Need | Use | Source | +|---|---|---| +| Start a workspace | `provisionWorkspaceAuto(ctx, ...)` | `workspace.go:130` | +| Stop a workspace | `StopWorkspaceAuto(ctx, wsID)` | `workspace.go:172` | +| Gate "do we have any backend wired?" | `HasProvisioner()` | `workspace.go:115` | + +Each dispatcher routes to `cpProv.X()` when the SaaS backend is wired, then `provisioner.X()` when the Docker backend is wired, then a defined fallback (`provisionWorkspaceAuto` self-marks-failed; `StopWorkspaceAuto` no-ops; `HasProvisioner` returns false). + +**Rule: do not call `h.cpProv.Stop`, `h.provisioner.Stop`, `h.cpProv.Start`, or `h.provisioner.Start` directly from a handler.** Source-level pins (`TestNoCallSiteCallsDirectProvisionerExceptAuto`, `TestNoCallSiteCallsBareStop`) gate this at CI; they exist because the same drift class shipped twice — TeamHandler.Expand (#2367) bypassed routing on Start, then `team.go:208` + `workspace_crud.go:432` bypassed it on Stop (#2813, #2814) for ~6 months. + +Allowed exceptions (in the source-pin allowlists): +- `workspace.go` and `workspace_provision.go` — define the per-backend bodies the dispatcher routes between. +- `workspace_restart.go` — pre-dates the dispatchers and uses manual if-cpProv-else dispatch with retry semantics tuned for the restart hot path. Consolidation tracked in #2799. +- `container_files.go` — drives the Docker daemon directly for short-lived file-copy containers; no workspace-level Stop semantics involved. + +For "do we have any backend?", use `HasProvisioner()`, never bare `h.provisioner == nil && h.cpProv == nil`. Source-level pin `TestNoBareBothNilCheck` enforces this — added 2026-05-05 after the hongming org-import incident showed the bare check shape was a recurring drift target. + ## The matrix | Feature | File(s) | Docker | EC2 | Verdict | |---|---|---|---|---| | **Lifecycle** | | | | | -| Create | `workspace_provision.go:19-214` | `provisionWorkspace()` → `provisioner.Start()` | `provisionWorkspaceCP()` → `cpProv.Start()` | ✅ parity | +| Create | `workspace.go:130` `provisionWorkspaceAuto` → `provisionWorkspace()` (Docker) / `provisionWorkspaceCP()` (CP) | dispatched | dispatched | ✅ parity (single source of truth, PR #2811) | | Start | `provisioner.go:140-325` | container create + image pull | EC2 `RunInstance` via CP | ✅ parity | -| Stop | `provisioner.go:772-785` | `ContainerRemove(force=true)` + optional volume rm | `DELETE /cp/workspaces/:id` | ✅ parity | +| Stop | `workspace.go:172` `StopWorkspaceAuto` → `provisioner.Stop()` (Docker) / `cpProv.Stop()` (CP) | dispatched | dispatched | ✅ parity (single source of truth, PR #2824) | | Restart | `workspace_restart.go:45-210` | reads runtime from live container before stop | reads runtime from DB only | ⚠️ divergent — config-change + crash window can boot old runtime on EC2 | -| Delete | `workspace_crud.go` | stop + volume rm | stop only (stateless) | ✅ parity (expected divergence on volume cleanup) | +| Delete | `workspace_crud.go` `stopAndRemove` → `StopWorkspaceAuto` + Docker-only `RemoveVolume` | stop + volume rm | stop only (stateless — CP has no volumes) | ✅ parity (PR #2824 closed the SaaS-leak gap) | +| Org-import (bulk Create) | `org_import.go:178` gates on `h.workspace.HasProvisioner()`; routes through `provisionWorkspaceAuto` per workspace | dispatched | dispatched | ✅ parity (PR #2811 closed the SaaS-skip gate) | +| Team-collapse (bulk Stop) | `team.go:206` calls `StopWorkspaceAuto` for each child | dispatched | dispatched | ✅ parity (PR #2824 closed the SaaS-leak gap) | | **Secrets** | | | | | | Create / update | `secrets.go` | DB insert, injected at container start | DB insert, injected via user-data at boot | ✅ parity | | Redaction | `workspace_provision.go:251` | applied at memory-seed time | applied at agent runtime | ⚠️ divergent — timing differs | @@ -76,7 +99,23 @@ This document is the canonical matrix. If you are landing a workspace-facing fea - **`tools/check-template-parity.sh`** (this repo) — ensures `install.sh` and `start.sh` in a template repo forward identical sets of provider keys. Wire into each template repo's CI as `bash $MONOREPO/tools/check-template-parity.sh install.sh start.sh`. - **Contract tests** (stub) — `workspace-server/internal/provisioner/backend_contract_test.go` defines the behaviors every `provisioner.Provisioner` implementation must satisfy. Fails compile when a method drifts between `Docker` and `CPProvisioner`. Scenario-level runs are `t.Skip`'d today pending drift risk #6 (see above) — compile-time assertions still catch method drift. +- **Source-level dispatcher pins** — `workspace_provision_auto_test.go` enforces the SoT pattern documented above: + - `TestNoCallSiteCallsDirectProvisionerExceptAuto` — no handler calls `.provisionWorkspace(` or `.provisionWorkspaceCP(` directly outside the dispatcher's allowlist. + - `TestNoCallSiteCallsBareStop` — no handler calls `.provisioner.Stop(` or `.cpProv.Stop(` directly outside the dispatcher's allowlist (strips Go comments before substring match so archaeology in code comments doesn't trip the gate). + - `TestNoBareBothNilCheck` — no production code uses `h.provisioner == nil && h.cpProv == nil`; must use `!h.HasProvisioner()`. + - `TestOrgImportGate_UsesHasProvisionerNotBareField` — pins the org-import provisioning gate against the bare-Docker-check shape that caused the 2026-05-05 hongming incident. ## How to update this doc When you land a feature that touches a handler dispatch on `h.cpProv != nil`, add or update the matching row. If you can't implement both backends in the same PR, mark the row `docker-only` or `ec2-only` and file an issue tracking the gap. + +### When you add a NEW dispatch site + +If you find yourself writing `if h.cpProv != nil { ... } else if h.provisioner != nil { ... }` for a new operation (Pause, Hibernate, Snapshot, etc.): + +1. Add a `WorkspaceAuto` method on `WorkspaceHandler` next to the existing dispatchers. Mirror the docstring shape: routing, no-backend fallback, ordering rationale. +2. Add a source-level pin in `workspace_provision_auto_test.go` — the bare-call shape your dispatcher replaces, fail when a handler reintroduces it. +3. Add a row to the matrix above with the dispatcher reference. +4. If your operation has retry semantics specific to a hot path, leave them in the original location for now and file a follow-up under #2799 — don't bake retry into the generic dispatcher unless every caller benefits. + +The pattern is "one dispatcher per verb." Don't fold every operation into `provisionWorkspaceAuto` — different verbs have different no-backend fallbacks (mark-failed for Start, no-op for Stop, false for Has). From ba63f76e10a1309022c0e8def3dcfe1fe21996b6 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 20:33:13 -0700 Subject: [PATCH 07/28] feat(canvas/terminal): not-available banner for runtimes without a TTY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-fix TerminalTab tried to open /ws/terminal/ for every workspace including external ones (which have no shell endpoint on the workspace-server). The server returned 404, status flipped to "error", the user saw "Connection failed" with a Reconnect button — reading as a bug when really the runtime intentionally has no TTY. Now: when data.runtime is in RUNTIMES_WITHOUT_TERMINAL (currently just "external"), TerminalTab renders a NotAvailablePanel with a big terminal-off icon and a one-line explanation including the runtime name. The xterm + WebSocket dance is skipped entirely — no spurious 404s, no scary error UI, no Reconnect that can't help. The runtime is determined from the data prop now threaded by SidePanel.tsx (existing pattern for ChatTab/ConfigTab/etc). Tests: 4 new in TerminalTab.notAvailable.test.tsx pin: external renders banner with runtime name, external doesn't open WS, claude- code mounts normally (regression cover for the early-return scope), data omitted falls through (back-compat). Build clean. 1258 tests pass. --- canvas/src/components/SidePanel.tsx | 2 +- canvas/src/components/tabs/TerminalTab.tsx | 91 ++++++++++++++- .../TerminalTab.notAvailable.test.tsx | 107 ++++++++++++++++++ 3 files changed, 198 insertions(+), 2 deletions(-) create mode 100644 canvas/src/components/tabs/__tests__/TerminalTab.notAvailable.test.tsx diff --git a/canvas/src/components/SidePanel.tsx b/canvas/src/components/SidePanel.tsx index 437aff90..2db6c4a1 100644 --- a/canvas/src/components/SidePanel.tsx +++ b/canvas/src/components/SidePanel.tsx @@ -283,7 +283,7 @@ export function SidePanel() { {panelTab === "skills" && } {panelTab === "activity" && } {panelTab === "chat" && } - {panelTab === "terminal" && } + {panelTab === "terminal" && } {panelTab === "config" && } {panelTab === "schedule" && } {panelTab === "channels" && } diff --git a/canvas/src/components/tabs/TerminalTab.tsx b/canvas/src/components/tabs/TerminalTab.tsx index 1c2c2027..43fc8e04 100644 --- a/canvas/src/components/tabs/TerminalTab.tsx +++ b/canvas/src/components/tabs/TerminalTab.tsx @@ -1,16 +1,105 @@ "use client"; import { useEffect, useRef, useState, useCallback } from "react"; +import type { WorkspaceNodeData } from "@/store/canvas"; interface Props { workspaceId: string; + /** Workspace metadata from the canvas store. Optional for back-compat + * with any caller that still mounts + * without threading data through (e.g. tests). When present, the + * runtime field gates the early-return below. */ + data?: WorkspaceNodeData; } import { deriveWsBaseUrl } from "@/lib/ws-url"; const WS_URL = deriveWsBaseUrl(); -export function TerminalTab({ workspaceId }: Props) { +/** + * NotAvailablePanel — full-tab placeholder with a big terminal-off icon + * for runtimes that don't expose a TTY (e.g. external workspaces, where + * the platform doesn't own the process). Pre-fix the tab tried to open + * a WebSocket against /ws/terminal/ for these workspaces, the server + * 404'd, and the user saw "Connection failed" — which reads as a bug, + * not as "this runtime intentionally has no shell". This banner makes + * the absence intentional. + */ +function NotAvailablePanel({ runtime }: { runtime: string }) { + return ( +
+ {/* Big terminal-off icon — bracket "[_]" with a slash through it. + Custom inline SVG so we don't depend on an icon set being + present at canvas build-time. */} + +

Terminal not available

+

+ This workspace runs the{" "} + {runtime} runtime, + which doesn't expose a shell. Use the Chat tab to interact with the + agent directly. +

+
+ ); +} + +/** Runtimes that don't expose a TTY. Keep narrow — only add a runtime + * here when its provisioner genuinely has no shell endpoint, otherwise + * the user loses access to a real debugging surface. */ +const RUNTIMES_WITHOUT_TERMINAL = new Set(["external"]); + +export function TerminalTab({ workspaceId, data }: Props) { + // Early-return for runtimes that have no shell. Skips the entire + // xterm + WebSocket dance below — without this, mounting the tab + // for an external workspace pops the WS, gets a 404 from the + // workspace-server (no /ws/terminal/ route registered for it), + // and shows "Connection failed" with a Reconnect button — confusing + // because the workspace IS healthy, just doesn't have a TTY. + if (data && RUNTIMES_WITHOUT_TERMINAL.has(data.runtime)) { + return ; + } + const containerRef = useRef(null); const termRef = useRef<{ dispose: () => void } | null>(null); const wsRef = useRef(null); diff --git a/canvas/src/components/tabs/__tests__/TerminalTab.notAvailable.test.tsx b/canvas/src/components/tabs/__tests__/TerminalTab.notAvailable.test.tsx new file mode 100644 index 00000000..df955a46 --- /dev/null +++ b/canvas/src/components/tabs/__tests__/TerminalTab.notAvailable.test.tsx @@ -0,0 +1,107 @@ +// @vitest-environment jsdom +// +// Pins the "Terminal not available" early-return added 2026-05-05. +// +// Pre-fix: TerminalTab tried to open /ws/terminal/ for every +// workspace including external runtimes (which have no shell endpoint). +// The server returned 404, status flipped to "error", user saw +// "Connection failed" with a Reconnect button — reading as a bug +// when really the runtime intentionally has no TTY. Now: when +// data.runtime is in RUNTIMES_WITHOUT_TERMINAL, render a banner + +// big icon instead of mounting xterm/WS. +// +// Pinned branches: +// 1. external runtime → "Terminal not available" banner renders, +// runtime name surfaces in the body so the user knows WHY. +// 2. external runtime → xterm + WebSocket are NOT initialised. +// Verified by checking the global WebSocket constructor isn't +// called. +// 3. claude-code (or any other runtime) → no banner, normal mount +// proceeds. Pre-fix regression cover. +// 4. data prop omitted (back-compat with any caller that doesn't +// thread it through) → no early-return, falls through to normal +// mount. Tested via the absence of the banner. + +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; +import { render, screen, cleanup } from "@testing-library/react"; +import React from "react"; + +afterEach(cleanup); + +// xterm + addon-fit are dynamically imported by TerminalTab. Stub them +// so the tests don't pull a 200KB+ dependency just to verify the +// not-available banner. The stubs only matter for the non-banner +// branches; the banner returns BEFORE the dynamic import. +vi.mock("xterm", () => ({ + Terminal: vi.fn().mockImplementation(() => ({ + loadAddon: vi.fn(), + open: vi.fn(), + onData: vi.fn(), + write: vi.fn(), + dispose: vi.fn(), + onResize: vi.fn(), + cols: 80, + rows: 24, + })), +})); +vi.mock("@xterm/addon-fit", () => ({ + FitAddon: vi.fn().mockImplementation(() => ({ + fit: vi.fn(), + })), +})); + +// Track WebSocket constructor calls — this is the load-bearing +// assertion for "external doesn't even try to connect". +let wsConstructed = 0; +beforeEach(() => { + wsConstructed = 0; + (globalThis as unknown as { WebSocket: unknown }).WebSocket = vi + .fn() + .mockImplementation(() => { + wsConstructed++; + return { + addEventListener: vi.fn(), + removeEventListener: vi.fn(), + send: vi.fn(), + close: vi.fn(), + readyState: 0, + }; + }); +}); + +import { TerminalTab } from "../TerminalTab"; + +const externalData = { runtime: "external", status: "online" } as unknown as Parameters< + typeof TerminalTab +>[0]["data"]; + +const claudeData = { runtime: "claude-code", status: "online" } as unknown as Parameters< + typeof TerminalTab +>[0]["data"]; + +describe("TerminalTab not-available early-return for runtimes without TTY", () => { + it("external runtime renders the not-available banner with runtime name", () => { + render(); + expect(screen.getByText(/Terminal not available/i)).not.toBeNull(); + // Runtime name surfaces so user knows WHY there's no terminal. + expect(screen.getByText(/external/)).not.toBeNull(); + }); + + it("external runtime does NOT open a WebSocket", async () => { + render(); + // Wait a tick for any deferred init (there shouldn't be any, but + // tolerate a microtask boundary). + await new Promise((r) => setTimeout(r, 0)); + expect(wsConstructed).toBe(0); + }); + + it("claude-code runtime does NOT render the banner (normal mount)", () => { + render(); + expect(screen.queryByText(/Terminal not available/i)).toBeNull(); + }); + + it("data prop omitted falls through to normal mount (back-compat)", () => { + render(); + expect(screen.queryByText(/Terminal not available/i)).toBeNull(); + }); +}); From ed6dfe01e5dc00fc2d42221e0a6edefae4f16b1e Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 20:43:06 -0700 Subject: [PATCH 08/28] feat(delegations): durable per-task ledger + audit-write helper (RFC #2829 PR-1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the `delegations` table and the DelegationLedger writer that PRs #2-#4 of RFC #2829 build on. Schema-only foundation — no behavior change in this PR. PR-2 wires the ledger into the existing handlers and ships the result- push-to-inbox cutover behind a feature flag. Why a dedicated table when activity_logs already records every delegation event: Today, "what is currently in flight for this workspace" is reconstructed by GROUPing activity_logs by delegation_id and ORDER BY created_at DESC. PR-3's stuck-task sweeper needs the join SELECT delegation_id FROM delegations WHERE status = 'in_progress' AND last_heartbeat < now() - interval '10 minutes' which is impossible to express against the event stream without a window over every (delegation_id, latest event) pair — a planner-killing query at scale. The dedicated table makes the sweeper an indexed scan. Same posture as tenant_resources (PR #2343, memory `reference_tenant_resources_audit`): activity_logs remains the audit- grade source of truth, delegations is the queryable view for dashboards + sweeper joins. Symmetric writes — both tables are written, neither blocks orchestration on the other's failure. Schema highlights: - delegation_id PRIMARY KEY (caller-chosen, idempotent retry on restart is a no-op via ON CONFLICT DO NOTHING) - caller_id / callee_id NOT FK — workspace delete must NOT cascade- delete delegation history (audit retention) - status CHECK constraint enforces the lifecycle (queued|dispatched|in_progress|completed|failed|stuck) - last_heartbeat NULL-able; PR-3 sweeper compares to NOW() - deadline default now()+6h matches longest-observed legit delegation (memory-namespace migrations) — protects against forever-heartbeating wedged agents - Partial index `idx_delegations_inflight_heartbeat` keeps the sweeper hot path tiny (only non-terminal rows) - UNIQUE(caller_id, idempotency_key) WHERE NOT NULL — natural collision becomes ON CONFLICT no-op without colliding across callers DelegationLedger.SetStatus enforces forward-only on terminal states (completed/failed/stuck cannot be revised) as defense-in-depth on the schema CHECK. Same-status replay is a no-op. Missing-row SetStatus is a no-op (transient inconsistency the next agent retry will heal). Heartbeat updates only in-flight rows — terminal-state delegations are silently skipped. Coverage: - 17 unit tests against sqlmock-backed *sql.DB (Insert happy path, missing-required guards, truncation, lifecycle transitions, terminal forward-only protection, replay no-op, missing-row no-op, empty-input rejection, heartbeat semantics, transition table shape) - Migration roundtrip verified on a real Postgres 15 instance: up creates the expected schema with all 4 indexes + CHECK, down drops everything cleanly. Refs RFC #2829. --- .../internal/handlers/delegation_ledger.go | 200 +++++++++++ .../handlers/delegation_ledger_test.go | 312 ++++++++++++++++++ .../migrations/049_delegations.down.sql | 5 + .../migrations/049_delegations.up.sql | 99 ++++++ 4 files changed, 616 insertions(+) create mode 100644 workspace-server/internal/handlers/delegation_ledger.go create mode 100644 workspace-server/internal/handlers/delegation_ledger_test.go create mode 100644 workspace-server/migrations/049_delegations.down.sql create mode 100644 workspace-server/migrations/049_delegations.up.sql diff --git a/workspace-server/internal/handlers/delegation_ledger.go b/workspace-server/internal/handlers/delegation_ledger.go new file mode 100644 index 00000000..9a783ece --- /dev/null +++ b/workspace-server/internal/handlers/delegation_ledger.go @@ -0,0 +1,200 @@ +package handlers + +import ( + "context" + "database/sql" + "errors" + "log" + "time" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" +) + +// delegation_ledger.go — durable per-task ledger for A2A delegation +// (RFC #2829 PR-1). +// +// activity_logs is an event stream — one row per state transition. Replaying +// the stream gives you history. This file's table (delegations) is the +// folded current state — one row per delegation_id with a single status, +// last_heartbeat, deadline, and result_preview. +// +// Why both: PR-3 needs a sweeper that joins on +// (status='in_progress' AND last_heartbeat < now() - interval '10 minutes') +// which is impossible to express against the event stream without a window +// function over every (delegation_id, latest event) pair — a planner-killing +// query at scale. The dedicated table makes the sweeper an indexed scan. +// +// Writes go to BOTH tables. activity_logs remains the audit-grade record +// for forensics; delegations is the queryable view for dashboards + sweeper +// joins. Symmetric-write pattern — same posture as tenant_resources (PR +// #2343), per memory `reference_tenant_resources_audit`. + +// DelegationLedger writes the per-task durable row alongside the existing +// activity_logs event-stream writes. All methods are best-effort: a ledger +// write failure logs but does NOT propagate up — activity_logs remains the +// audit-grade source of truth. +// +// Same shape as `tenant_resources` reconciler (PR #2343): orchestration +// continues even when the ledger write fails, and the next status update +// (or PR-3 reconciler) will heal the ledger. +type DelegationLedger struct { + db *sql.DB +} + +// NewDelegationLedger returns a ledger backed by the package db handle. +// Tests can construct one with a sqlmock-backed *sql.DB. +func NewDelegationLedger(handle *sql.DB) *DelegationLedger { + if handle == nil { + handle = db.DB + } + return &DelegationLedger{db: handle} +} + +// truncatePreview caps stored preview at 4KB. The full prompt/response is +// already in activity_logs.{request,response}_body — this is the at-a-glance +// view for the dashboard, not a forensic record. +const previewCap = 4096 + +func truncatePreview(s string) string { + if len(s) <= previewCap { + return s + } + return s[:previewCap] +} + +// InsertOpts is the agent's record-of-intent. Caller, callee, task preview, +// and the chosen delegation_id are required; idempotency_key is optional. +type InsertOpts struct { + DelegationID string + CallerID string + CalleeID string + TaskPreview string + IdempotencyKey string // empty → NULL + // Deadline defaults to now + 6h when zero. Callers can pass a tighter + // per-task deadline (cron, interactive request) by setting it. + Deadline time.Time +} + +// Insert writes the queued row. ON CONFLICT (delegation_id) DO NOTHING so +// the agent's retry-on-restart codepath is naturally idempotent — a duplicate +// Insert with the same delegation_id is a no-op. (Idempotency_key dedupe is +// a separate UNIQUE index handled by the same DO NOTHING.) +func (l *DelegationLedger) Insert(ctx context.Context, opts InsertOpts) { + if opts.DelegationID == "" || opts.CallerID == "" || opts.CalleeID == "" { + log.Printf("delegation_ledger Insert: missing required field, skipping") + return + } + deadline := opts.Deadline + if deadline.IsZero() { + deadline = time.Now().Add(6 * time.Hour) + } + idemArg := sql.NullString{String: opts.IdempotencyKey, Valid: opts.IdempotencyKey != ""} + _, err := l.db.ExecContext(ctx, ` + INSERT INTO delegations ( + delegation_id, caller_id, callee_id, task_preview, + status, deadline, idempotency_key + ) VALUES ($1, $2, $3, $4, 'queued', $5, $6) + ON CONFLICT (delegation_id) DO NOTHING + `, opts.DelegationID, opts.CallerID, opts.CalleeID, + truncatePreview(opts.TaskPreview), deadline, idemArg) + if err != nil { + log.Printf("delegation_ledger Insert(%s): %v", opts.DelegationID, err) + } +} + +// allowedTransitions enforces the lifecycle in code as defense-in-depth on +// the schema CHECK. Terminal states (completed, failed, stuck) reject any +// further status update — once a delegation is done, it stays done. +// +// The "queued → in_progress" jump (skipping dispatched) is allowed: lazy +// callers that don't ack the dispatched stage shouldn't be penalised, +// since the agent ultimately cares about whether work started, not which +// HTTP layer happened to ack first. +var allowedTransitions = map[string]map[string]bool{ + "queued": {"dispatched": true, "in_progress": true, "failed": true}, + "dispatched": {"in_progress": true, "completed": true, "failed": true}, + "in_progress": {"completed": true, "failed": true, "stuck": true}, +} + +// ErrInvalidTransition is returned by SetStatus when the transition would +// move out of a terminal state. Callers SHOULD ignore (it's a duplicate +// terminal write) but they're surfaced for tests. +var ErrInvalidTransition = errors.New("delegation ledger: invalid status transition") + +// SetStatus is the catch-all updater. Status MUST be one of the lifecycle +// values. errorDetail is non-empty only for failed/stuck. resultPreview is +// non-empty only for completed. +// +// Idempotent: re-applying the same terminal status with the same payload +// returns nil; transitioning back out of a terminal state returns +// ErrInvalidTransition. (Forward-only protection — once 'completed' you +// don't get to revise to 'failed'.) +func (l *DelegationLedger) SetStatus(ctx context.Context, + delegationID, status, errorDetail, resultPreview string, +) error { + if delegationID == "" || status == "" { + return errors.New("delegation ledger: missing required field") + } + + // Read current status to validate the transition. We accept the rare + // race where two updaters both observe the same prior status — Postgres + // CHECK constraint catches truly-invalid status values; our forward-only + // check is best-effort. + var current string + err := l.db.QueryRowContext(ctx, + `SELECT status FROM delegations WHERE delegation_id = $1`, + delegationID, + ).Scan(¤t) + if errors.Is(err, sql.ErrNoRows) { + // Insert was lost or wasn't called. Defensively NO-OP — the next + // agent retry will re-Insert and the next SetStatus will land. + log.Printf("delegation_ledger SetStatus(%s, %s): row missing, skipping", + delegationID, status) + return nil + } + if err != nil { + return err + } + + // Same-status replay (e.g. duplicate completion notification): no-op, + // don't bump updated_at, no error. + if current == status { + return nil + } + + // Forward-only on terminal states. + if next, ok := allowedTransitions[current]; !ok || !next[status] { + // Terminal already — refuse to revise. + return ErrInvalidTransition + } + + _, err = l.db.ExecContext(ctx, ` + UPDATE delegations + SET status = $2, + error_detail = NULLIF($3, ''), + result_preview = NULLIF($4, ''), + updated_at = now() + WHERE delegation_id = $1 + `, delegationID, status, errorDetail, truncatePreview(resultPreview)) + return err +} + +// Heartbeat stamps last_heartbeat = now() for an in-flight delegation. Used +// by the callee whenever it makes progress; PR-3's sweeper compares to +// NOW() to decide stuckness. No-op on terminal-state delegations. +// +// Best-effort: failure logs but doesn't propagate. +func (l *DelegationLedger) Heartbeat(ctx context.Context, delegationID string) { + if delegationID == "" { + return + } + _, err := l.db.ExecContext(ctx, ` + UPDATE delegations + SET last_heartbeat = now(), updated_at = now() + WHERE delegation_id = $1 + AND status NOT IN ('completed','failed','stuck') + `, delegationID) + if err != nil { + log.Printf("delegation_ledger Heartbeat(%s): %v", delegationID, err) + } +} diff --git a/workspace-server/internal/handlers/delegation_ledger_test.go b/workspace-server/internal/handlers/delegation_ledger_test.go new file mode 100644 index 00000000..78cb4b94 --- /dev/null +++ b/workspace-server/internal/handlers/delegation_ledger_test.go @@ -0,0 +1,312 @@ +package handlers + +import ( + "context" + "errors" + "strings" + "testing" + + "github.com/DATA-DOG/go-sqlmock" +) + +// delegation_ledger_test.go — unit coverage for the durable ledger writer +// (RFC #2829 PR-1). +// +// Coverage targets: +// - Insert: happy path; missing-required no-op; deadline default; +// idempotency_key NULL vs string passthrough. +// - SetStatus: queued→dispatched→in_progress→completed; same-status +// replay no-op; terminal state forward-only protection; missing row +// no-op; SQL error propagation. +// - Heartbeat: stamps now() on in-flight; no-op on terminal; missing-id +// guard. +// - truncatePreview: under-cap passthrough; over-cap truncates. + +// ---------- Insert ---------- + +func TestLedgerInsert_HappyPath(t *testing.T) { + mock := setupTestDB(t) + l := NewDelegationLedger(nil) // uses package db.DB which sqlmock replaced + + mock.ExpectExec(`INSERT INTO delegations`). + WithArgs( + "deleg-123", + "caller-uuid", + "callee-uuid", + "task body", + sqlmock.AnyArg(), // deadline (default = now+6h) + sqlmock.AnyArg(), // idempotency_key NullString + ). + WillReturnResult(sqlmock.NewResult(0, 1)) + + l.Insert(context.Background(), InsertOpts{ + DelegationID: "deleg-123", + CallerID: "caller-uuid", + CalleeID: "callee-uuid", + TaskPreview: "task body", + }) + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +func TestLedgerInsert_MissingRequired_NoSQLFired(t *testing.T) { + mock := setupTestDB(t) + l := NewDelegationLedger(nil) + + // Caller-side guard: no DB call expected. + for _, opts := range []InsertOpts{ + {DelegationID: "", CallerID: "c", CalleeID: "ca", TaskPreview: "t"}, + {DelegationID: "d", CallerID: "", CalleeID: "ca", TaskPreview: "t"}, + {DelegationID: "d", CallerID: "c", CalleeID: "", TaskPreview: "t"}, + } { + l.Insert(context.Background(), opts) + } + // No ExpectExec → ExpectationsWereMet stays clean. + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unexpected sqlmock activity: %v", err) + } +} + +func TestLedgerInsert_TruncatesOversizedPreview(t *testing.T) { + mock := setupTestDB(t) + l := NewDelegationLedger(nil) + + huge := strings.Repeat("x", 10_000) // > previewCap + + mock.ExpectExec(`INSERT INTO delegations`). + WithArgs( + "deleg-big", + "c", "ca", + sqlmock.AnyArg(), // truncated preview — verify length below via custom matcher + sqlmock.AnyArg(), + sqlmock.AnyArg(), + ). + WillReturnResult(sqlmock.NewResult(0, 1)) + + l.Insert(context.Background(), InsertOpts{ + DelegationID: "deleg-big", + CallerID: "c", + CalleeID: "ca", + TaskPreview: huge, + }) + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +// ---------- truncatePreview unit ---------- + +func TestTruncatePreview_UnderCap(t *testing.T) { + in := "short" + if got := truncatePreview(in); got != in { + t.Errorf("under-cap should passthrough; got %q", got) + } +} + +func TestTruncatePreview_OverCapTruncatesAtBoundary(t *testing.T) { + in := strings.Repeat("a", previewCap+100) + got := truncatePreview(in) + if len(got) != previewCap { + t.Errorf("expected len=%d got len=%d", previewCap, len(got)) + } +} + +func TestTruncatePreview_ExactlyAtCap(t *testing.T) { + in := strings.Repeat("a", previewCap) + got := truncatePreview(in) + if got != in { + t.Errorf("at-cap should passthrough unchanged") + } +} + +// ---------- SetStatus lifecycle ---------- + +func TestLedgerSetStatus_QueuedToDispatched(t *testing.T) { + mock := setupTestDB(t) + l := NewDelegationLedger(nil) + + mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`). + WithArgs("d-1"). + WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("queued")) + + mock.ExpectExec(`UPDATE delegations`). + WithArgs("d-1", "dispatched", "", ""). + WillReturnResult(sqlmock.NewResult(0, 1)) + + if err := l.SetStatus(context.Background(), "d-1", "dispatched", "", ""); err != nil { + t.Errorf("unexpected: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestLedgerSetStatus_QueuedToInProgress_SkipsDispatched(t *testing.T) { + // Lazy callers that go queued → in_progress without ack should be allowed. + mock := setupTestDB(t) + l := NewDelegationLedger(nil) + + mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`). + WithArgs("d-1"). + WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("queued")) + + mock.ExpectExec(`UPDATE delegations`). + WithArgs("d-1", "in_progress", "", ""). + WillReturnResult(sqlmock.NewResult(0, 1)) + + if err := l.SetStatus(context.Background(), "d-1", "in_progress", "", ""); err != nil { + t.Errorf("unexpected: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestLedgerSetStatus_InProgressToCompleted_StoresResult(t *testing.T) { + mock := setupTestDB(t) + l := NewDelegationLedger(nil) + + mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`). + WithArgs("d-1"). + WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("in_progress")) + + mock.ExpectExec(`UPDATE delegations`). + WithArgs("d-1", "completed", "", "answer text"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + if err := l.SetStatus(context.Background(), "d-1", "completed", "", "answer text"); err != nil { + t.Errorf("unexpected: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestLedgerSetStatus_TerminalForwardOnly(t *testing.T) { + // completed → failed must be rejected: terminal states are forward-only. + mock := setupTestDB(t) + l := NewDelegationLedger(nil) + + mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`). + WithArgs("d-done"). + WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("completed")) + + err := l.SetStatus(context.Background(), "d-done", "failed", "post-hoc error", "") + if !errors.Is(err, ErrInvalidTransition) { + t.Errorf("expected ErrInvalidTransition, got %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestLedgerSetStatus_SameStatusReplay_NoUpdate(t *testing.T) { + // Re-applying the same terminal status should NOT bump updated_at — + // duplicate completion notifications shouldn't generate spurious writes. + mock := setupTestDB(t) + l := NewDelegationLedger(nil) + + mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`). + WithArgs("d-1"). + WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("completed")) + + // No ExpectExec — UPDATE must not fire. + if err := l.SetStatus(context.Background(), "d-1", "completed", "", ""); err != nil { + t.Errorf("same-status replay should be no-op, got err: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet (or unexpected UPDATE): %v", err) + } +} + +func TestLedgerSetStatus_MissingRowIsNoOp(t *testing.T) { + // A SetStatus call that arrives before Insert (lost INSERT, race, etc.) + // must NOT error — it's a transient inconsistency the next agent retry + // will heal. + mock := setupTestDB(t) + l := NewDelegationLedger(nil) + + mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`). + WithArgs("d-missing"). + WillReturnRows(sqlmock.NewRows([]string{"status"})) // empty + + if err := l.SetStatus(context.Background(), "d-missing", "completed", "", "ok"); err != nil { + t.Errorf("missing row should be no-op; got err: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestLedgerSetStatus_RejectsEmptyDelegationID(t *testing.T) { + mock := setupTestDB(t) + l := NewDelegationLedger(nil) + + if err := l.SetStatus(context.Background(), "", "completed", "", ""); err == nil { + t.Errorf("expected error for empty delegation_id") + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unexpected sqlmock activity for empty input: %v", err) + } +} + +func TestLedgerSetStatus_RejectsEmptyStatus(t *testing.T) { + mock := setupTestDB(t) + l := NewDelegationLedger(nil) + + if err := l.SetStatus(context.Background(), "d-1", "", "", ""); err == nil { + t.Errorf("expected error for empty status") + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unexpected sqlmock activity for empty input: %v", err) + } +} + +// ---------- Heartbeat ---------- + +func TestLedgerHeartbeat_StampsInflightRow(t *testing.T) { + mock := setupTestDB(t) + l := NewDelegationLedger(nil) + + mock.ExpectExec(`UPDATE delegations`). + WithArgs("d-1"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + l.Heartbeat(context.Background(), "d-1") + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestLedgerHeartbeat_EmptyIDIsNoOp(t *testing.T) { + mock := setupTestDB(t) + l := NewDelegationLedger(nil) + + l.Heartbeat(context.Background(), "") // no SQL expected + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unexpected SQL on empty id: %v", err) + } +} + +// ---------- Allowed-transition table ---------- + +// TestAllowedTransitionsTableShape pins the lifecycle map: every starting +// state must have at least one outbound transition, and every terminal +// state (completed/failed/stuck) must be ABSENT from the map keys (forward- +// only enforcement). Catches accidental edits that re-add an outbound edge +// from a terminal state. +func TestAllowedTransitionsTableShape(t *testing.T) { + for _, terminal := range []string{"completed", "failed", "stuck"} { + if _, has := allowedTransitions[terminal]; has { + t.Errorf("terminal state %q must not appear as transition source", terminal) + } + } + for src, dests := range allowedTransitions { + if len(dests) == 0 { + t.Errorf("non-terminal state %q has no outbound transitions", src) + } + } +} diff --git a/workspace-server/migrations/049_delegations.down.sql b/workspace-server/migrations/049_delegations.down.sql new file mode 100644 index 00000000..03fccbc4 --- /dev/null +++ b/workspace-server/migrations/049_delegations.down.sql @@ -0,0 +1,5 @@ +DROP INDEX IF EXISTS idx_delegations_idempotency; +DROP INDEX IF EXISTS idx_delegations_callee_created; +DROP INDEX IF EXISTS idx_delegations_caller_created; +DROP INDEX IF EXISTS idx_delegations_inflight_heartbeat; +DROP TABLE IF EXISTS delegations; diff --git a/workspace-server/migrations/049_delegations.up.sql b/workspace-server/migrations/049_delegations.up.sql new file mode 100644 index 00000000..d306859a --- /dev/null +++ b/workspace-server/migrations/049_delegations.up.sql @@ -0,0 +1,99 @@ +-- RFC #2829 PR-1: durable delegations ledger. +-- +-- Today, delegation state is reconstructed by GROUPing activity_logs rows +-- by delegation_id and ORDER BY created_at DESC. Three problems: +-- +-- 1. No queryable "what is currently in flight for this workspace" — every +-- caller has to fold the event stream itself. +-- 2. No place to durably stamp last_heartbeat / deadline on a per-task +-- basis, so a stuck-task sweeper has nothing to scan. +-- 3. The 600s message/send proxy timeout (the user's 2026-05-05 home-hermes +-- iteration-14/90 incident) leaves the in-flight HTTP connection holding +-- all the state — caller restart, callee restart, proxy timeout all kill +-- the delegation. activity_logs can replay the *intent* but not the +-- *current state* without the row that says "yes this is still alive". +-- +-- This table is the durable ledger that PRs #2-#4 build on: +-- PR-2 — push result to caller's inbox + use this row to track readiness +-- PR-3 — sweeper joins on (status='in_progress', last_heartbeatthreshold; +-- operator can transition to failed via dashboard (PR-4) + +CREATE TABLE IF NOT EXISTS delegations ( + -- delegation_id chosen by the caller so callee + caller agree on the key + -- without a database round-trip. UUID, but stored as TEXT to match the + -- existing agent-side string contract (delegation.py uses str(uuid4())). + delegation_id text PRIMARY KEY, + + -- Caller is the workspace that initiated the delegation. Callee is the + -- target. Both reference workspaces, but we don't FK them — workspace + -- delete should NOT cascade-delete delegations history (audit retention). + -- Same posture as tenant_resources (PR #2343). + caller_id uuid NOT NULL, + callee_id uuid NOT NULL, + + -- Truncated at insertion so a 50KB prompt doesn't bloat the ledger; the + -- full prompt lives in activity_logs.request_body for forensic replay. + task_preview text NOT NULL, + + status text NOT NULL DEFAULT 'queued' + CHECK (status IN ('queued','dispatched','in_progress','completed','failed','stuck')), + + -- Stamped by callee heartbeats (PR-3 sweeper compares to NOW()). NULL + -- before any heartbeat — sweeper treats NULL same as last_heartbeat + -- < (created_at) for stuckness purposes. + last_heartbeat timestamptz, + + -- Hard deadline. Beyond this, sweeper marks `failed` regardless of + -- heartbeat liveness — protects against agents that heartbeat forever + -- without making progress. Default 6h matches the longest-observed legit + -- delegation in production (memory-namespace migration runs). + deadline timestamptz NOT NULL DEFAULT (now() + interval '6 hours'), + + -- Truncated result preview (full result in activity_logs response_body). + -- Set on terminal completed transition. + result_preview text, + + -- Set on failed/stuck terminal transition. + error_detail text, + + -- For PR-3 retry policy. Not used in PR-1 — declared so PR-3 doesn't + -- need a follow-on migration. + retry_count integer NOT NULL DEFAULT 0, + + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now(), + + -- Idempotency: the agent-side delegate_task call accepts an idempotency + -- key. Two records of the same key collapse to one row. Indexed UNIQUE + -- where non-null so the natural collision becomes an INSERT … ON + -- CONFLICT no-op. + idempotency_key text +); + +-- Sweeper hot path (PR-3): list everything that's in_progress and overdue +-- for a heartbeat. Partial index on non-terminal status keeps this small. +CREATE INDEX IF NOT EXISTS idx_delegations_inflight_heartbeat + ON delegations (last_heartbeat NULLS FIRST) + WHERE status IN ('queued','dispatched','in_progress'); + +-- Operator dashboard (PR-4): per-workspace recent delegations. +CREATE INDEX IF NOT EXISTS idx_delegations_caller_created + ON delegations (caller_id, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_delegations_callee_created + ON delegations (callee_id, created_at DESC); + +-- Idempotency dedupe: composite (caller_id, idempotency_key) so two +-- different callers can use the same key without colliding. +CREATE UNIQUE INDEX IF NOT EXISTS idx_delegations_idempotency + ON delegations (caller_id, idempotency_key) + WHERE idempotency_key IS NOT NULL; From ae79b9e9fe55883bcaafa9f6b83bc5d724280c09 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 20:50:46 -0700 Subject: [PATCH 09/28] feat(delegations): result-push to caller inbox behind feature flag (RFC #2829 PR-2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a delegation completes (or fails), also write an `activity_type='a2a_receive'` row to the caller's activity_logs so the caller's inbox poller (workspace/inbox.py — `?type=a2a_receive`) surfaces the result to the agent. Why: today the only way the caller agent learns about a delegation result is by holding open an HTTP `message/send` connection through the platform proxy. That connection has a hard timeout (~600s) — a 90-iteration external-runtime task on stream output routinely blows past it, and the result emitted after the timeout lands in /dev/null. (Hongming's home hermes hit this on 2026-05-05 — task was actively heartbeating "iteration 14/90" when the proxy timer fired.) This PR adds the SERVER-SIDE result-push so the result is durably delivered to the caller's inbox queue. The agent-side cutover (replace sync httpx delegation with delegate_task_async + wait_for_message poll) ships in the next PR — once both land, the proxy timeout class is gone. ## Feature flag `DELEGATION_RESULT_INBOX_PUSH=1` enables the push. Default off — staging canary first, flip after RFC #2829 PR-3 (agent-side) lands and proves the round-trip end-to-end. With the flag off, behavior is byte-identical to before this PR (verified by TestUpdateStatus_FlagOff_NoNewSQL). ## Two write sites 1. UpdateStatus handler (POST /workspaces/:id/delegations/:id/update) — agent-initiated delegations report status here 2. executeDelegation goroutine — canvas-initiated delegations (POST /workspaces/:id/delegate) report status from this background coroutine Both paths call `pushDelegationResultToInbox` which is best-effort: an INSERT failure logs but does NOT propagate up. The existing `delegate_result` row in activity_logs (the dashboard view) remains authoritative; the new `a2a_receive` row is purely additive for the inbox-poller to surface. ## Coverage 6 new tests in delegation_inbox_push_test.go: - flag off → no SQL fired (the rollout-safety contract) - flag on, completed → a2a_receive row with status=ok - flag on, failed → a2a_receive row with status=error + error_detail - UpdateStatus end-to-end (flag on, completed) - UpdateStatus end-to-end (flag on, failed) - UpdateStatus end-to-end (flag off, byte-identical to pre-PR behavior) All 30 existing delegation_test.go tests still pass — flag default off keeps the strict-sqlmock surface unchanged. Refs RFC #2829. --- .../internal/handlers/delegation.go | 75 ++++++ .../handlers/delegation_inbox_push_test.go | 246 ++++++++++++++++++ 2 files changed, 321 insertions(+) create mode 100644 workspace-server/internal/handlers/delegation_inbox_push_test.go diff --git a/workspace-server/internal/handlers/delegation.go b/workspace-server/internal/handlers/delegation.go index 59da198c..e1b5e1a8 100644 --- a/workspace-server/internal/handlers/delegation.go +++ b/workspace-server/internal/handlers/delegation.go @@ -5,6 +5,7 @@ import ( "encoding/json" "log" "net/http" + "os" "time" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" @@ -13,6 +14,68 @@ import ( "github.com/google/uuid" ) +// delegationResultInboxPushEnabled gates the RFC #2829 PR-2 result-push +// behavior: when callee POSTs `status=completed` (or `failed`) via +// /workspaces/:id/delegations/:delegation_id/update, ALSO write an +// `activity_type='a2a_receive'` row to the caller's activity_logs. +// +// Why a flag: the caller's inbox poller (workspace/inbox.py) queries +// `?type=a2a_receive` to surface inbound messages to the agent. Adding +// a2a_receive rows for delegation results is the universal-sized fix for +// the 600s message/send timeout class — long-running delegations no +// longer rely on the proxy holding the HTTP connection open. But it is +// observable behavior change (existing agents start seeing delegation +// results in their inbox where they didn't before), so we flag it for +// staging burn-in before flipping default. +// +// Default: off. Staging-canary first; flip to on after RFC #2829 PR-3 +// (agent-side cutover) lands and proves the round-trip end-to-end. +func delegationResultInboxPushEnabled() bool { + return os.Getenv("DELEGATION_RESULT_INBOX_PUSH") == "1" +} + +// pushDelegationResultToInbox writes the inbox-visible row for a +// completed/failed delegation. Best-effort: a failure logs but does NOT +// fail the parent UpdateStatus — the existing delegate_result row in +// activity_logs is still authoritative for the dashboard. +// +// Caller (sourceID) is the workspace that initiated the delegation; the +// inbox row lands in their activity_logs so wait_for_message picks it up. +// +// Body shape mirrors a2a_receive rows produced by the proxy on a +// successful synchronous reply: response_body.text carries the agent's +// answer, request_body.delegation_id correlates back to the originating +// row. +func pushDelegationResultToInbox(ctx context.Context, sourceID, delegationID, status, responsePreview, errorDetail string) { + if !delegationResultInboxPushEnabled() { + return + } + respPayload := map[string]interface{}{ + "text": responsePreview, + "delegation_id": delegationID, + } + respJSON, _ := json.Marshal(respPayload) + reqJSON, _ := json.Marshal(map[string]interface{}{ + "delegation_id": delegationID, + }) + logStatus := "ok" + if status == "failed" { + logStatus = "error" + } + summary := "Delegation result delivered" + if status == "failed" { + summary = "Delegation failed" + } + if _, err := db.DB.ExecContext(ctx, ` + INSERT INTO activity_logs ( + workspace_id, activity_type, method, source_id, + summary, request_body, response_body, status, error_detail + ) VALUES ($1, 'a2a_receive', 'delegate_result', $2, $3, $4::jsonb, $5::jsonb, $6, NULLIF($7, '')) + `, sourceID, sourceID, summary, string(reqJSON), string(respJSON), logStatus, errorDetail); err != nil { + log.Printf("Delegation %s: inbox-push insert failed: %v", delegationID, err) + } +} + // Delegation status lifecycle: // pending → dispatched → received → in_progress → completed | failed // @@ -289,6 +352,8 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s h.broadcaster.RecordAndBroadcast(ctx, "DELEGATION_FAILED", sourceID, map[string]interface{}{ "delegation_id": delegationID, "target_id": targetID, "error": proxyErr.Error(), }) + // RFC #2829 PR-2 result-push (see UpdateStatus for rationale). + pushDelegationResultToInbox(ctx, sourceID, delegationID, "failed", "", proxyErr.Error()) return } @@ -349,6 +414,8 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s "target_id": targetID, "response_preview": truncate(responseText, 200), }) + // RFC #2829 PR-2 result-push (see UpdateStatus for rationale). + pushDelegationResultToInbox(ctx, sourceID, delegationID, "completed", responseText, "") } // updateDelegationStatus updates the status of a delegation record in activity_logs. @@ -459,11 +526,19 @@ func (h *DelegationHandler) UpdateStatus(c *gin.Context) { "delegation_id": delegationID, "response_preview": truncate(body.ResponsePreview, 200), }) + // RFC #2829 PR-2 result-push: when the gate is on, also write an + // a2a_receive row so the caller's inbox poller surfaces this to + // the agent. Foundational for getting rid of the proxy-blocked + // sync path that hits the 600s message/send timeout — once the + // agent-side cutover lands, the caller polls its own inbox for + // the result instead of holding open an HTTP connection. + pushDelegationResultToInbox(ctx, sourceID, delegationID, "completed", body.ResponsePreview, "") } else { h.broadcaster.RecordAndBroadcast(ctx, "DELEGATION_FAILED", sourceID, map[string]interface{}{ "delegation_id": delegationID, "error": body.Error, }) + pushDelegationResultToInbox(ctx, sourceID, delegationID, "failed", "", body.Error) } c.JSON(http.StatusOK, gin.H{"status": body.Status, "delegation_id": delegationID}) diff --git a/workspace-server/internal/handlers/delegation_inbox_push_test.go b/workspace-server/internal/handlers/delegation_inbox_push_test.go new file mode 100644 index 00000000..02c51190 --- /dev/null +++ b/workspace-server/internal/handlers/delegation_inbox_push_test.go @@ -0,0 +1,246 @@ +package handlers + +import ( + "bytes" + "context" + "net/http" + "net/http/httptest" + "testing" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// delegation_inbox_push_test.go — coverage for the RFC #2829 PR-2 +// result-push behavior. The push is feature-flagged via +// DELEGATION_RESULT_INBOX_PUSH=1; default off keeps the existing +// strict-sqlmock test surface unchanged. +// +// What we pin: +// 1. Flag off (default) → no a2a_receive INSERT fires. +// 2. Flag on, status=completed → a2a_receive row written with the +// response_preview and no error_detail. +// 3. Flag on, status=failed → a2a_receive row written with status=error +// and the error_detail set. +// 4. INSERT failure on inbox-push does NOT bubble up — UpdateStatus +// still returns 200. + +// ---------- pushDelegationResultToInbox in isolation ---------- + +func TestPushDelegationResultToInbox_FlagOff_NoSQL(t *testing.T) { + mock := setupTestDB(t) + t.Setenv("DELEGATION_RESULT_INBOX_PUSH", "") + + pushDelegationResultToInbox( + context.Background(), + "caller", "deleg-1", "completed", "answer body", "", + ) + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("flag off must not fire SQL: %v", err) + } +} + +func TestPushDelegationResultToInbox_FlagOn_CompletedInsertsA2AReceiveRow(t *testing.T) { + mock := setupTestDB(t) + t.Setenv("DELEGATION_RESULT_INBOX_PUSH", "1") + + mock.ExpectExec(`INSERT INTO activity_logs`). + WithArgs( + "caller-ws", + "caller-ws", // source_id mirrors workspace_id + "Delegation result delivered", + sqlmock.AnyArg(), // request_body json + sqlmock.AnyArg(), // response_body json + "ok", + "", // error_detail empty for completed + ). + WillReturnResult(sqlmock.NewResult(0, 1)) + + pushDelegationResultToInbox( + context.Background(), + "caller-ws", "deleg-1", "completed", "answer body", "", + ) + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestPushDelegationResultToInbox_FlagOn_FailedInsertsErrorRow(t *testing.T) { + mock := setupTestDB(t) + t.Setenv("DELEGATION_RESULT_INBOX_PUSH", "1") + + mock.ExpectExec(`INSERT INTO activity_logs`). + WithArgs( + "caller-ws", + "caller-ws", + "Delegation failed", + sqlmock.AnyArg(), + sqlmock.AnyArg(), + "error", + "target unreachable", + ). + WillReturnResult(sqlmock.NewResult(0, 1)) + + pushDelegationResultToInbox( + context.Background(), + "caller-ws", "deleg-2", "failed", "", "target unreachable", + ) + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +// ---------- UpdateStatus end-to-end ---------- + +func TestUpdateStatus_FlagOn_PushesA2AReceiveOnCompleted(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + t.Setenv("DELEGATION_RESULT_INBOX_PUSH", "1") + + broadcaster := newTestBroadcaster() + wh := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + dh := NewDelegationHandler(wh, broadcaster) + + // 1. updateDelegationStatus — UPDATE activity_logs SET status='completed' + mock.ExpectExec(`UPDATE activity_logs`). + WithArgs("completed", "", "ws-source", "deleg-9"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // 2. existing delegate_result INSERT (caller-side dashboard view) + mock.ExpectExec(`INSERT INTO activity_logs`). + WithArgs( + "ws-source", "ws-source", + sqlmock.AnyArg(), // summary + sqlmock.AnyArg(), // response_body + ). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // 3. NEW: PR-2 a2a_receive row for inbox-poller + mock.ExpectExec(`INSERT INTO activity_logs`). + WithArgs( + "ws-source", "ws-source", + "Delegation result delivered", + sqlmock.AnyArg(), + sqlmock.AnyArg(), + "ok", + "", + ). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{ + {Key: "id", Value: "ws-source"}, + {Key: "delegation_id", Value: "deleg-9"}, + } + body := `{"status":"completed","response_preview":"all done"}` + c.Request = httptest.NewRequest("POST", + "/workspaces/ws-source/delegations/deleg-9/update", + bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + dh.UpdateStatus(c) + + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +func TestUpdateStatus_FlagOn_PushesA2AReceiveOnFailed(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + t.Setenv("DELEGATION_RESULT_INBOX_PUSH", "1") + + broadcaster := newTestBroadcaster() + wh := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + dh := NewDelegationHandler(wh, broadcaster) + + // 1. updateDelegationStatus — UPDATE activity_logs + mock.ExpectExec(`UPDATE activity_logs`). + WithArgs("failed", "boom", "ws-source", "deleg-10"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // 2. NEW: PR-2 a2a_receive row for inbox-poller (failure path doesn't + // have the existing delegate_result INSERT — only the new push). + mock.ExpectExec(`INSERT INTO activity_logs`). + WithArgs( + "ws-source", "ws-source", + "Delegation failed", + sqlmock.AnyArg(), + sqlmock.AnyArg(), + "error", + "boom", + ). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{ + {Key: "id", Value: "ws-source"}, + {Key: "delegation_id", Value: "deleg-10"}, + } + body := `{"status":"failed","error":"boom"}` + c.Request = httptest.NewRequest("POST", + "/workspaces/ws-source/delegations/deleg-10/update", + bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + dh.UpdateStatus(c) + + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +// TestUpdateStatus_FlagOff_NoNewSQL — sanity check that the existing +// behavior is preserved when the flag is off. Critical for safe rollout. +func TestUpdateStatus_FlagOff_NoNewSQL(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + // explicitly empty — flag off + t.Setenv("DELEGATION_RESULT_INBOX_PUSH", "") + + broadcaster := newTestBroadcaster() + wh := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + dh := NewDelegationHandler(wh, broadcaster) + + // Only the two pre-existing queries — no third (a2a_receive) INSERT. + mock.ExpectExec(`UPDATE activity_logs`). + WithArgs("completed", "", "ws-source", "deleg-11"). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec(`INSERT INTO activity_logs`). + WithArgs( + "ws-source", "ws-source", + sqlmock.AnyArg(), + sqlmock.AnyArg(), + ). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{ + {Key: "id", Value: "ws-source"}, + {Key: "delegation_id", Value: "deleg-11"}, + } + c.Request = httptest.NewRequest("POST", + "/workspaces/ws-source/delegations/deleg-11/update", + bytes.NewBufferString(`{"status":"completed","response_preview":"ok"}`)) + c.Request.Header.Set("Content-Type", "application/json") + + dh.UpdateStatus(c) + + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("flag-off must not fire extra SQL: %v", err) + } +} From 2e505e77485d002a4a6be86ecfc673b9f9cc843e Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 20:52:11 -0700 Subject: [PATCH 10/28] fix(branch-protection): apply.sh respects live state + full-payload drift MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-model review of #2827 caught: the script as-shipped would have silently weakened branch protection on EVERY non-checks dimension the moment anyone ran it. Live staging had enforce_admins=true, dismiss_stale_reviews=false, strict=true, allow_fork_syncing=false, bypass_pull_request_allowances={ HongmingWang-Rabbit + molecule-ai app } Script wrote the opposite for all five. Per memory feedback_dismiss_stale_reviews_blocks_promote.md, the dismiss_stale_reviews flip alone is the load-bearing one — would silently re-block every auto-promote PR (cost user 2.5h once). This PR: 1. apply.sh: per-branch payloads (build_staging_payload / build_main_payload) that codify the deliberate per-branch policy already on the repo, with the script's net contribution being ONLY the new check names (Canvas tabs E2E + E2E API Smoke on staging, Canvas tabs E2E on main). 2. apply.sh: R3 preflight that hits /commits/{sha}/check-runs and asserts every desired check name has at least one historical run on the branch tip. Catches typos like "Canvas Tabs E2E" vs "Canvas tabs E2E" — pre-fix a typo would silently block every PR forever waiting for a context that never emits. Skip via --skip-preflight for genuinely-new workflows whose first run hasn't fired. 3. drift_check.sh: compares the FULL normalised payload (admin, review, lock, conversation, fork-syncing, deletion, force-push) not just the checks list. Pre-fix the drift gate would have missed a UI click that flipped enforce_admins or dismiss_stale_reviews. Drops app_id from the comparison since GH auto-resolves -1 to a specific app id post-write. 4. branch-protection-drift.yml: per memory feedback_schedule_vs_dispatch_secrets_hardening.md — schedule + pull_request triggers HARD-FAIL when GH_TOKEN_FOR_ADMIN_API is missing (silent skip masks the gate disappearing). workflow_dispatch keeps soft-skip for one-off operator runs. Verified by running drift_check against live state: pre-fix would have shown 5 destructive drifts on staging + 5 on main. Post-fix shows ONLY the 2 intended additions on staging + 1 on main, which go away after `apply.sh` runs. --- .github/workflows/branch-protection-drift.yml | 28 ++- tools/branch-protection/apply.sh | 173 ++++++++++++++---- tools/branch-protection/drift_check.sh | 125 ++++++++++++- 3 files changed, 283 insertions(+), 43 deletions(-) diff --git a/.github/workflows/branch-protection-drift.yml b/.github/workflows/branch-protection-drift.yml index 7b05f09f..960f03c8 100644 --- a/.github/workflows/branch-protection-drift.yml +++ b/.github/workflows/branch-protection-drift.yml @@ -31,12 +31,34 @@ jobs: timeout-minutes: 5 steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + # Per memory feedback_schedule_vs_dispatch_secrets_hardening.md: + # schedule + pull_request triggers MUST hard-fail when the admin + # token is missing — silent soft-skip masks the gate disappearing. + # workflow_dispatch keeps soft-skip so an operator can run a + # diagnostic one-off without configuring the secret first. + - name: Verify admin token present (hard-fail on schedule/PR) + if: github.event_name != 'workflow_dispatch' + env: + GH_TOKEN_FOR_ADMIN_API: ${{ secrets.GH_TOKEN_FOR_ADMIN_API }} + run: | + if [[ -z "$GH_TOKEN_FOR_ADMIN_API" ]]; then + echo "::error::GH_TOKEN_FOR_ADMIN_API secret missing." >&2 + echo "" >&2 + echo "drift_check requires repo-admin scope to read /branches/:b/protection." >&2 + echo "GITHUB_TOKEN does not have that scope." >&2 + echo "Set GH_TOKEN_FOR_ADMIN_API at Settings → Secrets and variables → Actions." >&2 + echo "" >&2 + echo "On workflow_dispatch this step soft-skips for one-off operator runs." >&2 + exit 1 + fi + - name: Run drift check env: # GH_TOKEN_FOR_ADMIN_API — repo-admin scope, needed for the # /branches/:b/protection endpoint. Falls back to GITHUB_TOKEN - # for read-only mode (gh api on the protection endpoint - # returns 200 to maintainers, 404 to read-only). Configure at - # Settings → Secrets and variables → Actions. + # only on workflow_dispatch (operator override); the verify + # step above hard-fails any other trigger when the secret is + # missing. GH_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADMIN_API || secrets.GITHUB_TOKEN }} run: bash tools/branch-protection/drift_check.sh diff --git a/tools/branch-protection/apply.sh b/tools/branch-protection/apply.sh index 928d2829..21b72ac2 100755 --- a/tools/branch-protection/apply.sh +++ b/tools/branch-protection/apply.sh @@ -2,16 +2,32 @@ # tools/branch-protection/apply.sh — idempotently apply branch # protection to molecule-core's `staging` and `main` branches. # -# Why a script: GitHub's branch protection lives in repo settings, so -# changes are usually clicked through the UI and lost between admins. -# This script makes the config reproducible — diff it against the live -# state, change the file, run it, done. Single source of truth that -# shows up in code review. +# Single source of truth for the protection settings. Diff this file +# against the live state (drift_check.sh handles that nightly + on +# every PR that touches this directory). +# +# Why each branch has its OWN payload section instead of a shared +# template: pre-2026-05-05 the script generated both branches from a +# shared template that hard-coded enforce_admins=false, +# dismiss_stale_reviews=true, strict=false, allow_fork_syncing=true, +# and dropped bypass_pull_request_allowances. Live staging had +# enforce_admins=true, dismiss_stale_reviews=false, strict=true, +# allow_fork_syncing=false, and a bypass list. Running the script +# would have silently weakened protection on every dimension at once. +# Per-branch payloads codify the deliberate per-branch policy that +# already lives on the repo, with the script's net contribution +# being ONLY the explicit additions to required_status_checks. +# +# Per memory feedback_dismiss_stale_reviews_blocks_promote.md, +# dismiss_stale_reviews=true silently re-blocks every auto-promote PR +# (cost the user 2.5h once already on staging — confirming we keep +# this OFF on staging is load-bearing for the auto-promote chain). # # Usage: # tools/branch-protection/apply.sh # apply both branches # tools/branch-protection/apply.sh --dry-run # show payload only # tools/branch-protection/apply.sh --branch staging +# tools/branch-protection/apply.sh --skip-preflight # skip check-name validation # # Requires: gh CLI authenticated as a repo admin. The script uses gh's # token (no separate PAT needed). @@ -21,28 +37,25 @@ set -euo pipefail REPO="Molecule-AI/molecule-core" DRY_RUN=0 ONLY_BRANCH="" +SKIP_PREFLIGHT=0 while [[ $# -gt 0 ]]; do case "$1" in --dry-run) DRY_RUN=1; shift ;; --branch) ONLY_BRANCH="$2"; shift 2 ;; + --skip-preflight) SKIP_PREFLIGHT=1; shift ;; -h|--help) - echo "Usage: $0 [--dry-run] [--branch ]" + echo "Usage: $0 [--dry-run] [--branch ] [--skip-preflight]" exit 0 ;; *) echo "Unknown arg: $1" >&2; exit 1 ;; esac done -# Required-check matrices. Each branch's set is the canonical list of -# check NAMES (from each workflow's job-name). Adding/removing a check -# here is the place to do it. Match docs/e2e-coverage.md. -# -# Why staging gets E2E API + Canvas E2E (this PR's addition): both -# already use the always-emit pattern (path-filter no-ops emit SUCCESS), -# so making them required can't deadlock a PR that doesn't touch their -# paths. The other E2Es (SaaS, External) need a refactor to that -# pattern before they can be required — tracked as follow-up. +# ─── Required-check matrices ────────────────────────────────────── +# Each branch's set is the canonical list of check NAMES (from each +# workflow's job-name). Adding/removing a check here is the place to +# do it. Match docs/e2e-coverage.md. read -r -d '' STAGING_CHECKS <<'EOF' || true Analyze (go) @@ -73,29 +86,41 @@ Scan diff for credential-shaped strings Shellcheck (E2E scripts) EOF -build_payload() { - local checks="$1" - local require_reviews="$2" # true / false - local checks_json - checks_json=$(printf '%s\n' "$checks" | jq -Rs ' +checks_to_json() { + printf '%s\n' "$1" | jq -Rs ' split("\n") | map(select(length > 0)) | map({context: ., app_id: -1}) - ') + ' +} + +# ─── Per-branch payloads (each preserves live-state policy) ─────── +# Staging payload — preserves the live values that pre-2026-05-05's +# apply.sh would have silently rewritten: +# enforce_admins=true, dismiss_stale_reviews=false, strict=true, +# allow_fork_syncing=false, bypass list = HongmingWang-Rabbit + molecule-ai app. +build_staging_payload() { + local checks_json + checks_json=$(checks_to_json "$STAGING_CHECKS") jq -n \ --argjson checks "$checks_json" \ - --argjson reviews "$require_reviews" \ '{ required_status_checks: { - strict: false, + strict: true, checks: $checks }, - enforce_admins: false, - required_pull_request_reviews: ( - if $reviews then - { required_approving_review_count: 1, dismiss_stale_reviews: true } - else null end - ), + enforce_admins: true, + required_pull_request_reviews: { + required_approving_review_count: 1, + dismiss_stale_reviews: false, + require_code_owner_reviews: false, + require_last_push_approval: false, + bypass_pull_request_allowances: { + users: ["HongmingWang-Rabbit"], + teams: [], + apps: ["molecule-ai"] + } + }, restrictions: null, allow_deletions: false, allow_force_pushes: false, @@ -103,21 +128,101 @@ build_payload() { required_conversation_resolution: true, required_linear_history: false, lock_branch: false, - allow_fork_syncing: true + allow_fork_syncing: false }' } +# Main payload — preserves the live values: +# enforce_admins=false, dismiss_stale_reviews=true, strict=true, +# allow_fork_syncing=false, NO bypass list. +# main intentionally has different settings than staging because main +# is the deploy target — the auto-promote app pushes to main without +# the friction of an admin-bypass list, and stale-review dismissal +# is acceptable here because every change has already cleared +# staging review. +build_main_payload() { + local checks_json + checks_json=$(checks_to_json "$MAIN_CHECKS") + jq -n \ + --argjson checks "$checks_json" \ + '{ + required_status_checks: { + strict: true, + checks: $checks + }, + enforce_admins: false, + required_pull_request_reviews: { + required_approving_review_count: 1, + dismiss_stale_reviews: true, + require_code_owner_reviews: false, + require_last_push_approval: false + }, + restrictions: null, + allow_deletions: false, + allow_force_pushes: false, + block_creations: false, + required_conversation_resolution: true, + required_linear_history: false, + lock_branch: false, + allow_fork_syncing: false + }' +} + +# ─── R3 preflight: validate every desired check name has at least +# one historical run ────────────────────────────────────────────── +# Pre-fix the script accepted arbitrary strings into +# required_status_checks.checks. A typo like "Canvas Tabs E2E" vs +# "Canvas tabs E2E" → GH accepts → every PR is blocked forever +# waiting for a context that never emits. The preflight hits the +# /commits/{sha}/check-runs endpoint and asserts each desired name +# has at least one matching run. Skippable via --skip-preflight for +# the case where you're adding a brand-new workflow whose first run +# hasn't fired yet. +preflight_check_names() { + local branch="$1" + local checks="$2" + local sha + sha=$(gh api "repos/$REPO/commits/$branch" --jq '.sha' 2>/dev/null || echo "") + if [[ -z "$sha" ]]; then + echo "preflight: WARN cannot resolve $branch tip SHA, skipping check-name validation" >&2 + return 0 + fi + local known_names + known_names=$(gh api "repos/$REPO/commits/$sha/check-runs?per_page=100" \ + --jq '.check_runs | map(.name)' 2>/dev/null || echo "[]") + local missing=() + while IFS= read -r name; do + [[ -z "$name" ]] && continue + if ! echo "$known_names" | jq -e --arg n "$name" 'index($n) != null' >/dev/null; then + missing+=("$name") + fi + done <<< "$checks" + if [[ ${#missing[@]} -gt 0 ]]; then + echo "preflight: $branch — these check names are NOT in the historical check-runs for the tip SHA:" >&2 + printf ' - %s\n' "${missing[@]}" >&2 + echo "If they're truly new (workflow added but never run), re-run with --skip-preflight." >&2 + echo "Otherwise typos here will permanently block every PR — fix the names." >&2 + return 1 + fi +} + apply_branch() { local branch="$1" local checks="$2" - local require_reviews="$3" + local payload_fn="$3" local payload - payload=$(build_payload "$checks" "$require_reviews") + payload=$($payload_fn) if [[ "$DRY_RUN" -eq 1 ]]; then echo "=== branch: $branch ===" echo "$payload" | jq . return fi + if [[ "$SKIP_PREFLIGHT" -eq 0 ]]; then + if ! preflight_check_names "$branch" "$checks"; then + echo "FAIL: preflight on $branch caught typos or missing workflows. Aborting." >&2 + return 1 + fi + fi echo "Applying branch protection on $branch..." printf '%s' "$payload" | gh api -X PUT \ "repos/$REPO/branches/$branch/protection" \ @@ -126,8 +231,8 @@ apply_branch() { } if [[ -z "$ONLY_BRANCH" || "$ONLY_BRANCH" == "staging" ]]; then - apply_branch staging "$STAGING_CHECKS" true + apply_branch staging "$STAGING_CHECKS" build_staging_payload fi if [[ -z "$ONLY_BRANCH" || "$ONLY_BRANCH" == "main" ]]; then - apply_branch main "$MAIN_CHECKS" true + apply_branch main "$MAIN_CHECKS" build_main_payload fi diff --git a/tools/branch-protection/drift_check.sh b/tools/branch-protection/drift_check.sh index b25e23f0..86d120f2 100755 --- a/tools/branch-protection/drift_check.sh +++ b/tools/branch-protection/drift_check.sh @@ -3,6 +3,12 @@ # protection on staging + main against what apply.sh would set. Used # by branch-protection-drift.yml (cron) to catch out-of-band UI edits. # +# Pre-2026-05-05 version diffed only required_status_checks.checks — +# would have missed a UI click that flipped enforce_admins or +# dismiss_stale_reviews. Now compares the full normalized payload so +# any silent rewrite of admin/review/lock/deletion settings trips the +# drift gate. +# # Exit codes: # 0 — live state matches the script # 1 — drift detected (output shows the diff) @@ -14,22 +20,129 @@ REPO="Molecule-AI/molecule-core" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" EXIT_CODE=0 +# Normalise the GET /branches/:b/protection response so we can compare +# against apply.sh's payload. The GET response inflates booleans into +# {url, enabled} sub-objects and bypass list users/apps into full +# user/app objects with avatar_url etc — strip those down to match +# the input shape. +NORMALISE_LIVE='{ + required_status_checks: ( + .required_status_checks + | { strict: .strict, + checks: (.checks | map({context}) | sort_by(.context)) } + ), + enforce_admins: ( + if (.enforce_admins | type) == "object" + then .enforce_admins.enabled + else .enforce_admins end + ), + required_pull_request_reviews: ( + .required_pull_request_reviews + | if . == null then null else + { required_approving_review_count, + dismiss_stale_reviews, + require_code_owner_reviews, + require_last_push_approval, + bypass_pull_request_allowances: ( + if .bypass_pull_request_allowances == null then null + else { + users: (.bypass_pull_request_allowances.users // [] | map(.login) | sort), + teams: (.bypass_pull_request_allowances.teams // [] | map(.slug) | sort), + apps: (.bypass_pull_request_allowances.apps // [] | map(.slug) | sort) + } end + ) + } + end + ), + restrictions: ( + if .restrictions == null then null + else { users: (.restrictions.users | map(.login) | sort), + teams: (.restrictions.teams | map(.slug) | sort), + apps: (.restrictions.apps | map(.slug) | sort) } + end + ), + allow_deletions: ( + if (.allow_deletions | type) == "object" then .allow_deletions.enabled + else (.allow_deletions // false) end + ), + allow_force_pushes: ( + if (.allow_force_pushes | type) == "object" then .allow_force_pushes.enabled + else (.allow_force_pushes // false) end + ), + block_creations: ( + if (.block_creations | type) == "object" then .block_creations.enabled + else (.block_creations // false) end + ), + required_conversation_resolution: ( + if (.required_conversation_resolution | type) == "object" + then .required_conversation_resolution.enabled + else (.required_conversation_resolution // false) end + ), + required_linear_history: ( + if (.required_linear_history | type) == "object" then .required_linear_history.enabled + else (.required_linear_history // false) end + ), + lock_branch: ( + if (.lock_branch | type) == "object" then .lock_branch.enabled + else (.lock_branch // false) end + ), + allow_fork_syncing: ( + if (.allow_fork_syncing | type) == "object" then .allow_fork_syncing.enabled + else (.allow_fork_syncing // false) end + ) +}' + +# Apply.sh's payload is already in the input shape; we just need to +# canonicalise the checks order and fill in optional fields with their +# defaults so the comparison aligns. +NORMALISE_SCRIPT='{ + required_status_checks: { + strict: .required_status_checks.strict, + checks: (.required_status_checks.checks | map({context}) | sort_by(.context)) + }, + enforce_admins: .enforce_admins, + required_pull_request_reviews: ( + if .required_pull_request_reviews == null then null else + { required_approving_review_count: .required_pull_request_reviews.required_approving_review_count, + dismiss_stale_reviews: .required_pull_request_reviews.dismiss_stale_reviews, + require_code_owner_reviews: (.required_pull_request_reviews.require_code_owner_reviews // false), + require_last_push_approval: (.required_pull_request_reviews.require_last_push_approval // false), + bypass_pull_request_allowances: ( + if .required_pull_request_reviews.bypass_pull_request_allowances == null then null + else { + users: (.required_pull_request_reviews.bypass_pull_request_allowances.users // [] | sort), + teams: (.required_pull_request_reviews.bypass_pull_request_allowances.teams // [] | sort), + apps: (.required_pull_request_reviews.bypass_pull_request_allowances.apps // [] | sort) + } end + ) + } + end + ), + restrictions: .restrictions, + allow_deletions: (.allow_deletions // false), + allow_force_pushes: (.allow_force_pushes // false), + block_creations: (.block_creations // false), + required_conversation_resolution: (.required_conversation_resolution // false), + required_linear_history: (.required_linear_history // false), + lock_branch: (.lock_branch // false), + allow_fork_syncing: (.allow_fork_syncing // false) +}' + check_branch() { local branch="$1" local want want=$(bash "$SCRIPT_DIR/apply.sh" --dry-run --branch "$branch" 2>&1 | sed -n '/^{$/,/^}$/p' | - jq -S '.required_status_checks.checks | map(.context) | sort') - local have - if ! have=$(gh api "repos/$REPO/branches/$branch/protection/required_status_checks" 2>/dev/null | - jq -S '.checks | map(.context) | sort'); then + jq -S "$NORMALISE_SCRIPT") + local have_raw + if ! have_raw=$(gh api "repos/$REPO/branches/$branch/protection" 2>/dev/null); then echo "drift_check: FAIL to fetch $branch protection (gh API error)" return 2 fi + local have + have=$(echo "$have_raw" | jq -S "$NORMALISE_LIVE") if [[ "$want" != "$have" ]]; then echo "=== DRIFT on $branch ===" - echo "want:"; echo "$want" - echo "have:"; echo "$have" diff <(echo "$want") <(echo "$have") || true return 1 fi From 02b325063bc2c8a2c1a13bc6217513010a033b39 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 20:55:13 -0700 Subject: [PATCH 11/28] feat(delegations): stuck-task sweeper with deadline + heartbeat-staleness rules (RFC #2829 PR-3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Periodically scans the `delegations` table (PR-1 schema) for in-flight rows that need terminal action: 1. Deadline-exceeded → marked `failed` with "deadline exceeded by sweeper" 2. Heartbeat-stale (no beat for >10× heartbeat interval) → marked `stuck` ## Why both rules Deadline catches forever-heartbeating wedged agents (the alive-but-not- advancing class — agent loops on heartbeat call inside its main loop). Heartbeat-staleness catches OOM-killed and crashed agents that stop cold without graceful shutdown. Either rule alone misses one of these classes. ## Order matters Deadline is checked first. A deadline-exceeded AND stale row is marked `failed` (operator action: investigate + give up), not `stuck` (operator action: investigate + retry). The semantic difference matters. ## NULL heartbeat is a free pass A delegation that's just been inserted but hasn't emitted its first heartbeat yet is NOT stuck-marked — gives the agent its first beat window. Lets the deadline catch true never-started rows naturally. ## Concurrent-completion safety Sweep races with UpdateStatus on a delegation that just completed: the ledger's terminal forward-only protection (PR-1) returns ErrInvalidTransition, sweeper logs + counts in Errors, the row stays correctly in completed. ## Configuration - DELEGATION_SWEEPER_INTERVAL_S — tick cadence (default 5min) - DELEGATION_STUCK_THRESHOLD_S — heartbeat-staleness threshold (default 10min) Both fall back gracefully on invalid input (typo'd env shouldn't crash startup). Both read at construction time so a long-running process picks up overrides via restart. ## Wiring NOT wired into main.go in this PR — that ships separately so the sweeper can be enabled/disabled independently of the binary upgrade. The sweeper is a standalone Sweep(ctx) callable + Start(ctx) ticker loop, both with panic recovery, both indexed-scan-cheap on the partial idx_delegations_inflight_heartbeat from PR-1. ## Coverage 13 unit tests against sqlmock-backed *sql.DB: Sweep semantics (8 tests): - empty in-flight set → clean no-op - deadline → failed - heartbeat-stale → stuck - NULL heartbeat is left alone (first-beat free pass) - healthy row → no-op - both-rule row → marked failed (deadline wins) - mixed set → both rules fire on the right rows - concurrent-completion race → forward-only protection holds Env override parsing (5 tests): - default on missing env - parses positive seconds - falls back on garbage - falls back on negative - constructor picks up overrides; defaults when env unset Refs RFC #2829. --- .../internal/handlers/delegation_sweeper.go | 265 +++++++++++++++ .../handlers/delegation_sweeper_test.go | 314 ++++++++++++++++++ 2 files changed, 579 insertions(+) create mode 100644 workspace-server/internal/handlers/delegation_sweeper.go create mode 100644 workspace-server/internal/handlers/delegation_sweeper_test.go diff --git a/workspace-server/internal/handlers/delegation_sweeper.go b/workspace-server/internal/handlers/delegation_sweeper.go new file mode 100644 index 00000000..8ac673c4 --- /dev/null +++ b/workspace-server/internal/handlers/delegation_sweeper.go @@ -0,0 +1,265 @@ +package handlers + +import ( + "context" + "database/sql" + "log" + "os" + "strconv" + "time" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" +) + +// delegation_sweeper.go — RFC #2829 PR-3: stuck-task sweeper. +// +// What it does +// ------------ +// Periodically scans the `delegations` table (PR-1 schema) for in-flight +// rows that have either: +// +// 1. Blown past their `deadline` — agent claims to still be working but +// the hard ceiling fired. Mark `failed` with error_detail = "deadline +// exceeded". +// 2. Stopped heartbeating for >stuckThreshold while still claiming +// in_progress. Mark `stuck` with error_detail = "no heartbeat for Ns". +// +// Why both rules +// -------------- +// Deadline catches forever-heartbeating agents that never make progress +// (a wedged agent looping on a heartbeat call inside its main work loop +// looks "alive" by liveness signals but is not actually advancing). +// Heartbeat-staleness catches agents that crash or get OOM-killed +// without graceful shutdown — no terminal status update fires, but the +// heartbeat stops cold. +// +// Order matters: deadline check fires first because deadline → failed +// is a stronger statement than deadline → stuck. A stuck row can be +// retried by the operator; a failed row says "give up, retry was +// already exhausted or not viable." +// +// Frequency +// --------- +// 5min default cadence. Faster than that wastes DB round-trips for the +// hot index; slower means a stuck task isn't caught until ~5min after +// the heartbeat stops. Operators can override via DELEGATION_SWEEPER_INTERVAL_S. +// +// Threshold +// --------- +// Default 10× the runtime's heartbeat interval (≈100s for hermes that +// beats every 10s during stream output). 10× is the heuristic from the +// RFC #2829 design discussion: it tolerates legitimate slow LLM +// responses (a single completion can stall a heartbeat for 30-60s) while +// still catching real wedges within ~2 minutes. Operators override via +// DELEGATION_STUCK_THRESHOLD_S. +// +// Safety +// ------ +// All transitions go through DelegationLedger.SetStatus so the +// terminal-state forward-only protection applies — a delegation that +// just transitioned to completed concurrently with the sweep won't be +// flipped back to failed/stuck. The ledger's same-status replay no-op +// also makes re-running the sweep idempotent. + +const ( + defaultSweeperInterval = 5 * time.Minute + + // 10min = 60× the typical 10s hermes heartbeat. Tightens to ~10× + // once the user community settles on a tighter heartbeat cadence; + // today's mix of runtimes (hermes 10s, claude-code 30-60s, langchain + // minute-scale) needs the looser threshold to avoid false positives. + defaultStuckThreshold = 10 * time.Minute +) + +// DelegationSweeper runs the periodic sweep. Construct via +// NewDelegationSweeper, then Start(ctx) in main.go to begin ticking. +type DelegationSweeper struct { + db *sql.DB + ledger *DelegationLedger + interval time.Duration + threshold time.Duration +} + +// NewDelegationSweeper builds a sweeper bound to the package db.DB +// (production wiring) or a test handle. Reads optional env overrides +// at construction time so a long-running process picks them up via +// restart, not mid-flight. +func NewDelegationSweeper(handle *sql.DB, ledger *DelegationLedger) *DelegationSweeper { + if handle == nil { + handle = db.DB + } + if ledger == nil { + ledger = NewDelegationLedger(handle) + } + return &DelegationSweeper{ + db: handle, + ledger: ledger, + interval: envDuration("DELEGATION_SWEEPER_INTERVAL_S", defaultSweeperInterval), + threshold: envDuration("DELEGATION_STUCK_THRESHOLD_S", defaultStuckThreshold), + } +} + +// envDuration parses an integer-seconds env var into a Duration. Falls +// back to def on missing/invalid input — never fails fast on misconfig +// (a typo'd env var should run with sane defaults, not crash startup). +func envDuration(key string, def time.Duration) time.Duration { + v := os.Getenv(key) + if v == "" { + return def + } + n, err := strconv.Atoi(v) + if err != nil || n <= 0 { + log.Printf("delegation_sweeper: invalid %s=%q, using default %s", key, v, def) + return def + } + return time.Duration(n) * time.Second +} + +// Interval exposes the configured tick cadence — tests use it; main.go +// uses it implicitly via Start. +func (s *DelegationSweeper) Interval() time.Duration { return s.interval } + +// Threshold exposes the heartbeat-staleness threshold. +func (s *DelegationSweeper) Threshold() time.Duration { return s.threshold } + +// Start ticks Sweep() at the configured interval until ctx is cancelled. +// Defers panic recovery so a single bad row can't kill the sweeper. +// +// Wired into main.go: `go sweeper.Start(ctx)`. No-op until both the +// `delegations` table (PR-1) and the result-push flag (PR-2) have rolled +// out — the sweeper just won't find any rows to mark. +func (s *DelegationSweeper) Start(ctx context.Context) { + t := time.NewTicker(s.interval) + defer t.Stop() + log.Printf("DelegationSweeper: started (interval=%s, stuck-threshold=%s)", + s.interval, s.threshold) + + tickWithRecover := func() { + defer func() { + if r := recover(); r != nil { + log.Printf("DelegationSweeper: PANIC in tick — recovered: %v", r) + } + }() + s.Sweep(ctx) + } + + // First sweep immediately so operators see it run on startup, not + // after waiting one interval. + tickWithRecover() + + for { + select { + case <-ctx.Done(): + log.Printf("DelegationSweeper: stopped") + return + case <-t.C: + tickWithRecover() + } + } +} + +// SweepResult records what the last sweep changed. Surfaced via the +// admin dashboard (PR-4); also useful for tests to assert behavior +// without diffing log lines. +type SweepResult struct { + DeadlineFailures int + StuckMarked int + Errors int +} + +// Sweep runs one pass: find every in-flight delegation, mark deadline- +// exceeded as failed, mark heartbeat-stale as stuck. Returns counts +// for observability. +// +// SQL strategy: one indexed scan over the partial inflight index, two +// updaters per offending row. We fold both checks into a single SELECT +// to amortize the round-trip — the row count in flight at any time +// is small (single-digit hundreds even on a busy tenant), so reading +// them all and dispatching SetStatus per-row is cheaper than two +// separate UPDATEs with bespoke WHERE clauses. +func (s *DelegationSweeper) Sweep(ctx context.Context) SweepResult { + res := SweepResult{} + + rows, err := s.db.QueryContext(ctx, ` + SELECT delegation_id, last_heartbeat, deadline + FROM delegations + WHERE status IN ('queued','dispatched','in_progress') + `) + if err != nil { + log.Printf("DelegationSweeper: query failed: %v", err) + res.Errors++ + return res + } + defer rows.Close() + + now := time.Now() + type candidate struct { + id string + lastBeat sql.NullTime + deadline time.Time + } + var todo []candidate + for rows.Next() { + var c candidate + if err := rows.Scan(&c.id, &c.lastBeat, &c.deadline); err != nil { + log.Printf("DelegationSweeper: scan failed: %v", err) + res.Errors++ + continue + } + todo = append(todo, c) + } + if err := rows.Err(); err != nil { + log.Printf("DelegationSweeper: rows.Err: %v", err) + res.Errors++ + } + + for _, c := range todo { + // Deadline first — stronger statement than stuck. + if now.After(c.deadline) { + if err := s.ledger.SetStatus(ctx, c.id, "failed", + "deadline exceeded by sweeper", ""); err != nil { + log.Printf("DelegationSweeper: SetStatus(%s, failed): %v", c.id, err) + res.Errors++ + continue + } + res.DeadlineFailures++ + continue + } + + // Heartbeat staleness. A NULL last_heartbeat counts as stale ONLY + // if the row has lived past one threshold since creation — gives + // the agent one full window to emit its first beat. We fold this + // by treating NULL as "created_at — but we don't have created_at + // in the SELECT. Approximate: NULL last_heartbeat + deadline more + // than (5h, default deadline=6h) away from now means the row was + // created ≤1h ago, give it a free pass. Simpler heuristic: NULL + // heartbeat is only stale if deadline is already imminent (within + // 1 threshold). + var lastBeat time.Time + if c.lastBeat.Valid { + lastBeat = c.lastBeat.Time + } + if !c.lastBeat.Valid { + // Row never heartbeat. Don't mark stuck — let the deadline + // catch it. Reduces false positives during the agent's first + // beat window after restart. + continue + } + if now.Sub(lastBeat) > s.threshold { + if err := s.ledger.SetStatus(ctx, c.id, "stuck", + "no heartbeat for "+now.Sub(lastBeat).Round(time.Second).String(), + ""); err != nil { + log.Printf("DelegationSweeper: SetStatus(%s, stuck): %v", c.id, err) + res.Errors++ + continue + } + res.StuckMarked++ + } + } + + if res.DeadlineFailures > 0 || res.StuckMarked > 0 || res.Errors > 0 { + log.Printf("DelegationSweeper: sweep complete — deadline_failures=%d stuck=%d errors=%d", + res.DeadlineFailures, res.StuckMarked, res.Errors) + } + return res +} diff --git a/workspace-server/internal/handlers/delegation_sweeper_test.go b/workspace-server/internal/handlers/delegation_sweeper_test.go new file mode 100644 index 00000000..3f93fb31 --- /dev/null +++ b/workspace-server/internal/handlers/delegation_sweeper_test.go @@ -0,0 +1,314 @@ +package handlers + +import ( + "context" + "database/sql" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" +) + +// delegation_sweeper_test.go — coverage for the RFC #2829 PR-3 stuck-task +// sweeper. Validates: +// +// 1. Deadline-exceeded rows are marked failed. +// 2. Heartbeat-stale rows (lastBeat older than threshold) are marked stuck. +// 3. NULL last_heartbeat is NOT marked stuck (free first-beat pass). +// 4. Healthy in-flight rows (recent heartbeat, future deadline) are +// left alone. +// 5. Empty in-flight set is a clean no-op. +// 6. Both rules apply in one sweep without double-marking. +// 7. Env-override interval/threshold parse correctly + fall back on +// invalid input. + +func TestSweeper_HappyPath_NoInflightRowsIsCleanNoOp(t *testing.T) { + mock := setupTestDB(t) + ledger := NewDelegationLedger(nil) + sw := NewDelegationSweeper(nil, ledger) + + mock.ExpectQuery(`SELECT delegation_id, last_heartbeat, deadline\s+FROM delegations`). + WillReturnRows(sqlmock.NewRows([]string{"delegation_id", "last_heartbeat", "deadline"})) + + res := sw.Sweep(context.Background()) + if res.DeadlineFailures != 0 || res.StuckMarked != 0 || res.Errors != 0 { + t.Errorf("empty in-flight set must produce zero changes; got %+v", res) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestSweeper_DeadlineExceededIsMarkedFailed(t *testing.T) { + mock := setupTestDB(t) + ledger := NewDelegationLedger(nil) + sw := NewDelegationSweeper(nil, ledger) + + past := time.Now().Add(-1 * time.Minute) + mock.ExpectQuery(`SELECT delegation_id, last_heartbeat, deadline\s+FROM delegations`). + WillReturnRows(sqlmock.NewRows([]string{"delegation_id", "last_heartbeat", "deadline"}). + AddRow("deleg-overdue", time.Now(), past)) + + // SetStatus reads current status... + mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`). + WithArgs("deleg-overdue"). + WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("in_progress")) + // ...then updates to failed. + mock.ExpectExec(`UPDATE delegations`). + WithArgs("deleg-overdue", "failed", "deadline exceeded by sweeper", ""). + WillReturnResult(sqlmock.NewResult(0, 1)) + + res := sw.Sweep(context.Background()) + if res.DeadlineFailures != 1 { + t.Errorf("expected 1 deadline failure, got %d", res.DeadlineFailures) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestSweeper_StaleHeartbeatIsMarkedStuck(t *testing.T) { + mock := setupTestDB(t) + ledger := NewDelegationLedger(nil) + sw := NewDelegationSweeper(nil, ledger) + + // Last heartbeat 30min ago — well past the 10min default threshold. + staleBeat := time.Now().Add(-30 * time.Minute) + future := time.Now().Add(2 * time.Hour) // deadline NOT exceeded + + mock.ExpectQuery(`SELECT delegation_id, last_heartbeat, deadline\s+FROM delegations`). + WillReturnRows(sqlmock.NewRows([]string{"delegation_id", "last_heartbeat", "deadline"}). + AddRow("deleg-stuck", staleBeat, future)) + + mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`). + WithArgs("deleg-stuck"). + WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("in_progress")) + + // We can't predict the exact "no heartbeat for Xs" string because + // the suffix depends on now() at sweep time; just match against any. + mock.ExpectExec(`UPDATE delegations`). + WithArgs("deleg-stuck", "stuck", sqlmock.AnyArg(), ""). + WillReturnResult(sqlmock.NewResult(0, 1)) + + res := sw.Sweep(context.Background()) + if res.StuckMarked != 1 { + t.Errorf("expected 1 stuck mark, got %d", res.StuckMarked) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestSweeper_NullHeartbeatIsLeftAlone(t *testing.T) { + // A delegation that was JUST inserted (queued, no heartbeat yet) must + // not be flipped to stuck on the first sweep — give it the chance to + // emit its first beat. + mock := setupTestDB(t) + ledger := NewDelegationLedger(nil) + sw := NewDelegationSweeper(nil, ledger) + + future := time.Now().Add(2 * time.Hour) + mock.ExpectQuery(`SELECT delegation_id, last_heartbeat, deadline\s+FROM delegations`). + WillReturnRows(sqlmock.NewRows([]string{"delegation_id", "last_heartbeat", "deadline"}). + AddRow("deleg-fresh", sql.NullTime{}, future)) + + res := sw.Sweep(context.Background()) + if res.StuckMarked != 0 { + t.Errorf("NULL heartbeat must not be stuck-marked; got %d", res.StuckMarked) + } + if res.DeadlineFailures != 0 { + t.Errorf("future deadline must not fail; got %d", res.DeadlineFailures) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestSweeper_HealthyInflightRowsAreLeftAlone(t *testing.T) { + mock := setupTestDB(t) + ledger := NewDelegationLedger(nil) + sw := NewDelegationSweeper(nil, ledger) + + freshBeat := time.Now().Add(-1 * time.Minute) // well within 10min threshold + future := time.Now().Add(2 * time.Hour) + + mock.ExpectQuery(`SELECT delegation_id, last_heartbeat, deadline\s+FROM delegations`). + WillReturnRows(sqlmock.NewRows([]string{"delegation_id", "last_heartbeat", "deadline"}). + AddRow("deleg-healthy", freshBeat, future)) + + res := sw.Sweep(context.Background()) + if res.DeadlineFailures != 0 || res.StuckMarked != 0 { + t.Errorf("healthy row must produce zero changes; got %+v", res) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestSweeper_DeadlineFiresFirstNotStuck(t *testing.T) { + // Row that's BOTH past deadline AND stale-heartbeat must be marked + // failed (deadline) not stuck — deadline is the stronger statement. + mock := setupTestDB(t) + ledger := NewDelegationLedger(nil) + sw := NewDelegationSweeper(nil, ledger) + + staleBeat := time.Now().Add(-30 * time.Minute) + past := time.Now().Add(-5 * time.Minute) + + mock.ExpectQuery(`SELECT delegation_id, last_heartbeat, deadline\s+FROM delegations`). + WillReturnRows(sqlmock.NewRows([]string{"delegation_id", "last_heartbeat", "deadline"}). + AddRow("deleg-both", staleBeat, past)) + + // Only the failed transition fires; no stuck transition. + mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`). + WithArgs("deleg-both"). + WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("in_progress")) + mock.ExpectExec(`UPDATE delegations`). + WithArgs("deleg-both", "failed", "deadline exceeded by sweeper", ""). + WillReturnResult(sqlmock.NewResult(0, 1)) + + res := sw.Sweep(context.Background()) + if res.DeadlineFailures != 1 || res.StuckMarked != 0 { + t.Errorf("expected 1 deadline failure, 0 stuck; got %+v", res) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet (stuck UPDATE may have fired by accident): %v", err) + } +} + +func TestSweeper_MixedSetAppliesBothRules(t *testing.T) { + mock := setupTestDB(t) + ledger := NewDelegationLedger(nil) + sw := NewDelegationSweeper(nil, ledger) + + now := time.Now() + mock.ExpectQuery(`SELECT delegation_id, last_heartbeat, deadline\s+FROM delegations`). + WillReturnRows(sqlmock.NewRows([]string{"delegation_id", "last_heartbeat", "deadline"}). + AddRow("deleg-overdue", now, now.Add(-1*time.Minute)). // deadline → failed + AddRow("deleg-stuck", now.Add(-30*time.Minute), now.Add(2*time.Hour)). // stale → stuck + AddRow("deleg-healthy", now.Add(-30*time.Second), now.Add(2*time.Hour)), // healthy → no-op + ) + + // 1st: deadline → failed + mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`). + WithArgs("deleg-overdue"). + WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("in_progress")) + mock.ExpectExec(`UPDATE delegations`). + WithArgs("deleg-overdue", "failed", "deadline exceeded by sweeper", ""). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // 2nd: stale → stuck + mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`). + WithArgs("deleg-stuck"). + WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("in_progress")) + mock.ExpectExec(`UPDATE delegations`). + WithArgs("deleg-stuck", "stuck", sqlmock.AnyArg(), ""). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // 3rd: healthy — no SQL fired + + res := sw.Sweep(context.Background()) + if res.DeadlineFailures != 1 || res.StuckMarked != 1 { + t.Errorf("expected 1 failure + 1 stuck, got %+v", res) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestSweeper_TerminalReplayFromConcurrentCompletionIsIgnored(t *testing.T) { + // Edge case: row was marked completed by UpdateStatus between the + // SELECT and the SetStatus call. SetStatus's forward-only protection + // returns ErrInvalidTransition; sweeper increments Errors but the + // row is correctly left in completed state. + mock := setupTestDB(t) + ledger := NewDelegationLedger(nil) + sw := NewDelegationSweeper(nil, ledger) + + past := time.Now().Add(-1 * time.Minute) + mock.ExpectQuery(`SELECT delegation_id, last_heartbeat, deadline\s+FROM delegations`). + WillReturnRows(sqlmock.NewRows([]string{"delegation_id", "last_heartbeat", "deadline"}). + AddRow("deleg-raced", time.Now(), past)) + + // SetStatus's status read finds the row already completed (concurrent UpdateStatus won). + mock.ExpectQuery(`SELECT status FROM delegations WHERE delegation_id = \$1`). + WithArgs("deleg-raced"). + WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("completed")) + // No UPDATE — terminal forward-only blocks it. + + res := sw.Sweep(context.Background()) + if res.Errors != 1 { + t.Errorf("forward-only block must surface as Error count; got %+v", res) + } + if res.DeadlineFailures != 0 { + t.Errorf("must NOT credit a deadline failure that didn't fire; got %d", res.DeadlineFailures) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +// ---------- env override parsing ---------- + +func TestEnvDuration_Default(t *testing.T) { + t.Setenv("MY_TEST_KEY", "") + if got := envDuration("MY_TEST_KEY", 7*time.Second); got != 7*time.Second { + t.Errorf("expected default 7s, got %v", got) + } +} + +func TestEnvDuration_ParsesPositiveSeconds(t *testing.T) { + t.Setenv("MY_TEST_KEY", "42") + if got := envDuration("MY_TEST_KEY", 1*time.Second); got != 42*time.Second { + t.Errorf("expected 42s, got %v", got) + } +} + +func TestEnvDuration_FallsBackOnInvalid(t *testing.T) { + t.Setenv("MY_TEST_KEY", "garbage") + if got := envDuration("MY_TEST_KEY", 5*time.Second); got != 5*time.Second { + t.Errorf("invalid input must fall back to default; got %v", got) + } +} + +func TestEnvDuration_FallsBackOnNegative(t *testing.T) { + t.Setenv("MY_TEST_KEY", "-10") + if got := envDuration("MY_TEST_KEY", 5*time.Second); got != 5*time.Second { + t.Errorf("negative must fall back to default; got %v", got) + } +} + +// TestSweeperConstructor_PicksUpEnvOverrides — interval + threshold env +// vars are read at construction time. Confirms the wiring contract; if +// somebody adds a new env var without plumbing it, this fails. +func TestSweeperConstructor_PicksUpEnvOverrides(t *testing.T) { + t.Setenv("DELEGATION_SWEEPER_INTERVAL_S", "60") + t.Setenv("DELEGATION_STUCK_THRESHOLD_S", "120") + + mock := setupTestDB(t) + _ = mock // unused — constructor doesn't fire SQL + sw := NewDelegationSweeper(nil, nil) + + if sw.Interval() != 60*time.Second { + t.Errorf("interval override not picked up: got %v", sw.Interval()) + } + if sw.Threshold() != 120*time.Second { + t.Errorf("threshold override not picked up: got %v", sw.Threshold()) + } +} + +func TestSweeperConstructor_DefaultsWhenEnvUnset(t *testing.T) { + t.Setenv("DELEGATION_SWEEPER_INTERVAL_S", "") + t.Setenv("DELEGATION_STUCK_THRESHOLD_S", "") + + mock := setupTestDB(t) + _ = mock + sw := NewDelegationSweeper(nil, nil) + + if sw.Interval() != defaultSweeperInterval { + t.Errorf("default interval not used: got %v", sw.Interval()) + } + if sw.Threshold() != defaultStuckThreshold { + t.Errorf("default threshold not used: got %v", sw.Threshold()) + } +} From 2ed4f4fb41163f786ff7449205a8ef5f13df7fa9 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 20:58:17 -0700 Subject: [PATCH 12/28] feat(delegations): operator dashboard endpoint over the durable ledger (RFC #2829 PR-4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two read endpoints over the `delegations` table (PR-1 schema): GET /admin/delegations[?status=in_flight|stuck|failed|completed&limit=N] GET /admin/delegations/stats ## What this gives operators Without this, post-incident investigation requires direct DB access — only the on-call SRE can answer "is workspace X delegating to a wedged callee?". This moves that visibility into the same surface as /admin/queue, /admin/schedules-health, /admin/memories. ## List endpoint Status filter via tight allowlist: - in_flight (default) → status IN (queued, dispatched, in_progress) - stuck → status='stuck' (rows the PR-3 sweeper marked) - failed → status='failed' - completed → status='completed' Unknown status → 400 with the allowlist in the error body. Limit 1..1000, default 100. The status allowlist drives a parameterized IN clause (no string- concatenation of user-controlled values into SQL). Result rows expose all the audit-grade fields the dashboard needs: delegation_id, caller_id, callee_id, task_preview, status, last_heartbeat, deadline, result_preview, error_detail, retry_count, created_at, updated_at. Nullable fields use pointer types so JSON omits them when NULL (no false-zero "" for missing values). ## Stats endpoint Zero-fills every known status key (queued, dispatched, in_progress, completed, failed, stuck) so the dashboard summary card doesn't have to handle "missing key vs zero" branching. ## Out of scope (deferred) - "retry this stuck task" mutation: needs the agent-side cutover (RFC #2829 PR-5 plan) before re-fire is safe - p95 / p99 duration aggregates: separate metric exposure, not a row-level read endpoint - Canvas UI: this is the JSON contract; the canvas operator panel consumes it in a follow-up canvas PR ## Wiring NOT wired into the router in this PR — ships separately to keep PR-by-PR review surface tight. Wiring will land in the `enable-rfc2829-server-side` follow-up PR alongside the sweeper Start call and the result-push flag flip. ## Coverage 11 unit tests: List (8): - default status=in_flight, IN(queued,dispatched,in_progress) - status=stuck → IN(stuck) - status=failed → IN(failed) - unknown status → 400 with allowlist - negative limit → 400 - over-cap limit → 400 - custom limit accepted + echoed in response - nullable fields populated correctly (pointer-omitempty) Stats (2): - zero-fills missing status keys - empty table → all counts zero Contract pin (1): - statusFilters table shape — every documented key + value pair pinned. Drift catches accidental edits (forward defense). Refs RFC #2829. --- .../internal/handlers/admin_delegations.go | 236 +++++++++++++ .../handlers/admin_delegations_test.go | 332 ++++++++++++++++++ 2 files changed, 568 insertions(+) create mode 100644 workspace-server/internal/handlers/admin_delegations.go create mode 100644 workspace-server/internal/handlers/admin_delegations_test.go diff --git a/workspace-server/internal/handlers/admin_delegations.go b/workspace-server/internal/handlers/admin_delegations.go new file mode 100644 index 00000000..b2165397 --- /dev/null +++ b/workspace-server/internal/handlers/admin_delegations.go @@ -0,0 +1,236 @@ +package handlers + +import ( + "database/sql" + "log" + "net/http" + "strconv" + "time" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" + "github.com/gin-gonic/gin" +) + +// admin_delegations.go — RFC #2829 PR-4: operator dashboard endpoint +// over the durable delegations ledger (PR-1 schema, PR-3 sweeper). +// +// What this endpoint serves +// ------------------------- +// +// GET /admin/delegations[?status=in_flight|stuck|failed&limit=N] +// +// Returns the rows the operator needs to triage delegation health: +// - in_flight : status IN (queued, dispatched, in_progress) — the +// things actively churning right now. Default view. +// - stuck : status='stuck' — sweeper found these wedged. Operator +// can investigate the callee + decide whether to retry +// (RFC #2829 PR-5 plan). +// - failed : status='failed' — terminal failures, recent. Useful +// for spotting trends like "callee X is failing 50% of +// delegations since 14:00". +// +// Why an admin endpoint at all +// ---------------------------- +// Without this, post-incident investigation requires direct DB access — +// only the on-call SRE can answer "is workspace X delegating to a wedged +// callee?". The dashboard endpoint moves that visibility into the same +// surface as /admin/queue, /admin/schedules-health, /admin/memories etc. +// +// Out of scope (deferred to a follow-up PR per RFC #2829) +// ------------------------------------------------------- +// - "retry this stuck task" mutation: needs careful interaction with +// the agent-side cutover (PR-5) before it can be safely re-fired +// - p95 / p99 duration aggregates: separate metric exposure, not a +// row-level read +// - Canvas UI: this is the JSON contract; the canvas operator panel +// consumes it in a follow-up canvas PR + +// AdminDelegationsHandler serves the operator dashboard read endpoint. +type AdminDelegationsHandler struct { + db *sql.DB +} + +func NewAdminDelegationsHandler(handle *sql.DB) *AdminDelegationsHandler { + if handle == nil { + handle = db.DB + } + return &AdminDelegationsHandler{db: handle} +} + +// delegationRow mirrors the row shape of the `delegations` table that the +// operator dashboard cares about. Order matches the SELECT below — keep +// the two in sync if you add a column. +type delegationRow struct { + DelegationID string `json:"delegation_id"` + CallerID string `json:"caller_id"` + CalleeID string `json:"callee_id"` + TaskPreview string `json:"task_preview"` + Status string `json:"status"` + LastHeartbeat *time.Time `json:"last_heartbeat,omitempty"` + Deadline time.Time `json:"deadline"` + ResultPreview *string `json:"result_preview,omitempty"` + ErrorDetail *string `json:"error_detail,omitempty"` + RetryCount int `json:"retry_count"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +// statusFilters maps the query-string `status` value to the SQL set. +// Keep tight — operators don't get to query arbitrary status — so a +// new status name added to the schema needs an explicit allowlist +// entry here. Caught when a future status name doesn't pin to a UI +// expectation (forward-defense). +var statusFilters = map[string][]string{ + "in_flight": {"queued", "dispatched", "in_progress"}, + "stuck": {"stuck"}, + "failed": {"failed"}, + "completed": {"completed"}, +} + +const defaultListLimit = 100 +const maxListLimit = 1000 + +// List handles GET /admin/delegations +// +// Query params: +// - status — one of `in_flight` (default) / `stuck` / `failed` / `completed` +// - limit — int, 1..1000 (default 100) +// +// Returns 200 with `{"delegations": [...], "count": N}`. +func (h *AdminDelegationsHandler) List(c *gin.Context) { + statusKey := c.DefaultQuery("status", "in_flight") + statuses, ok := statusFilters[statusKey] + if !ok { + c.JSON(http.StatusBadRequest, gin.H{ + "error": "unknown status filter", + "allowed": []string{"in_flight", "stuck", "failed", "completed"}, + "requested_status": statusKey, + }) + return + } + + limit := defaultListLimit + if v := c.Query("limit"); v != "" { + n, err := strconv.Atoi(v) + if err != nil || n < 1 || n > maxListLimit { + c.JSON(http.StatusBadRequest, gin.H{ + "error": "limit must be 1..1000", + "requested": v, + }) + return + } + limit = n + } + + // Build the IN list as a parameterized expression — never string- + // concatenate user-controlled values into the SQL. statusKey came + // from the allowlist above so the slice is fully bounded. + args := make([]any, 0, len(statuses)+1) + placeholders := "" + for i, s := range statuses { + if i > 0 { + placeholders += "," + } + args = append(args, s) + placeholders += "$" + strconv.Itoa(i+1) + } + args = append(args, limit) + limitPlaceholder := "$" + strconv.Itoa(len(statuses)+1) + + rows, err := h.db.QueryContext(c.Request.Context(), ` + SELECT delegation_id, caller_id::text, callee_id::text, task_preview, + status, last_heartbeat, deadline, result_preview, error_detail, + retry_count, created_at, updated_at + FROM delegations + WHERE status IN (`+placeholders+`) + ORDER BY created_at DESC + LIMIT `+limitPlaceholder, args...) + if err != nil { + log.Printf("AdminDelegations.List: query failed: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "query failed"}) + return + } + defer rows.Close() + + out := make([]delegationRow, 0) + for rows.Next() { + var r delegationRow + var lastBeat sql.NullTime + var resultPreview, errorDetail sql.NullString + if err := rows.Scan( + &r.DelegationID, &r.CallerID, &r.CalleeID, &r.TaskPreview, + &r.Status, &lastBeat, &r.Deadline, &resultPreview, &errorDetail, + &r.RetryCount, &r.CreatedAt, &r.UpdatedAt, + ); err != nil { + log.Printf("AdminDelegations.List: scan failed: %v", err) + continue + } + if lastBeat.Valid { + t := lastBeat.Time + r.LastHeartbeat = &t + } + if resultPreview.Valid { + s := resultPreview.String + r.ResultPreview = &s + } + if errorDetail.Valid { + s := errorDetail.String + r.ErrorDetail = &s + } + out = append(out, r) + } + if err := rows.Err(); err != nil { + log.Printf("AdminDelegations.List: rows.Err: %v", err) + } + + c.JSON(http.StatusOK, gin.H{ + "delegations": out, + "count": len(out), + "status": statusKey, + "limit": limit, + }) +} + +// Stats handles GET /admin/delegations/stats — at-a-glance counts per +// status. Useful for the dashboard summary card at the top of the +// operator panel without paying for a row-level fetch. +// +// Returns 200 with `{"queued": N, "dispatched": N, "in_progress": N, +// "completed": N, "failed": N, "stuck": N}`. +func (h *AdminDelegationsHandler) Stats(c *gin.Context) { + rows, err := h.db.QueryContext(c.Request.Context(), ` + SELECT status, COUNT(*) FROM delegations GROUP BY status + `) + if err != nil { + log.Printf("AdminDelegations.Stats: query failed: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "query failed"}) + return + } + defer rows.Close() + + // Initialise to zero so the response always has every known status + // key — the dashboard card doesn't need to handle "missing key vs + // zero" branching. + stats := map[string]int{ + "queued": 0, + "dispatched": 0, + "in_progress": 0, + "completed": 0, + "failed": 0, + "stuck": 0, + } + for rows.Next() { + var status string + var count int + if err := rows.Scan(&status, &count); err != nil { + log.Printf("AdminDelegations.Stats: scan failed: %v", err) + continue + } + stats[status] = count + } + if err := rows.Err(); err != nil { + log.Printf("AdminDelegations.Stats: rows.Err: %v", err) + } + + c.JSON(http.StatusOK, stats) +} diff --git a/workspace-server/internal/handlers/admin_delegations_test.go b/workspace-server/internal/handlers/admin_delegations_test.go new file mode 100644 index 00000000..8fb2cb3d --- /dev/null +++ b/workspace-server/internal/handlers/admin_delegations_test.go @@ -0,0 +1,332 @@ +package handlers + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// admin_delegations_test.go — RFC #2829 PR-4 dashboard endpoint coverage. +// +// - List: status filter + limit defaults + bad-input rejection +// - Stats: per-status counts + zero-fill for missing statuses + +// ---------- List ---------- + +func TestAdminDelegations_List_DefaultStatusInFlight(t *testing.T) { + mock := setupTestDB(t) + h := NewAdminDelegationsHandler(nil) + + now := time.Now() + mock.ExpectQuery(`SELECT delegation_id, caller_id::text, callee_id::text, task_preview,\s+status, last_heartbeat, deadline, result_preview, error_detail,\s+retry_count, created_at, updated_at\s+FROM delegations\s+WHERE status IN \(\$1,\$2,\$3\)\s+ORDER BY created_at DESC\s+LIMIT \$4`). + WithArgs("queued", "dispatched", "in_progress", 100). + WillReturnRows(sqlmock.NewRows([]string{ + "delegation_id", "caller_id", "callee_id", "task_preview", + "status", "last_heartbeat", "deadline", "result_preview", "error_detail", + "retry_count", "created_at", "updated_at", + }).AddRow( + "deleg-1", "caller-uuid", "callee-uuid", "task body", + "in_progress", now, now.Add(2*time.Hour), nil, nil, + 0, now.Add(-5*time.Minute), now.Add(-1*time.Minute), + )) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/admin/delegations", nil) + h.List(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + var body map[string]any + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("body parse: %v", err) + } + if got := body["count"]; got != float64(1) { + t.Errorf("count: expected 1, got %v", got) + } + if got := body["status"]; got != "in_flight" { + t.Errorf("status: expected in_flight, got %v", got) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestAdminDelegations_List_StatusStuck(t *testing.T) { + mock := setupTestDB(t) + h := NewAdminDelegationsHandler(nil) + + mock.ExpectQuery(`SELECT delegation_id`). + WithArgs("stuck", 100). + WillReturnRows(sqlmock.NewRows([]string{ + "delegation_id", "caller_id", "callee_id", "task_preview", + "status", "last_heartbeat", "deadline", "result_preview", "error_detail", + "retry_count", "created_at", "updated_at", + })) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/admin/delegations?status=stuck", nil) + h.List(c) + + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestAdminDelegations_List_StatusFailed(t *testing.T) { + mock := setupTestDB(t) + h := NewAdminDelegationsHandler(nil) + + mock.ExpectQuery(`SELECT delegation_id`). + WithArgs("failed", 100). + WillReturnRows(sqlmock.NewRows([]string{ + "delegation_id", "caller_id", "callee_id", "task_preview", + "status", "last_heartbeat", "deadline", "result_preview", "error_detail", + "retry_count", "created_at", "updated_at", + })) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/admin/delegations?status=failed", nil) + h.List(c) + + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestAdminDelegations_List_RejectsUnknownStatus(t *testing.T) { + setupTestDB(t) + h := NewAdminDelegationsHandler(nil) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/admin/delegations?status=garbage", nil) + h.List(c) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestAdminDelegations_List_RejectsNegativeLimit(t *testing.T) { + setupTestDB(t) + h := NewAdminDelegationsHandler(nil) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/admin/delegations?limit=-5", nil) + h.List(c) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +func TestAdminDelegations_List_RejectsLimitOverCap(t *testing.T) { + setupTestDB(t) + h := NewAdminDelegationsHandler(nil) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/admin/delegations?limit=99999", nil) + h.List(c) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +func TestAdminDelegations_List_AcceptsCustomLimit(t *testing.T) { + mock := setupTestDB(t) + h := NewAdminDelegationsHandler(nil) + + mock.ExpectQuery(`SELECT delegation_id`). + WithArgs("queued", "dispatched", "in_progress", 25). + WillReturnRows(sqlmock.NewRows([]string{ + "delegation_id", "caller_id", "callee_id", "task_preview", + "status", "last_heartbeat", "deadline", "result_preview", "error_detail", + "retry_count", "created_at", "updated_at", + })) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/admin/delegations?limit=25", nil) + h.List(c) + + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + var body map[string]any + _ = json.Unmarshal(w.Body.Bytes(), &body) + if body["limit"] != float64(25) { + t.Errorf("expected limit=25 echo, got %v", body["limit"]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestAdminDelegations_List_PopulatesNullableFields(t *testing.T) { + mock := setupTestDB(t) + h := NewAdminDelegationsHandler(nil) + + now := time.Now() + resultStr := "all done" + mock.ExpectQuery(`SELECT delegation_id`). + WithArgs("completed", 100). + WillReturnRows(sqlmock.NewRows([]string{ + "delegation_id", "caller_id", "callee_id", "task_preview", + "status", "last_heartbeat", "deadline", "result_preview", "error_detail", + "retry_count", "created_at", "updated_at", + }).AddRow( + "deleg-2", "c", "ca", "t", + "completed", now, now.Add(2*time.Hour), resultStr, nil, + 0, now, now, + )) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/admin/delegations?status=completed", nil) + h.List(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + var body struct { + Delegations []struct { + ResultPreview *string `json:"result_preview"` + ErrorDetail *string `json:"error_detail"` + LastHeartbeat *string `json:"last_heartbeat"` + } `json:"delegations"` + } + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("parse: %v", err) + } + if len(body.Delegations) != 1 { + t.Fatalf("expected 1 row, got %d", len(body.Delegations)) + } + row := body.Delegations[0] + if row.ResultPreview == nil || *row.ResultPreview != "all done" { + t.Errorf("result_preview not populated correctly: %+v", row.ResultPreview) + } + if row.ErrorDetail != nil { + t.Errorf("error_detail should be nil for completed-no-error: %+v", row.ErrorDetail) + } + if row.LastHeartbeat == nil { + t.Errorf("last_heartbeat should be present (non-NULL); got nil") + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +// ---------- Stats ---------- + +func TestAdminDelegations_Stats_ZeroFillsMissingStatuses(t *testing.T) { + // Stats response must always include every status key. If no rows + // exist for status='stuck', the response still shows "stuck": 0. + mock := setupTestDB(t) + h := NewAdminDelegationsHandler(nil) + + mock.ExpectQuery(`SELECT status, COUNT\(\*\) FROM delegations GROUP BY status`). + WillReturnRows(sqlmock.NewRows([]string{"status", "count"}). + AddRow("in_progress", 7). + AddRow("completed", 130)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/admin/delegations/stats", nil) + h.Stats(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + var stats map[string]int + if err := json.Unmarshal(w.Body.Bytes(), &stats); err != nil { + t.Fatalf("parse: %v", err) + } + + expectedKeys := []string{"queued", "dispatched", "in_progress", "completed", "failed", "stuck"} + for _, k := range expectedKeys { + if _, ok := stats[k]; !ok { + t.Errorf("stats missing key %q (zero-fill contract broken)", k) + } + } + if stats["in_progress"] != 7 { + t.Errorf("in_progress count: expected 7, got %d", stats["in_progress"]) + } + if stats["completed"] != 130 { + t.Errorf("completed count: expected 130, got %d", stats["completed"]) + } + if stats["stuck"] != 0 { + t.Errorf("stuck must be zero-filled: got %d", stats["stuck"]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestAdminDelegations_Stats_EmptyTable(t *testing.T) { + mock := setupTestDB(t) + h := NewAdminDelegationsHandler(nil) + + mock.ExpectQuery(`SELECT status, COUNT\(\*\) FROM delegations GROUP BY status`). + WillReturnRows(sqlmock.NewRows([]string{"status", "count"})) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/admin/delegations/stats", nil) + h.Stats(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + var stats map[string]int + _ = json.Unmarshal(w.Body.Bytes(), &stats) + for k, v := range stats { + if v != 0 { + t.Errorf("empty table → all counts zero; %s=%d", k, v) + } + } +} + +// statusFilters is a contract surface — every key here is documented in +// the endpoint comment + accepted by the validator. Pin it. +func TestStatusFiltersTableShape(t *testing.T) { + expected := map[string][]string{ + "in_flight": {"queued", "dispatched", "in_progress"}, + "stuck": {"stuck"}, + "failed": {"failed"}, + "completed": {"completed"}, + } + for k, want := range expected { + got, ok := statusFilters[k] + if !ok { + t.Errorf("statusFilters missing key %q", k) + continue + } + if len(got) != len(want) { + t.Errorf("statusFilters[%q]: want %v, got %v", k, want, got) + continue + } + for i := range want { + if got[i] != want[i] { + t.Errorf("statusFilters[%q][%d]: want %q, got %q", k, i, want[i], got[i]) + } + } + } +} From aec0fb35d25e414cf48d7dce6a46a075456612c3 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 21:00:47 -0700 Subject: [PATCH 13/28] feat(memories): PATCH /workspaces/:id/memories/:id endpoint for edits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-fix the only writes to agent_memories were Commit (POST) and Delete (DELETE). Editing an entry meant delete + recreate, losing the original id and created_at, and (the user-visible reason for filing this) leaving the canvas Memory tab without an Edit button at all. Adds PATCH that accepts either content, namespace, or both — at least one required (empty body 400s; silently no-op'ing would let a buggy client think it succeeded). The full Commit security pipeline is re-run on content edits: - redactSecrets on every scope (#1201 SAFE-T) - GLOBAL [MEMORY → [_MEMORY delimiter escape (#807 SAFE-T) - GLOBAL audit log row mirroring Commit's #767 forensic pattern - re-embed via the configured EmbeddingFunc (skipping would leave the row's vector pointing at the OLD content, silently breaking semantic search) Cross-scope edits (LOCAL→GLOBAL) intentionally NOT supported — that's delete + recreate so the GLOBAL access-control gate (only root workspaces can write GLOBAL) gets re-evaluated cleanly. 7 new sqlmock tests pin: namespace-only, content-only LOCAL, content-only GLOBAL with audit + escape, empty-body 400, empty- content 400, 404 on missing/wrong-workspace memory, no-op 200 with changed=false (and crucially: no UPDATE fires on no-op). Build clean, full handlers test suite (./internal/handlers) passes in 4s. PR-2 (frontend): Add modal + Edit button in MemoryInspectorPanel.tsx will land separately. --- .../internal/handlers/memories.go | 171 ++++++++++++++ .../internal/handlers/memories_test.go | 217 +++++++++++++++++- workspace-server/internal/router/router.go | 1 + 3 files changed, 388 insertions(+), 1 deletion(-) diff --git a/workspace-server/internal/handlers/memories.go b/workspace-server/internal/handlers/memories.go index 3efff1dc..137bab90 100644 --- a/workspace-server/internal/handlers/memories.go +++ b/workspace-server/internal/handlers/memories.go @@ -475,6 +475,177 @@ func (h *MemoriesHandler) Search(c *gin.Context) { c.JSON(http.StatusOK, memories) } +// Update handles PATCH /workspaces/:id/memories/:memoryId +// +// Edits an existing semantic-memory row's content and/or namespace. +// Both body fields are optional; at least one must be present (a body +// with neither returns 400 — there's nothing to do, and silently +// no-op'ing would let a buggy client think it had succeeded). +// +// Content edits re-run the same security pipeline as Commit: secret +// redaction (#1201) on every scope, plus delimiter-spoofing escape on +// GLOBAL. Skipping either when content changes would mean an Edit +// becomes a back-door past the policies a Commit enforces. The same +// re-embedding rule applies — a stale embedding for the new content +// would silently break semantic search. GLOBAL audit log fires on +// content change so the forensic trail captures edits, not just +// initial writes. +// +// Namespace edits are validated against the same 50-char ceiling +// Commit uses; cross-scope changes (e.g. LOCAL→GLOBAL) are NOT +// supported here — that's a delete + recreate so the GLOBAL +// access-control gate (only root workspaces can write GLOBAL) gets +// re-evaluated from scratch. +// +// Returns 200 with the updated row's id+scope+namespace on success, +// 400 on bad body, 404 when the memory doesn't exist or isn't owned +// by this workspace, 500 on DB failure. +func (h *MemoriesHandler) Update(c *gin.Context) { + workspaceID := c.Param("id") + memoryID := c.Param("memoryId") + ctx := c.Request.Context() + + // json.Decode (not gin's ShouldBindJSON) so we can distinguish + // "field omitted" from "field set to empty string" — content="" is + // invalid; content omitted means "don't change content". + var body struct { + Content *string `json:"content,omitempty"` + Namespace *string `json:"namespace,omitempty"` + } + if err := json.NewDecoder(c.Request.Body).Decode(&body); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"}) + return + } + if body.Content == nil && body.Namespace == nil { + c.JSON(http.StatusBadRequest, gin.H{ + "error": "at least one of content or namespace must be set", + }) + return + } + if body.Content != nil && *body.Content == "" { + c.JSON(http.StatusBadRequest, gin.H{"error": "content cannot be empty"}) + return + } + if body.Namespace != nil { + if len(*body.Namespace) == 0 { + c.JSON(http.StatusBadRequest, gin.H{"error": "namespace cannot be empty"}) + return + } + if len(*body.Namespace) > 50 { + c.JSON(http.StatusBadRequest, gin.H{"error": "namespace must be <= 50 characters"}) + return + } + } + + // Fetch current row to discover the scope (we need it for the + // GLOBAL delimiter-escape + audit log) and to confirm ownership. + // One round-trip rather than two: SELECT ... WHERE id AND + // workspace_id covers the 404 path without an extra existence check. + var existingScope, existingContent, existingNamespace string + if err := db.DB.QueryRowContext(ctx, ` + SELECT scope, content, namespace + FROM agent_memories + WHERE id = $1 AND workspace_id = $2 + `, memoryID, workspaceID).Scan(&existingScope, &existingContent, &existingNamespace); err != nil { + // sql.ErrNoRows or any other read failure — both surface as 404 + // to avoid leaking row existence across workspaces. + c.JSON(http.StatusNotFound, gin.H{"error": "memory not found or not owned by this workspace"}) + return + } + + // Compute the new content (post-redaction, post-delimiter-escape) + // only when content is actually changing. This keeps namespace-only + // edits cheap (no embed call, no audit row). + newContent := existingContent + contentChanged := false + if body.Content != nil && *body.Content != existingContent { + c2 := *body.Content + c2, _ = redactSecrets(workspaceID, c2) + if existingScope == "GLOBAL" { + c2 = strings.ReplaceAll(c2, "[MEMORY ", "[_MEMORY ") + } + if c2 != existingContent { + newContent = c2 + contentChanged = true + } + } + + newNamespace := existingNamespace + if body.Namespace != nil && *body.Namespace != existingNamespace { + newNamespace = *body.Namespace + } + + if !contentChanged && newNamespace == existingNamespace { + // Nothing to do post-normalisation (e.g. caller passed the + // SAME content + namespace). Return the existing shape so the + // caller's response-handling can stay uniform with the change + // path — silently no-op would force every client to special- + // case 204. + c.JSON(http.StatusOK, gin.H{ + "id": memoryID, "scope": existingScope, "namespace": existingNamespace, + "changed": false, + }) + return + } + + if _, err := db.DB.ExecContext(ctx, ` + UPDATE agent_memories + SET content = $1, namespace = $2, updated_at = now() + WHERE id = $3 AND workspace_id = $4 + `, newContent, newNamespace, memoryID, workspaceID); err != nil { + log.Printf("Update memory error: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to update memory"}) + return + } + + // GLOBAL content edits write an audit row mirroring Commit's #767 + // pattern. Namespace-only edits don't get an audit entry — the + // content (and its sha256) is unchanged, so there's nothing new + // for forensic replay to capture. + if existingScope == "GLOBAL" && contentChanged { + sum := sha256.Sum256([]byte(newContent)) + auditBody, _ := json.Marshal(map[string]string{ + "memory_id": memoryID, + "namespace": newNamespace, + "content_sha256": hex.EncodeToString(sum[:]), + "reason": "edited", + }) + summary := "GLOBAL memory edited: id=" + memoryID + " namespace=" + newNamespace + if _, auditErr := db.DB.ExecContext(ctx, ` + INSERT INTO activity_logs (workspace_id, activity_type, source_id, summary, request_body, status) + VALUES ($1, $2, $3, $4, $5::jsonb, $6) + `, workspaceID, "memory_edit_global", workspaceID, summary, string(auditBody), "ok"); auditErr != nil { + log.Printf("Update: GLOBAL memory audit log failed for %s/%s: %v", workspaceID, memoryID, auditErr) + } + } + + // Re-embed when content changed. Same non-fatal pattern as Commit: + // a failed embed leaves the row with its OLD vector (or no vector + // if the original Commit's embed also failed). Future Search calls + // fall through to FTS for this row. + if contentChanged && h.embed != nil { + if vec, embedErr := h.embed(ctx, newContent); embedErr != nil { + log.Printf("Update: embedding failed workspace=%s memory=%s: %v (kept stale embedding)", + workspaceID, memoryID, embedErr) + } else if fmtVec := formatVector(vec); fmtVec != "" { + if _, updateErr := db.DB.ExecContext(ctx, + `UPDATE agent_memories SET embedding = $1::vector WHERE id = $2`, + fmtVec, memoryID, + ); updateErr != nil { + log.Printf("Update: embedding UPDATE failed workspace=%s memory=%s: %v", + workspaceID, memoryID, updateErr) + } + } + } + + c.JSON(http.StatusOK, gin.H{ + "id": memoryID, + "scope": existingScope, + "namespace": newNamespace, + "changed": true, + }) +} + // Delete handles DELETE /workspaces/:id/memories/:memoryId func (h *MemoriesHandler) Delete(c *gin.Context) { workspaceID := c.Param("id") diff --git a/workspace-server/internal/handlers/memories_test.go b/workspace-server/internal/handlers/memories_test.go index 8b9bcedf..5fbf6db5 100644 --- a/workspace-server/internal/handlers/memories_test.go +++ b/workspace-server/internal/handlers/memories_test.go @@ -1083,4 +1083,219 @@ func TestCommitMemory_LocalScope_NoDelimiterEscape(t *testing.T) { if err := mock.ExpectationsWereMet(); err != nil { t.Errorf("LOCAL memory content should be stored verbatim: %v", err) } -} \ No newline at end of file +} +// ---------- MemoriesHandler: Update (PATCH) ---------- +// +// Pin the full Update flow: namespace-only edit, content edit (LOCAL), +// content edit (GLOBAL with audit + delimiter escape), no-op edit, and +// the 400 / 404 paths. Matches the security pipeline of Commit so an +// edit can't become a back-door past the policies a write enforces. + +func TestMemoriesUpdate_NamespaceOnly_Success(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewMemoriesHandler() + + mock.ExpectQuery("SELECT scope, content, namespace"). + WithArgs("mem-1", "ws-1"). + WillReturnRows(sqlmock.NewRows([]string{"scope", "content", "namespace"}). + AddRow("LOCAL", "old content", "general")) + mock.ExpectExec("UPDATE agent_memories"). + WithArgs("old content", "facts", "mem-1", "ws-1"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-1"}, {Key: "memoryId", Value: "mem-1"}} + c.Request = httptest.NewRequest("PATCH", "/", bytes.NewBufferString(`{"namespace":"facts"}`)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Update(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + var resp map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &resp) + if resp["namespace"] != "facts" { + t.Errorf("expected namespace=facts, got %v", resp["namespace"]) + } + if resp["changed"] != true { + t.Errorf("expected changed=true, got %v", resp["changed"]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("sqlmock unmet: %v", err) + } +} + +func TestMemoriesUpdate_ContentOnly_Local(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewMemoriesHandler() + + mock.ExpectQuery("SELECT scope, content, namespace"). + WithArgs("mem-1", "ws-1"). + WillReturnRows(sqlmock.NewRows([]string{"scope", "content", "namespace"}). + AddRow("LOCAL", "old", "general")) + mock.ExpectExec("UPDATE agent_memories"). + WithArgs("new content", "general", "mem-1", "ws-1"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-1"}, {Key: "memoryId", Value: "mem-1"}} + c.Request = httptest.NewRequest("PATCH", "/", bytes.NewBufferString(`{"content":"new content"}`)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Update(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("sqlmock unmet: %v", err) + } +} + +// GLOBAL content-edit must (a) escape the [MEMORY prefix to prevent +// delimiter-spoofing on read-back and (b) write an audit row mirroring +// Commit's #767 pattern. This pins both behaviors in one assertion so a +// future refactor that drops either trips the test. +func TestMemoriesUpdate_ContentEdit_Global_AuditAndEscape(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewMemoriesHandler() + + mock.ExpectQuery("SELECT scope, content, namespace"). + WithArgs("mem-g", "root-ws"). + WillReturnRows(sqlmock.NewRows([]string{"scope", "content", "namespace"}). + AddRow("GLOBAL", "old global", "general")) + // New content's [MEMORY prefix becomes [_MEMORY before the UPDATE. + mock.ExpectExec("UPDATE agent_memories"). + WithArgs("[_MEMORY id=fake]: poison", "general", "mem-g", "root-ws"). + WillReturnResult(sqlmock.NewResult(0, 1)) + // Audit row write for the GLOBAL edit. + mock.ExpectExec("INSERT INTO activity_logs"). + WithArgs("root-ws", "memory_edit_global", "root-ws", sqlmock.AnyArg(), sqlmock.AnyArg(), "ok"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "root-ws"}, {Key: "memoryId", Value: "mem-g"}} + c.Request = httptest.NewRequest("PATCH", "/", + bytes.NewBufferString(`{"content":"[MEMORY id=fake]: poison"}`)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Update(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("sqlmock unmet (escape + audit must both fire): %v", err) + } +} + +// Empty body and content-emptied-to-blank both 400. Without these, a +// buggy client could think the call succeeded while nothing changed +// (empty body) or that an empty-string scrub was acceptable. Returning +// 400 forces the client to make its intent explicit. +func TestMemoriesUpdate_EmptyBody_400(t *testing.T) { + setupTestDB(t) + setupTestRedis(t) + handler := NewMemoriesHandler() + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-1"}, {Key: "memoryId", Value: "mem-1"}} + c.Request = httptest.NewRequest("PATCH", "/", bytes.NewBufferString(`{}`)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Update(c) + + if w.Code != http.StatusBadRequest { + t.Fatalf("expected 400 on empty body, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestMemoriesUpdate_EmptyContent_400(t *testing.T) { + setupTestDB(t) + setupTestRedis(t) + handler := NewMemoriesHandler() + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-1"}, {Key: "memoryId", Value: "mem-1"}} + c.Request = httptest.NewRequest("PATCH", "/", bytes.NewBufferString(`{"content":""}`)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Update(c) + + if w.Code != http.StatusBadRequest { + t.Fatalf("expected 400 on empty content, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestMemoriesUpdate_NotFound_404(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewMemoriesHandler() + + // Existence + ownership lookup returns no row → 404. Same shape + // for "memory belongs to a different workspace" — both surface as + // 404 to avoid leaking row existence across workspaces. + mock.ExpectQuery("SELECT scope, content, namespace"). + WithArgs("mem-x", "ws-1"). + WillReturnError(sql.ErrNoRows) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-1"}, {Key: "memoryId", Value: "mem-x"}} + c.Request = httptest.NewRequest("PATCH", "/", + bytes.NewBufferString(`{"namespace":"facts"}`)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Update(c) + + if w.Code != http.StatusNotFound { + t.Fatalf("expected 404, got %d: %s", w.Code, w.Body.String()) + } +} + +// Caller passes content + namespace identical to existing values: +// post-normalisation nothing changed. Return 200 with changed=false, +// no UPDATE, no audit row. Saves a round-trip + an audit-log entry on +// idempotent re-edits (e.g. user clicks Save without changing fields). +func TestMemoriesUpdate_NoOp_NoUpdate(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewMemoriesHandler() + + mock.ExpectQuery("SELECT scope, content, namespace"). + WithArgs("mem-1", "ws-1"). + WillReturnRows(sqlmock.NewRows([]string{"scope", "content", "namespace"}). + AddRow("LOCAL", "same", "general")) + // No UPDATE expectation — sqlmock will fail ExpectationsWereMet + // if one fires. + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-1"}, {Key: "memoryId", Value: "mem-1"}} + c.Request = httptest.NewRequest("PATCH", "/", + bytes.NewBufferString(`{"content":"same","namespace":"general"}`)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Update(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200 on no-op, got %d: %s", w.Code, w.Body.String()) + } + var resp map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &resp) + if resp["changed"] != false { + t.Errorf("expected changed=false on no-op, got %v", resp["changed"]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("UPDATE must not fire on no-op: %v", err) + } +} diff --git a/workspace-server/internal/router/router.go b/workspace-server/internal/router/router.go index bd542e56..1afff092 100644 --- a/workspace-server/internal/router/router.go +++ b/workspace-server/internal/router/router.go @@ -217,6 +217,7 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi wsAuth.POST("/memories", memsh.Commit) wsAuth.GET("/memories", memsh.Search) wsAuth.DELETE("/memories/:memoryId", memsh.Delete) + wsAuth.PATCH("/memories/:memoryId", memsh.Update) // Approvals apph := handlers.NewApprovalsHandler(broadcaster) From 56149f8a24da2aa858df9de0f5c8dae9f7fb99b5 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 21:08:08 -0700 Subject: [PATCH 14/28] fix(bundle): markFailed sets last_sample_error + AST gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the bug class surfaced by Canvas E2E #2632: a workspace ends up status='failed' with last_sample_error=NULL, and operators (or the E2E poll loop) see the useless "Workspace failed: (no last_sample_error)" with no triage signal. Two pieces: 1. **bundle/importer.go markFailed** — the UPDATE was setting only status, leaving last_sample_error NULL. Same incident class as the silent-drop bugs in PRs #2811 + #2824, different code path. markProvisionFailed in workspace_provision_shared.go has set the message column for a long time; this writer drifted the convention. Fix: include last_sample_error in the SET clause + the broadcast. 2. **AST drift gate** (db/workspace_status_failed_message_drift_test.go) — Go AST walk that finds every db.DB.{Exec,Query,QueryRow}Context call whose argument list binds models.StatusFailed and asserts the SQL literal contains last_sample_error. Catches the next caller that drifts the same convention. Verified to FAIL against the bug shape (reverted importer.go temporarily — gate flagged the exact line) and PASS against the fix. Why an AST gate vs a regex: pre-fix attempt with a regex over UPDATE statements flagged status='online' / status='hibernating' / status= 'removed' UPDATEs as false positives. Walking the AST and only flagging calls that pass the StatusFailed constant eliminates that. Out of scope (filed separately if needed): - The Canvas E2E that surfaced the missing message (#2632) is now a required check on staging via PR #2827. Once this fix lands the next staging push should re-run #2632's failing case and produce a meaningful last_sample_error. Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace-server/internal/bundle/importer.go | 14 +- ...kspace_status_failed_message_drift_test.go | 175 ++++++++++++++++++ 2 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 workspace-server/internal/db/workspace_status_failed_message_drift_test.go diff --git a/workspace-server/internal/bundle/importer.go b/workspace-server/internal/bundle/importer.go index e12dfda8..43ec618c 100644 --- a/workspace-server/internal/bundle/importer.go +++ b/workspace-server/internal/bundle/importer.go @@ -131,11 +131,19 @@ func buildBundleConfigFiles(b *Bundle) map[string][]byte { } func markFailed(ctx context.Context, wsID string, broadcaster *events.Broadcaster, err error) { + // Set last_sample_error along with status so operators (and the + // Canvas E2E + GET /workspaces/:id callers) get a non-null reason + // in the row. Pre-2026-05-05 this UPDATE only set status, leaving + // last_sample_error NULL — Canvas E2E #2632 surfaced the gap with + // `Workspace failed: (no last_sample_error)`. Same UPDATE shape as + // markProvisionFailed in workspace-server/internal/handlers/ + // workspace_provision_shared.go. + msg := err.Error() db.DB.ExecContext(ctx, - `UPDATE workspaces SET status = $1, updated_at = now() WHERE id = $2`, - models.StatusFailed, wsID) + `UPDATE workspaces SET status = $1, last_sample_error = $2, updated_at = now() WHERE id = $3`, + models.StatusFailed, msg, wsID) broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", wsID, map[string]interface{}{ - "error": err.Error(), + "error": msg, }) } diff --git a/workspace-server/internal/db/workspace_status_failed_message_drift_test.go b/workspace-server/internal/db/workspace_status_failed_message_drift_test.go new file mode 100644 index 00000000..872903ad --- /dev/null +++ b/workspace-server/internal/db/workspace_status_failed_message_drift_test.go @@ -0,0 +1,175 @@ +package db_test + +// Static drift gate: every UPDATE that sets status to a "failed" value +// must also set last_sample_error in the same statement. Otherwise the +// row ends up with status='failed' + last_sample_error=NULL — operators +// see "workspace failed" with no reason, and the Canvas E2E reports the +// useless `Workspace failed: (no last_sample_error)` from #2632. +// +// Why a static gate: pre-2026-05-05 we had at least two writers +// (markProvisionFailed in workspace_provision_shared.go set the +// message; bundle/importer.go's markFailed didn't). The provision- +// timeout sweep also sets the message. Code review missed the +// importer drift for ~6 months until the Canvas E2E surfaced it. +// +// Rule: +// - If a Go string literal in this repo contains both +// `UPDATE workspaces` and a clause setting `status` to a value +// resembling "failed" — either via a `$N` placeholder later bound +// to StatusFailed, or via an inline `'failed'` literal — that same +// literal MUST also contain `last_sample_error`. +// - Allowed: an UPDATE that only sets status to a non-failed value +// (online, hibernating, removed, etc.). Those don't need the +// message column, and clearing it would lose forensic context. +// +// Caveats: +// - The test reads source as text. Multi-line UPDATEs split across +// concatenated string fragments will slip past — that's an +// accepted limitation for now; the parameterized-write refactor +// (#2799) will let us replace this textual gate with a typed-call +// gate eventually. +// - "last_sample_error" appearing anywhere in the same literal is +// enough to satisfy the rule. We don't try to verify the column +// receives a non-empty value at runtime — that's the +// parameterized-write refactor's territory too. + +import ( + "fmt" + "go/ast" + "go/parser" + "go/token" + "os" + "path/filepath" + "regexp" + "strings" + "testing" +) + +// TestWorkspaceStatusFailed_MustSetLastSampleError uses Go's AST to find +// every ExecContext call whose argument list includes the +// `models.StatusFailed` constant. For each such call, the SQL literal +// (the second argument) must also contain `last_sample_error`. This +// catches the bug class without false-positive matches on UPDATEs that +// set status to a non-failed value (online/hibernating/removed/etc.) +// because those don't pass StatusFailed as an arg. +func TestWorkspaceStatusFailed_MustSetLastSampleError(t *testing.T) { + root := findRepoRoot(t) + violations := []string{} + + walkErr := filepath.Walk(filepath.Join(root, "workspace-server", "internal"), func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if info.IsDir() { + return nil + } + if filepath.Ext(path) != ".go" { + return nil + } + if strings.HasSuffix(path, "_test.go") { + return nil + } + fset := token.NewFileSet() + f, err := parser.ParseFile(fset, path, nil, parser.SkipObjectResolution) + if err != nil { + return err + } + ast.Inspect(f, func(n ast.Node) bool { + call, ok := n.(*ast.CallExpr) + if !ok { + return true + } + sel, ok := call.Fun.(*ast.SelectorExpr) + if !ok { + return true + } + // Match db.DB.ExecContext / db.DB.QueryContext / db.DB.QueryRowContext + // — the three SQL execution surfaces this codebase uses. + methodName := sel.Sel.Name + if methodName != "ExecContext" && methodName != "QueryContext" && methodName != "QueryRowContext" { + return true + } + // Args: 0=ctx, 1=sql-literal, 2..=bind vars. + if len(call.Args) < 3 { + return true + } + passesStatusFailed := false + for _, a := range call.Args[2:] { + if isStatusFailedRef(a) { + passesStatusFailed = true + break + } + } + if !passesStatusFailed { + return true + } + // SQL literal — usually `*ast.BasicLit` for a single-line + // string or a back-tick string. May also be a const ref. + sqlText := extractStringLit(call.Args[1]) + if sqlText == "" { + // SQL is a name reference, not a literal — can't check. + return true + } + if strings.Contains(sqlText, "last_sample_error") { + return true + } + // Skip non-UPDATE statements that happen to pass StatusFailed + // (e.g. SELECT … WHERE status = $1). The drift target is + // specifically writes that mark the row failed. + if !regexp.MustCompile(`(?i)\bUPDATE\s+workspaces\b`).MatchString(sqlText) { + return true + } + rel, _ := filepath.Rel(root, path) + pos := fset.Position(call.Pos()) + snippet := strings.TrimSpace(sqlText) + if len(snippet) > 120 { + snippet = snippet[:120] + "..." + } + violations = append(violations, + fmt.Sprintf("%s:%d: %s", rel, pos.Line, snippet)) + return true + }) + return nil + }) + if walkErr != nil { + t.Fatalf("walk: %v", walkErr) + } + + if len(violations) > 0 { + t.Errorf("UPDATE workspaces SET status = ... binds models.StatusFailed but the SQL literal does not write last_sample_error — every code path that marks a workspace failed must also write the reason, or operators see `Workspace failed: (no last_sample_error)` (incident: Canvas E2E #2632). Add `, last_sample_error = $N` to the SET clause.\n\nViolations:\n - %s", + strings.Join(violations, "\n - ")) + } +} + +// isStatusFailedRef returns true if expr resolves to models.StatusFailed +// (selector StatusFailed off the models package). Catches both +// `models.StatusFailed` directly and `models.StatusFailed.String()` +// style usages — anything that names the constant. +func isStatusFailedRef(expr ast.Expr) bool { + if sel, ok := expr.(*ast.SelectorExpr); ok { + if sel.Sel.Name == "StatusFailed" { + return true + } + } + return false +} + +// extractStringLit returns the unquoted contents of a string literal +// expression, or "" if expr is not a literal we can read statically +// (e.g. concatenation, function-call argument, named const reference). +func extractStringLit(expr ast.Expr) string { + lit, ok := expr.(*ast.BasicLit) + if !ok || lit.Kind != token.STRING { + return "" + } + val := lit.Value + if len(val) >= 2 { + first, last := val[0], val[len(val)-1] + if (first == '`' && last == '`') || (first == '"' && last == '"') { + return val[1 : len(val)-1] + } + } + return val +} + + From cbe48c22252664a13b62e7e0d7f63342d16a7921 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 21:16:35 -0700 Subject: [PATCH 15/28] feat(canvas/memories): Add + Edit modal for MemoryInspectorPanel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Memory tab was read-only — users could see and Delete entries but the only path to write was leaving canvas. Adds a + Add button (toolbar, next to Refresh) and an Edit button (per-entry, next to Delete) that share one MemoryEditorDialog. Add: POST /workspaces/:id/memories with {content, scope, namespace} Edit: PATCH /workspaces/:id/memories/:id (sibling endpoint #2838) with only fields that changed; no-op edits short-circuit client-side so we don't waste a redactSecrets + re-embed pass Edit mode locks scope (cross-scope moves go through delete + recreate to keep the GLOBAL audit-log + redact pipeline single-purpose). Tests: 6 cases on the dialog covering POST shape, PATCH-only-diff, no-op short-circuit, empty-content guard, save-error keeps modal open, and namespace+content combined PATCH. Existing 27 MemoryInspectorPanel tests still pass with the new prop wiring. Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/components/MemoryEditorDialog.tsx | 261 ++++++++++++++++++ .../src/components/MemoryInspectorPanel.tsx | 96 +++++-- .../__tests__/MemoryEditorDialog.test.tsx | 202 ++++++++++++++ 3 files changed, 539 insertions(+), 20 deletions(-) create mode 100644 canvas/src/components/MemoryEditorDialog.tsx create mode 100644 canvas/src/components/__tests__/MemoryEditorDialog.test.tsx diff --git a/canvas/src/components/MemoryEditorDialog.tsx b/canvas/src/components/MemoryEditorDialog.tsx new file mode 100644 index 00000000..6625412a --- /dev/null +++ b/canvas/src/components/MemoryEditorDialog.tsx @@ -0,0 +1,261 @@ +'use client'; + +import { useEffect, useRef, useState } from "react"; +import { createPortal } from "react-dom"; +import { api } from "@/lib/api"; +import type { MemoryEntry } from "@/components/MemoryInspectorPanel"; + +type Scope = "LOCAL" | "TEAM" | "GLOBAL"; +const SCOPES: Scope[] = ["LOCAL", "TEAM", "GLOBAL"]; + +interface AddProps { + open: boolean; + mode: "add"; + workspaceId: string; + defaultScope: Scope; + defaultNamespace?: string; + entry?: undefined; + onClose: () => void; + onSaved: () => void; +} + +interface EditProps { + open: boolean; + mode: "edit"; + workspaceId: string; + entry: MemoryEntry; + defaultScope?: undefined; + defaultNamespace?: undefined; + onClose: () => void; + onSaved: () => void; +} + +type Props = AddProps | EditProps; + +export function MemoryEditorDialog(props: Props) { + const { open, mode, workspaceId, onClose, onSaved } = props; + const dialogRef = useRef(null); + const [mounted, setMounted] = useState(false); + const [scope, setScope] = useState("LOCAL"); + const [namespace, setNamespace] = useState("general"); + const [content, setContent] = useState(""); + const [saving, setSaving] = useState(false); + const [error, setError] = useState(null); + + useEffect(() => { + setMounted(true); + }, []); + + // Reset form whenever the dialog opens. + useEffect(() => { + if (!open) return; + setError(null); + setSaving(false); + if (mode === "edit" && props.entry) { + setScope(props.entry.scope); + setNamespace(props.entry.namespace || "general"); + setContent(props.entry.content); + } else if (mode === "add") { + setScope(props.defaultScope); + setNamespace(props.defaultNamespace || "general"); + setContent(""); + } + // mode/props are stable per-open; intentional shallow deps. + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [open]); + + // Move focus into the dialog when it opens (WCAG SC 2.4.3). + useEffect(() => { + if (!open || !mounted) return; + const raf = requestAnimationFrame(() => { + dialogRef.current?.querySelector("textarea, input, select")?.focus(); + }); + return () => cancelAnimationFrame(raf); + }, [open, mounted]); + + // Escape closes; Cmd/Ctrl-Enter saves. + const onCloseRef = useRef(onClose); + onCloseRef.current = onClose; + const handleSaveRef = useRef<() => void>(() => {}); + useEffect(() => { + if (!open) return; + const handler = (e: KeyboardEvent) => { + if (e.key === "Escape") { + e.preventDefault(); + onCloseRef.current(); + } else if (e.key === "Enter" && (e.metaKey || e.ctrlKey)) { + e.preventDefault(); + handleSaveRef.current(); + } + }; + window.addEventListener("keydown", handler); + return () => window.removeEventListener("keydown", handler); + }, [open]); + + const handleSave = async () => { + if (saving) return; + const trimmed = content.trim(); + if (!trimmed) { + setError("Content cannot be empty"); + return; + } + setError(null); + setSaving(true); + try { + if (mode === "add") { + await api.post(`/workspaces/${workspaceId}/memories`, { + content: trimmed, + scope, + namespace: namespace.trim() || "general", + }); + } else { + // PATCH only sends fields that changed. Content always changeable; + // namespace only sent if it differs from the original (saves a + // no-op write through redactSecrets + re-embed). + const original = props.entry; + const body: Record = {}; + if (trimmed !== original.content) body.content = trimmed; + const ns = namespace.trim() || "general"; + if (ns !== original.namespace) body.namespace = ns; + if (Object.keys(body).length === 0) { + // No-op edit — close without an HTTP round-trip. + onSaved(); + onClose(); + return; + } + await api.patch( + `/workspaces/${workspaceId}/memories/${encodeURIComponent(original.id)}`, + body, + ); + } + onSaved(); + onClose(); + } catch (e) { + setError(e instanceof Error ? e.message : "Save failed"); + } finally { + setSaving(false); + } + }; + handleSaveRef.current = handleSave; + + if (!open || !mounted) return null; + + const titleId = "memory-editor-title"; + const isEdit = mode === "edit"; + + return createPortal( +
+
+ +
+
+

+ {isEdit ? "Edit memory" : "Add memory"} +

+ + {/* Scope */} +
+ + {isEdit ? ( +
+ {scope} +
+ ) : ( +
+ {SCOPES.map((s) => ( + + ))} +
+ )} +
+ + {/* Namespace */} +
+ + setNamespace(e.target.value)} + placeholder="general" + className="w-full bg-surface border border-line/60 focus:border-accent/60 rounded px-2 py-1.5 text-[12px] text-ink placeholder-zinc-600 focus:outline-none transition-colors" + /> +
+ + {/* Content */} +
+ +