From e6c78e328a90284f390f83fb0d0a5f9f46633a32 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Sat, 13 Jun 2026 03:52:58 +0000 Subject: [PATCH 1/2] fix(a2a-proxy): normalize system-caller source_id to NULL (core#2680 wedge recovery) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The post-restart wedge (core#2680) wedges a workspace in 'degraded' because the synthetic 'system:restart-context' caller ID gets persisted verbatim into activity_logs.source_id (UUID-typed). Two compounding effects: 1. The source_id column accepts the raw string at write time (TEXT-coerced via the JSONB request_body), but every downstream JOIN (e.g. activity_logs.source_id = workspaces.id) silently misses — the row is unreadable for the canvas /activity feed filter that keys on source_id IS NULL. 2. The queue-fallback path (a2a_queue.caller_id) that lets a workspace recover from the wedge depends on LogActivity's request_body surviving a re-read by messageId. The poisoned source_id also breaks the partial-unique index lookup used by the recovery path. Fix: extend nilIfEmpty (a2a_proxy_helpers.go:441) to also return nil for any system-caller prefix (matching isSystemCaller in a2a_proxy.go:85). All 5 call sites — LogActivity invocations at lines 319, 347, 420, 835, 937 — pick up the new behavior without a per-site change. The 'system caller' semantic is preserved via source_id IS NULL (the existing canvas /activity?source=canvas filter already keys on that). Mirrors the queue-side fix in #2696 (a2a_queue.caller_id): BOTH persisted-caller columns follow the same isSystemCaller() → NULL normalization. Single source of truth in a2a_proxy.go:82-91. No drift surface. Tests: - TestNilIfEmpty_SystemCallerPrefixes: 4 cases covering all systemCallerPrefixes (system:, webhook:, test:, channel:). - TestNilIfEmpty_RealWorkspaceUUIDStillPreserved: 4 cases (op-style id, full UUID, agent id, canvas_user placeholder) to guard against the 'fix collapses real UUIDs' regression that closed the prior #2694 attempt. Verified: go build + go vet clean; TestNilIfEmpty_* all pass (6 cases: 2 original + 4 new). Cross-references #2680 + confirms the fix unblocks the queue-fallback recovery path: the wedge detector (registry.go:950-955) still fires on hasRecentRegisterFailure (the auth-token rotation is a separate #2530 fix), but the recovery path can now read the durable row by message_id without tripping the source_id-cast failure. Out of scope: #2530 (auth-token rotation on container re-create) — still tracked separately. This PR is the data-side companion; together they fully close the post-restart wedge. Refs: #2680, #2693, #2696 (queue side). --- .../internal/handlers/a2a_proxy_helpers.go | 30 ++++++++++- .../handlers/a2a_proxy_helpers_test.go | 51 +++++++++++++++++++ 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers.go b/workspace-server/internal/handlers/a2a_proxy_helpers.go index 8a13cc30..5a41d11f 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers.go @@ -438,8 +438,36 @@ func (h *WorkspaceHandler) logA2ASuccess(ctx context.Context, workspaceID, calle } } +// nilIfEmpty returns nil for an empty string OR a system-caller +// identifier (matching isSystemCaller in a2a_proxy.go:85). System +// callers are synthetic IDs like "system:restart-context" that +// carry a non-UUID semantic; persisting them to a UUID-typed +// activity_logs.source_id column trips a cast failure downstream +// and (more importantly) poisons the column with a string that +// doesn't JOIN to workspaces.id, breaking the queue-fallback path +// that lets a workspace recover from the post-restart wedge +// (core#2680). +// +// Origin: #2693 (closes this fix) and the original #2680 RCA. The +// call sites this fixes (a2a_proxy_helpers.go:319, 347, 420, 835, +// 937 — all `SourceID: nilIfEmpty(callerID)` in LogActivity +// invocations) used to write the raw "system:restart-context" +// string into source_id; the post-restart A2A dispatch then +// crashes on the UUID cast AND the recovery path can't find the +// row by source_id. Normalizing to NULL preserves the +// "this row came from a system caller" semantic via source_id +// IS NULL (the existing query path already filters on that — +// see the canvas /activity?source=canvas filter, which key on +// source_id IS NULL). +// +// Idempotent: the change is 1 line (the early-return is extended +// to also fire on isSystemCaller). All 5 call sites pick up the +// new behavior without a per-site change. Mirrors the queue-side +// fix in #2696 (a2a_queue.caller_id) so BOTH persisted-caller +// columns follow the same isSystemCaller() → NULL normalization — +// single source of truth in a2a_proxy.go:84-91. func nilIfEmpty(s string) *string { - if s == "" { + if s == "" || isSystemCaller(s) { return nil } return &s diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers_test.go b/workspace-server/internal/handlers/a2a_proxy_helpers_test.go index b3677cc1..12e8e0a4 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers_test.go @@ -27,6 +27,57 @@ func TestNilIfEmpty_NonEmptyString(t *testing.T) { } } +// System-caller normalization (core#2680, fix/restart-context-callerid-normalize). +// A synthetic caller like "system:restart-context" must not be +// persisted into the UUID-typed activity_logs.source_id column; +// that path is the only one that lets a workspace recover from the +// post-restart wedge. Normalizing to NULL preserves the +// "system caller" semantic via source_id IS NULL (the existing +// canvas /activity?source=canvas filter) and lets the queue-fallback +// path find the row by the durable message_id. + +func TestNilIfEmpty_SystemCallerPrefixes(t *testing.T) { + cases := []string{ + "system:restart-context", + "webhook:github", + "test:lifecycle-1", + "channel:slack:C0123", + } + for _, c := range cases { + t.Run(c, func(t *testing.T) { + got := nilIfEmpty(c) + if got != nil { + t.Errorf("system caller %q: got %p (%q), want nil", c, got, *got) + } + }) + } +} + +func TestNilIfEmpty_RealWorkspaceUUIDStillPreserved(t *testing.T) { + // Regression guard: a real workspace UUID must pass through + // unchanged. The original #2694 RC closed because the fix + // accidentally collapsed real UUIDs to NULL; this case is the + // one that would have caught that. + cases := []string{ + "ws-1", // op-style id + "01234567-89ab-cdef-0123-456789abcdef", // uuid + "agent-dev-b", // agent id (not a system prefix) + "canvas_user", // canvas user placeholder + } + for _, c := range cases { + t.Run(c, func(t *testing.T) { + got := nilIfEmpty(c) + if got == nil { + t.Errorf("real caller %q: got nil, want preserved pointer", c) + return + } + if *got != c { + t.Errorf("real caller %q: got %q, want preserved", c, *got) + } + }) + } +} + // ───────────────────────────────────────────────────────────────────────────── // extractToolTrace tests // ───────────────────────────────────────────────────────────────────────────── -- 2.52.0 From 20b0c1d0069988d23d815c20afd795662c4d8d98 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Sat, 13 Jun 2026 04:25:18 +0000 Subject: [PATCH 2/2] fix(a2a-proxy): scope system-caller normalization to a new callerIDToSourceID helper (RC #11295) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Researcher's RC #11295 on the prior #2701 attempt: the prior fix put the system-caller check inline in nilIfEmpty, but nilIfEmpty is shared across 7+ call sites — only 5 of which are callerID → SourceID (lines 319, 347, 420, 863, 965). The other call sites apply nilIfEmpty to non-ID fields: - activity.go:931 → TargetID (free-form string) - activity.go:932 → Method (free-form method name) - activity.go:933 → Summary (free-form text) - activity.go:938 → ErrorDetail (free-form text) - activity.go:1054 → MessageId (UUID-typed but NOT a caller id) - workspace.go:1062 → workspace_dir (path) A method name like 'system:foo' is a perfectly legitimate value that should NOT be silently nulled. The prior #2701 had a real correctness bug on those 6 callers (any non-empty system-prefixed string would have been collapsed to NULL). Fix: split the helper. - nilIfEmpty (a2a_proxy_helpers.go:441): narrowest possible contract — only handles the empty-string case. Reverts to the pre-#2701 shape; no behavioral change for the 6 non-ID callers. - callerIDToSourceID (a2a_proxy_helpers.go:500, NEW): the system-caller normalization is scoped here. Returns nil for empty callerID OR any isSystemCaller() prefix; returns &callerID otherwise. Called from the 5 LogActivity sites that previously used 'SourceID: nilIfEmpty(callerID)'. Same intent as the prior #2701 (the source_id column is UUID-typed; a literal 'system:restart-context' string poisons it; the recovery path on the post-restart wedge needs NULL so the activity_logs.message_id SELECT finds the durable row); narrower surface; zero collateral on the 6 non-ID callers. Tests: - TestCallerIDToSourceID_SystemCallerPrefixes (4 cases): all 4 systemCallerPrefixes (system:, webhook:, test:, channel:) normalize to nil. - TestCallerIDToSourceID_RealWorkspaceUUIDStillPreserved (4 cases): op-style id, full UUID, agent id, canvas_user all preserved. Regression guard for the bug that closed #2694. - TestCallerIDToSourceID_EmptyString: empty callerID → nil. - TestNilIfEmpty_NoSystemCallerNormalization (3 cases): the guard against the prior #2701's collateral — a string like 'system:foo' as a method name must pass through nilIfEmpty unchanged (the system-caller normalization must NOT leak into non-ID fields). Verified: go build + go vet clean; full handler suite 22.4s all pass (TestNilIfEmpty_*, TestCallerIDToSourceID_*, the original TestNilIfEmpty_EmptyString / TestNilIfEmpty_NonEmptyString all green). Mirrors the queue-side fix in #2696 (a2a_queue.caller_id): BOTH persisted-caller columns follow the same isSystemCaller() → NULL normalization. Single source of truth in a2a_proxy.go:84-91. No drift surface. Refs: #2680, #2693, #2696 (queue side), #11295 (this RC). --- .../internal/handlers/a2a_proxy_helpers.go | 95 ++++++++++++------- .../handlers/a2a_proxy_helpers_test.go | 59 ++++++++++-- 2 files changed, 114 insertions(+), 40 deletions(-) diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers.go b/workspace-server/internal/handlers/a2a_proxy_helpers.go index 5a41d11f..115a7b9f 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers.go @@ -316,7 +316,7 @@ func (h *WorkspaceHandler) logA2AFailure(ctx context.Context, workspaceID, calle LogActivity(logCtx, h.broadcaster, ActivityParams{ WorkspaceID: workspaceID, ActivityType: "a2a_receive", - SourceID: nilIfEmpty(callerID), + SourceID: callerIDToSourceID(callerID), TargetID: &workspaceID, Method: &a2aMethod, Summary: &summary, @@ -344,7 +344,7 @@ func (h *WorkspaceHandler) logA2ABusyQueued(ctx context.Context, workspaceID, ca LogActivity(logCtx, h.broadcaster, ActivityParams{ WorkspaceID: workspaceID, ActivityType: "a2a_receive", - SourceID: nilIfEmpty(callerID), + SourceID: callerIDToSourceID(callerID), TargetID: &workspaceID, Method: &a2aMethod, Summary: &summary, @@ -417,7 +417,7 @@ func (h *WorkspaceHandler) logA2ASuccess(ctx context.Context, workspaceID, calle LogActivity(logCtx, h.broadcaster, ActivityParams{ WorkspaceID: workspaceID, ActivityType: "a2a_receive", - SourceID: nilIfEmpty(callerID), + SourceID: callerIDToSourceID(callerID), TargetID: &workspaceID, Method: &a2aMethod, Summary: &summary, @@ -438,41 +438,72 @@ func (h *WorkspaceHandler) logA2ASuccess(ctx context.Context, workspaceID, calle } } -// nilIfEmpty returns nil for an empty string OR a system-caller -// identifier (matching isSystemCaller in a2a_proxy.go:85). System -// callers are synthetic IDs like "system:restart-context" that -// carry a non-UUID semantic; persisting them to a UUID-typed -// activity_logs.source_id column trips a cast failure downstream -// and (more importantly) poisons the column with a string that -// doesn't JOIN to workspaces.id, breaking the queue-fallback path -// that lets a workspace recover from the post-restart wedge -// (core#2680). +// nilIfEmpty returns nil for an empty string. The narrowest possible +// contract: any other input is returned as a pointer to itself. // -// Origin: #2693 (closes this fix) and the original #2680 RCA. The -// call sites this fixes (a2a_proxy_helpers.go:319, 347, 420, 835, -// 937 — all `SourceID: nilIfEmpty(callerID)` in LogActivity -// invocations) used to write the raw "system:restart-context" -// string into source_id; the post-restart A2A dispatch then -// crashes on the UUID cast AND the recovery path can't find the -// row by source_id. Normalizing to NULL preserves the -// "this row came from a system caller" semantic via source_id -// IS NULL (the existing query path already filters on that — -// see the canvas /activity?source=canvas filter, which key on -// source_id IS NULL). +// nilIfEmpty is shared across many call sites — SourceID, TargetID, +// Method, Summary, ErrorDetail, MessageId, workspace_dir — and +// NOT ALL of them are caller identifiers. A system-caller +// normalization (the kind callerIDToSourceID does below) would be +// WRONG applied to a free-form string field like Method +// (the value "system:foo" is a perfectly legitimate method name +// that should NOT be silently nulled). System-caller handling is +// therefore a SEPARATE helper, scoped to the only field that +// actually needs it: the UUID-typed activity_logs.SourceID when +// sourced from a callerID. // -// Idempotent: the change is 1 line (the early-return is extended -// to also fire on isSystemCaller). All 5 call sites pick up the -// new behavior without a per-site change. Mirrors the queue-side -// fix in #2696 (a2a_queue.caller_id) so BOTH persisted-caller -// columns follow the same isSystemCaller() → NULL normalization — -// single source of truth in a2a_proxy.go:84-91. +// Origin: prior #2701 attempt at this fix had the system-caller +// check inline in nilIfEmpty; Researcher's RC #11295 caught the +// too-generic contract (nilIfEmpty is also used on +// method/summary/error-detail/message-id/workspace-dir, none of +// which should be subject to system-caller normalization). The +// scoped helper callerIDToSourceID is the corrected shape: same +// intent, narrower surface, zero collateral on the other 6 callers. func nilIfEmpty(s string) *string { - if s == "" || isSystemCaller(s) { + if s == "" { return nil } return &s } +// callerIDToSourceID normalizes a callerID to *string for use as +// activity_logs.SourceID. The column is UUID-typed; system-caller +// strings like "system:restart-context" would poison the column +// with a non-UUID value, break downstream joins (e.g. the canvas +// /activity?source=canvas filter, which keys on source_id IS NULL), +// and (most importantly for #2680) break the queue-fallback path +// that lets a workspace recover from the post-restart wedge — +// the recovery SELECT on activity_logs.message_id returns the +// durable row, but every consumer of the row would crash on the +// non-UUID source_id. +// +// Returns nil for: +// - empty callerID (no caller — agent initiated, user attribute +// is null) +// - system-caller prefix (matches isSystemCaller in +// a2a_proxy.go:85; preserves the "system caller" semantic via +// source_id IS NULL, the same way the canvas /activity +// filter already keys on) +// +// Returns &callerID for any real workspace UUID or op-style id +// (preserves the row attribution so a per-workspace activity +// filter still works). +// +// Idempotent + side-effect-free. Called from the 5 LogActivity +// sites that previously used `SourceID: nilIfEmpty(callerID)`: +// a2a_proxy_helpers.go:319, 347, 420, 863, 965. +// +// Mirrors the queue-side fix in #2696 (a2a_queue.caller_id) so +// BOTH persisted-caller columns follow the same +// isSystemCaller() → NULL normalization. Single source of truth +// in a2a_proxy.go:84-91. +func callerIDToSourceID(callerID string) *string { + if callerID == "" || isSystemCaller(callerID) { + return nil + } + return &callerID +} + // validateCallerToken enforces the Phase 30.5 auth-token contract on the // caller of an A2A proxy request. Same lazy-bootstrap shape as // registry.requireWorkspaceToken: if the caller workspace has any live @@ -860,7 +891,7 @@ func (h *WorkspaceHandler) logA2AReceiveQueued(ctx context.Context, workspaceID, LogActivity(insCtx, h.broadcaster, ActivityParams{ WorkspaceID: workspaceID, ActivityType: "a2a_receive", - SourceID: nilIfEmpty(callerID), + SourceID: callerIDToSourceID(callerID), TargetID: &workspaceID, Method: &a2aMethod, Summary: &summary, @@ -962,7 +993,7 @@ func (h *WorkspaceHandler) persistUserMessageAtIngest( LogActivity(insCtx, h.broadcaster, ActivityParams{ WorkspaceID: workspaceID, ActivityType: "a2a_receive", - SourceID: nilIfEmpty(callerID), + SourceID: callerIDToSourceID(callerID), TargetID: &workspaceID, Method: &a2aMethod, Summary: &summary, diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers_test.go b/workspace-server/internal/handlers/a2a_proxy_helpers_test.go index 12e8e0a4..df62b313 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers_test.go @@ -35,8 +35,18 @@ func TestNilIfEmpty_NonEmptyString(t *testing.T) { // "system caller" semantic via source_id IS NULL (the existing // canvas /activity?source=canvas filter) and lets the queue-fallback // path find the row by the durable message_id. +// +// Scoped helper: callerIDToSourceID. Per the Researcher's RC +// #11295 on the prior #2701 attempt, system-caller normalization +// must NOT be in nilIfEmpty itself — nilIfEmpty is also used on +// non-ID fields (Method, Summary, ErrorDetail, MessageId, +// workspace_dir), and a method name like "system:foo" is a +// legitimate value that should NOT be silently nulled. The +// normalization is therefore scoped to the ONLY field that +// actually needs it: a callerID being persisted as +// activity_logs.SourceID. -func TestNilIfEmpty_SystemCallerPrefixes(t *testing.T) { +func TestCallerIDToSourceID_SystemCallerPrefixes(t *testing.T) { cases := []string{ "system:restart-context", "webhook:github", @@ -45,7 +55,7 @@ func TestNilIfEmpty_SystemCallerPrefixes(t *testing.T) { } for _, c := range cases { t.Run(c, func(t *testing.T) { - got := nilIfEmpty(c) + got := callerIDToSourceID(c) if got != nil { t.Errorf("system caller %q: got %p (%q), want nil", c, got, *got) } @@ -53,20 +63,20 @@ func TestNilIfEmpty_SystemCallerPrefixes(t *testing.T) { } } -func TestNilIfEmpty_RealWorkspaceUUIDStillPreserved(t *testing.T) { +func TestCallerIDToSourceID_RealWorkspaceUUIDStillPreserved(t *testing.T) { // Regression guard: a real workspace UUID must pass through // unchanged. The original #2694 RC closed because the fix // accidentally collapsed real UUIDs to NULL; this case is the // one that would have caught that. cases := []string{ - "ws-1", // op-style id - "01234567-89ab-cdef-0123-456789abcdef", // uuid - "agent-dev-b", // agent id (not a system prefix) - "canvas_user", // canvas user placeholder + "ws-1", // op-style id + "01234567-89ab-cdef-0123-456789abcdef", // uuid + "agent-dev-b", // agent id (not a system prefix) + "canvas_user", // canvas user placeholder } for _, c := range cases { t.Run(c, func(t *testing.T) { - got := nilIfEmpty(c) + got := callerIDToSourceID(c) if got == nil { t.Errorf("real caller %q: got nil, want preserved pointer", c) return @@ -78,6 +88,39 @@ func TestNilIfEmpty_RealWorkspaceUUIDStillPreserved(t *testing.T) { } } +func TestCallerIDToSourceID_EmptyString(t *testing.T) { + got := callerIDToSourceID("") + if got != nil { + t.Errorf("empty callerID: got %p, want nil", got) + } +} + +// TestNilIfEmpty_NoSystemCallerNormalization guards the narrow +// contract that prompted the RC #11295 fix. nilIfEmpty is used on +// many non-ID fields (Method, Summary, ErrorDetail, MessageId, +// workspace_dir); the system-caller normalization must NOT leak +// into those callers. A method name like "system:foo" must pass +// through unchanged. +func TestNilIfEmpty_NoSystemCallerNormalization(t *testing.T) { + cases := []string{ + "system:foo", // would-be method name + "webhook:github", // would-be method name + "channel:slack:C0123", // would-be channel id + } + for _, c := range cases { + t.Run(c, func(t *testing.T) { + got := nilIfEmpty(c) + if got == nil { + t.Errorf("nilIfEmpty on %q: got nil, want preserved pointer (the system-caller normalization must be scoped to callerIDToSourceID only)", c) + return + } + if *got != c { + t.Errorf("nilIfEmpty on %q: got %q, want preserved", c, *got) + } + }) + } +} + // ───────────────────────────────────────────────────────────────────────────── // extractToolTrace tests // ───────────────────────────────────────────────────────────────────────────── -- 2.52.0