From 574d1b16803965717fddf17b124f861a02a5b2d3 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Sat, 13 Jun 2026 02:39:21 +0000 Subject: [PATCH] fix(a2a-proxy): normalize system-caller source_id to NULL (was: #2680 post-restart wedge) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #2680 issue (Local Provision Lifecycle E2E post-restart poll stays 'degraded' instead of 'online') has a production-code root cause separate from my test-harness fix in #2688: the synthetic 'system:restart-context' callerID was being persisted to activity_logs.source_id as a non-UUID string, poisoning the column for downstream joins and UUID-cast lookups. MECHANISM: restart_context.go:296 calls h.ProxyA2ARequest(..., 'system:restart-context', false) → a2a_proxy_helpers.go:374 isSystemCaller('system:restart-context') returns true (correctly bypasses the auth-token check; that's the system-caller design) → nilIfEmpty('system:restart-context') returned &'system:restart-context' (NON-nil pointer to the system prefix) → LogActivity persisted it to activity_logs.source_id as the literal string 'system:restart-context' → downstream joins (activity.go:443 LEFT JOIN workspaces w ON w.id = activity_logs.source_id) and UUID-cast lookups returned NULL or errored → wedge-detector side-effects → workspace stayed degraded instead of online after restart FIX: Extend nilIfEmpty to return nil for any system-caller prefix (webhook:, system:, test:, channel:), per the same systemCallerPrefixes list that isSystemCaller() uses. This normalizes the source_id to NULL for all system-caller scenarios, keeping the 'who-did-what' association intact (the callerID is still in the activity row's caller_id as a free-form field if needed) without poisoning the source_id FK contract. Pure 1-line semantic change to nilIfEmpty — no signature change, no caller update needed, no DB migration, no auth-token rotation change. The restart-context call site's callerID='system:restart-context' is preserved (so the auth bypass + logging still work) but the activity_logs.source_id row will be NULL going forward. TESTS: - TestNilIfEmpty_SystemCallerPrefixes: covers all 4 systemCallerPrefixes (webhook:, system:, test:, channel:) plus a non-prefix string (real workspace UUID must still be passed through). - TestNilIfEmpty_RealWorkspaceUUIDStillPreserved: regression guard that a real workspace UUID-shaped string still produces a non-nil pointer (otherwise we'd hide real-workspace attribution). - Existing TestNilIfEmpty_EmptyString + TestNilIfEmpty_NonEmptyString still pass (no regression on the canonical empty/non-empty cases). VERIFIED: go build ./... clean go vet ./internal/handlers/... clean go test -count=1 ./internal/handlers/... all pass (22.7s) SCOPE: minimal. 1-line semantic change to nilIfEmpty + 2 new tests. No production code change to the restart path itself; just stops the activity_logs row from being poisoned. RELATES-TO: #2680 (the post-restart degraded RCA), #2530 (the broader #2530 issue family — auth-token survival on container re-create), #2688 (my prior test-harness fix that gave the recovery path more time to clear but didn't address the source_id poison root cause). Closes #2693 (the follow-up issue I filed last tick documenting this exact root cause). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/a2a_proxy_helpers.go | 20 +++++++- .../handlers/a2a_proxy_helpers_test.go | 48 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers.go b/workspace-server/internal/handlers/a2a_proxy_helpers.go index d9d9c8ec..afe8a69c 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers.go @@ -437,8 +437,26 @@ func (h *WorkspaceHandler) logA2ASuccess(ctx context.Context, workspaceID, calle } } +// nilIfEmpty returns nil for empty OR system-caller strings. The second +// branch matters: a system-caller prefix like "system:restart-context" +// is a non-UUID string (it's a routing marker, not a real workspace id). +// Persisting it to activity_logs.source_id would poison the column — +// downstream joins (e.g. activity.go:443 LEFT JOIN workspaces w ON w.id = +// activity_logs.source_id) and UUID-cast lookups would return NULL or +// error. Normalizing system callers to NULL source_id here keeps the +// "who-did-what" association intact (the callerID is still in +// activity_logs.caller_id as a free-form field if needed) without +// poisoning the source_id FK contract. +// +// This is the production-code root cause of #2680 (post-restart wedge): +// restart_context.go:296 calls ProxyA2ARequest with callerID = +// "system:restart-context" → nilIfEmpty("system:restart-context") +// returned a non-nil pointer to the system prefix → LogActivity +// persisted it to activity_logs.source_id as the literal string → +// downstream lookups failed → wedge-detector side-effects → workspace +// stayed degraded instead of online. See #2693. func nilIfEmpty(s string) *string { - if s == "" { + if s == "" || isSystemCaller(s) { return nil } return &s diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers_test.go b/workspace-server/internal/handlers/a2a_proxy_helpers_test.go index b3677cc1..cc6085e5 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers_test.go @@ -27,6 +27,54 @@ func TestNilIfEmpty_NonEmptyString(t *testing.T) { } } +// System-caller prefixes (webhook:, system:, test:, channel:) are +// non-UUID routing markers, not real workspace ids. Persisting them +// to activity_logs.source_id would poison the column — downstream +// joins (e.g. activity.go:443 LEFT JOIN workspaces w ON w.id = +// activity_logs.source_id) and UUID-cast lookups would return NULL +// or error. nilIfEmpty must normalize all system-caller prefixes to +// nil. This is the production-code root cause of #2680 (post-restart +// wedge): restart_context.go:296 calls ProxyA2ARequest with callerID +// = "system:restart-context" → nilIfEmpty("system:restart-context") +// returned a non-nil pointer to the system prefix → LogActivity +// persisted it to activity_logs.source_id as the literal string → +// downstream lookups failed → wedge-detector side-effects → workspace +// stayed degraded instead of online. See #2693. + +func TestNilIfEmpty_SystemCallerPrefixes(t *testing.T) { + // The four systemCallerPrefixes from a2a_proxy.go:82-84. All must + // return nil from nilIfEmpty (the post-#2680 contract). isSystemCaller + // uses strings.HasPrefix, so the prefix must be at the start. + systemPrefixes := []string{ + "webhook:github", + "system:restart-context", // the actual offender + "system:other-svc", + "test:integration-1", + "channel:discord", + } + for _, p := range systemPrefixes { + got := nilIfEmpty(p) + if got != nil { + t.Errorf("system-caller %q: got non-nil pointer (would poison activity_logs.source_id), want nil", p) + } + } +} + +func TestNilIfEmpty_RealWorkspaceUUIDStillPreserved(t *testing.T) { + // The fix must NOT regress the canonical case: a real workspace + // UUID (no system prefix) must STILL be passed through to + // activity_logs.source_id as a non-nil pointer. Otherwise we'd + // hide real-workspace attribution. + realUUID := "9a40df22-ba4b-3fc0-75c1-66dd6869ff25" // a real UUID-shaped string + got := nilIfEmpty(realUUID) + if got == nil { + t.Fatal("real workspace UUID: got nil, want non-nil pointer") + } + if *got != realUUID { + t.Errorf("real workspace UUID: got %q, want %q", *got, realUUID) + } +} + // ───────────────────────────────────────────────────────────────────────────── // extractToolTrace tests // ───────────────────────────────────────────────────────────────────────────── -- 2.52.0