fix(a2a-proxy): normalize system-caller source_id to NULL (closes #2693, was: #2680 post-restart wedge) #2694

Closed
agent-dev-b wants to merge 1 commit from fix/restart-context-2530-callerid-normalize into main
2 changed files with 67 additions and 1 deletion
@@ -437,8 +437,26 @@ func (h *WorkspaceHandler) logA2ASuccess(ctx context.Context, workspaceID, calle
}
}
// nilIfEmpty returns nil for empty OR system-caller strings. The second
// branch matters: a system-caller prefix like "system:restart-context"
// is a non-UUID string (it's a routing marker, not a real workspace id).
// Persisting it to activity_logs.source_id would poison the column —
// downstream joins (e.g. activity.go:443 LEFT JOIN workspaces w ON w.id =
// activity_logs.source_id) and UUID-cast lookups would return NULL or
// error. Normalizing system callers to NULL source_id here keeps the
// "who-did-what" association intact (the callerID is still in
// activity_logs.caller_id as a free-form field if needed) without
// poisoning the source_id FK contract.
//
// This is the production-code root cause of #2680 (post-restart wedge):
// restart_context.go:296 calls ProxyA2ARequest with callerID =
// "system:restart-context" → nilIfEmpty("system:restart-context")
// returned a non-nil pointer to the system prefix → LogActivity
// persisted it to activity_logs.source_id as the literal string →
// downstream lookups failed → wedge-detector side-effects → workspace
// stayed degraded instead of online. See #2693.
func nilIfEmpty(s string) *string {
if s == "" {
if s == "" || isSystemCaller(s) {
return nil
}
return &s
@@ -27,6 +27,54 @@ func TestNilIfEmpty_NonEmptyString(t *testing.T) {
}
}
// System-caller prefixes (webhook:, system:, test:, channel:) are
// non-UUID routing markers, not real workspace ids. Persisting them
// to activity_logs.source_id would poison the column — downstream
// joins (e.g. activity.go:443 LEFT JOIN workspaces w ON w.id =
// activity_logs.source_id) and UUID-cast lookups would return NULL
// or error. nilIfEmpty must normalize all system-caller prefixes to
// nil. This is the production-code root cause of #2680 (post-restart
// wedge): restart_context.go:296 calls ProxyA2ARequest with callerID
// = "system:restart-context" → nilIfEmpty("system:restart-context")
// returned a non-nil pointer to the system prefix → LogActivity
// persisted it to activity_logs.source_id as the literal string →
// downstream lookups failed → wedge-detector side-effects → workspace
// stayed degraded instead of online. See #2693.
func TestNilIfEmpty_SystemCallerPrefixes(t *testing.T) {
	// Sample caller IDs covering each of the four systemCallerPrefixes
	// (a2a_proxy.go:82-84): webhook:, system:, test:, channel:. Every one
	// of them must come back from nilIfEmpty as nil (the post-#2680
	// contract). isSystemCaller matches with strings.HasPrefix, so each
	// sample carries its marker at the very start of the string.
	callerIDs := []string{
		"webhook:github",
		"system:restart-context", // the caller that triggered #2680
		"system:other-svc",
		"test:integration-1",
		"channel:discord",
	}

	for i := range callerIDs {
		id := callerIDs[i]
		if nilIfEmpty(id) == nil {
			continue
		}
		t.Errorf("system-caller %q: got non-nil pointer (would poison activity_logs.source_id), want nil", id)
	}
}
func TestNilIfEmpty_RealWorkspaceUUIDStillPreserved(t *testing.T) {
	// Regression guard for the canonical case: a UUID-shaped workspace id
	// with no system prefix must still reach activity_logs.source_id as a
	// non-nil pointer holding the same value. Returning nil here would
	// hide real-workspace attribution.
	const workspaceID = "9a40df22-ba4b-3fc0-75c1-66dd6869ff25" // UUID-shaped fixture

	ptr := nilIfEmpty(workspaceID)
	switch {
	case ptr == nil:
		// t.Fatal stops the test, so the dereference below is never
		// reached with a nil pointer.
		t.Fatal("real workspace UUID: got nil, want non-nil pointer")
	case *ptr != workspaceID:
		t.Errorf("real workspace UUID: got %q, want %q", *ptr, workspaceID)
	}
}
// ─────────────────────────────────────────────────────────────────────────────
// extractToolTrace tests
// ─────────────────────────────────────────────────────────────────────────────