2026-05-19 01:56:11 +00:00
2 changed files with 229 additions and 0 deletions
@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"log"
 	"net/http"
+	"regexp"
 	"strconv"
 	"strings"
 	"time"
@@ -18,6 +19,46 @@ import (
 	"github.com/google/uuid"
 )

+// internal#212 — secret-safe scrubber applied to error_detail strings
+// before they cross the canvas WebSocket. Defense in depth: the
+// workspace runtime already runs `_sanitize_for_external` on its side
+// (workspace/executor_helpers.py), but the broadcast layer is the last
+// stop before the string reaches the user's browser, so we re-scrub
+// here in case any caller path forgot.
+//
+// The scrubber is intentionally surgical — it MUST preserve the
+// actionable parts (HTTP status codes, error codes like
+// `oauth_org_not_allowed`, human-readable provider messages) and
+// remove only what looks credential-ish. Over-redacting defeats the
+// whole point of internal#212 (giving the user a reason they can act on).
+
+// Capture (auth-key prefix) (value) so the prefix can be preserved in
+// the output. The keyword anchor prevents false positives on regular
+// text that happens to contain a long alphanumeric run.
+var errorDetailSecretRE = regexp.MustCompile(`(?i)((?:bearer|token|api[_-]?key|sk-proj-|sk-)[ :=]*)[A-Za-z0-9_/.-]{20,}`)
+
+// Stringly-typed JWT-shape: 3 dot-separated base64url segments, second
+// and third at least 16 chars. Matches eyJ-prefixed tokens that the
+// keyword-anchored rule above would miss when they appear bare.
+var errorDetailJWTRE = regexp.MustCompile(`eyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{16,}\.[A-Za-z0-9_-]{16,}`)
+
+const errorDetailBroadcastCap = 4096
+
+func sanitizeErrorDetailForBroadcast(s string) string {
+	if s == "" {
+		return s
+	}
+	// Cap first — a huge error body shouldn't tax every websocket
+	// client's buffer. 4096 matches the workspace-side _MAX_STDERR
+	// budget (it's actually larger here so the runtime's cap dominates).
+	if len(s) > errorDetailBroadcastCap {
+		s = s[:errorDetailBroadcastCap] + "…[truncated]"
+	}
+	s = errorDetailSecretRE.ReplaceAllString(s, "${1}[REDACTED]")
+	s = errorDetailJWTRE.ReplaceAllString(s, "[REDACTED]")
+	return s
+}
+
 type ActivityHandler struct {
 	broadcaster *events.Broadcaster
 }
@@ -691,6 +732,16 @@ func logActivityExec(ctx context.Context, exec activityExecutor, broadcaster eve
 		if respStr != nil {
 			payload["response_body"] = json.RawMessage(respJSON)
 		}
+		// internal#212 — surface the secret-safe failure reason on the
+		// live broadcast so the canvas chat-tab error banner can show
+		// the user *why* (provider HTTP status, error code, the
+		// provider's own human message) instead of the opaque
+		// "Agent error (Exception) — see workspace logs for details."
+		// hardcoded fallback. Omitted when nil so the canvas's "has
+		// actionable reason" guard doesn't trip on empty-string keys.
+		if params.ErrorDetail != nil && *params.ErrorDetail != "" {
+			payload["error_detail"] = sanitizeErrorDetailForBroadcast(*params.ErrorDetail)
+		}
 	}

 	return func() {
@@ -934,6 +934,184 @@ func TestLogActivity_Broadcast_IncludesRequestAndResponseBodies(t *testing.T) {
 	}
 }

+// TestLogActivity_Broadcast_IncludesErrorDetail pins the internal#212
+// UX fix: when an a2a_receive row is logged with status="error" and a
+// non-empty error_detail, the live broadcast MUST carry that detail so
+// the canvas chat-tab error bubble can render the actionable reason
+// (e.g. the provider's own 403 message) instead of the opaque
+// "Agent error (Exception) — see workspace logs for details." string.
+// Without this, the canvas falls back to the hardcoded boilerplate;
+// the row's error_detail is in the DB but never reaches the user
+// without a manual refresh of the Activity tab.
+func TestLogActivity_Broadcast_IncludesErrorDetail(t *testing.T) {
+	mock := setupTestDB(t)
+	defer mock.ExpectationsWereMet()
+
+	mock.ExpectExec("INSERT INTO activity_logs").
+		WillReturnResult(sqlmock.NewResult(1, 1))
+
+	cb := &recordingBroadcaster{}
+	srcID := "ws-source"
+	tgtID := "ws-target"
+	method := "message/send"
+	// Realistic actionable reason: provider HTTP status + provider's
+	// own message. Secret-safe (no token, no api key, just the cause).
+	detail := "Anthropic 403 oauth_org_not_allowed: Your organization has disabled Claude subscription access for Claude Code — use an Anthropic API key or ask your admin to enable access."
+
+	LogActivity(context.Background(), cb, ActivityParams{
+		WorkspaceID:  "ws-source",
+		ActivityType: "a2a_receive",
+		SourceID:     &srcID,
+		TargetID:     &tgtID,
+		Method:       &method,
+		Status:       "error",
+		ErrorDetail:  &detail,
+	})
+
+	if len(cb.calls) != 1 {
+		t.Fatalf("expected 1 broadcast, got %d", len(cb.calls))
+	}
+	payload := cb.calls[0].payload
+	got, ok := payload["error_detail"].(string)
+	if !ok {
+		t.Fatalf("error_detail missing from broadcast payload: got %#v", payload["error_detail"])
+	}
+	if got != detail {
+		t.Errorf("error_detail = %q, want %q", got, detail)
+	}
+}
+
+// TestLogActivity_Broadcast_OmitsErrorDetailWhenNil pins the inverse:
+// rows logged without an error_detail (the common ok-path) must not
+// have an empty "error_detail":"" key in the broadcast, which would
+// false-positive the canvas's "has actionable reason" guard and render
+// an empty Underlying-Error block. The omission rule matches how
+// request_body/response_body are handled.
+func TestLogActivity_Broadcast_OmitsErrorDetailWhenNil(t *testing.T) {
+	mock := setupTestDB(t)
+	defer mock.ExpectationsWereMet()
+
+	mock.ExpectExec("INSERT INTO activity_logs").
+		WillReturnResult(sqlmock.NewResult(1, 1))
+
+	cb := &recordingBroadcaster{}
+	srcID := "ws-source"
+
+	LogActivity(context.Background(), cb, ActivityParams{
+		WorkspaceID:  "ws-source",
+		ActivityType: "a2a_send",
+		SourceID:     &srcID,
+		Status:       "ok",
+		ErrorDetail:  nil,
+	})
+
+	if len(cb.calls) != 1 {
+		t.Fatalf("expected 1 broadcast, got %d", len(cb.calls))
+	}
+	if _, present := cb.calls[0].payload["error_detail"]; present {
+		t.Errorf("error_detail should be omitted when nil, got %v", cb.calls[0].payload["error_detail"])
+	}
+}
+
+// TestSanitizeErrorDetail_StripsSecretShapes pins the secret-safe
+// scrubber's contract: the broadcast layer is the last defense before
+// a string crosses the canvas WebSocket and lands in the user's
+// browser, so anything that *looks* like an API key / bearer token /
+// JWT must be replaced with [REDACTED] even if upstream (the runtime,
+// the provider) failed to scrub it. The non-secret parts of the
+// message — provider status, error code, human-readable cause — MUST
+// survive intact, otherwise the whole point of internal#212 (giving
+// the user an actionable reason) is defeated.
+func TestSanitizeErrorDetail_StripsSecretShapes(t *testing.T) {
+	cases := []struct {
+		name     string
+		in       string
+		mustHave []string // substrings that must survive — the actionable parts
+		mustMiss []string // substrings that must NOT survive — the secret shapes
+	}{
+		{
+			name:     "preserves actionable provider reason",
+			in:       "Anthropic 403 oauth_org_not_allowed: Your organization has disabled Claude subscription access for Claude Code",
+			mustHave: []string{"403", "oauth_org_not_allowed", "disabled Claude subscription"},
+			mustMiss: []string{"[REDACTED]"},
+		},
+		{
+			name:     "redacts sk- API key embedded in error",
+			in:       "openai 401 invalid_api_key: Incorrect API key provided: sk-proj-abcdefghijklmnop1234567890abcdef. Check your key.",
+			mustHave: []string{"401", "invalid_api_key", "Incorrect API key provided"},
+			mustMiss: []string{"sk-proj-abcdefghijklmnop1234567890abcdef"},
+		},
+		{
+			name:     "redacts Bearer token in stringified header dump",
+			in:       "auth failed; headers: Authorization: Bearer eyJhbGciOiJIUzI1NiJ9.aaaaaaaaaaaaaaaaaaaa.bbbbbbbbbbbbbbbbbbbb",
+			mustHave: []string{"auth failed"},
+			mustMiss: []string{"eyJhbGciOiJIUzI1NiJ9.aaaaaaaaaaaaaaaaaaaa.bbbbbbbbbbbbbbbbbbbb"},
+		},
+		{
+			name:     "truncates absurdly long detail to bound payload size",
+			in:       "kimi 500 internal_error: " + strings.Repeat("x", 8000),
+			mustHave: []string{"kimi 500 internal_error"},
+			mustMiss: []string{strings.Repeat("x", 5000)}, // 5000 in a row must NOT survive — cap is 4096
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := sanitizeErrorDetailForBroadcast(tc.in)
+			for _, s := range tc.mustHave {
+				if !strings.Contains(got, s) {
+					t.Errorf("expected %q to survive scrub, got: %q", s, got)
+				}
+			}
+			for _, s := range tc.mustMiss {
+				if strings.Contains(got, s) {
+					t.Errorf("expected %q to be scrubbed, got: %q", s, got)
+				}
+			}
+		})
+	}
+}
+
+// TestLogActivity_Broadcast_ErrorDetailIsSanitized pins the integration
+// of the scrubber into the broadcast path: if an upstream caller
+// somehow passes through an error_detail with a secret-shaped token,
+// the wire payload (what reaches the canvas WebSocket) must already
+// be scrubbed. Defense in depth — the runtime should never let this
+// happen, but the canvas is the trust boundary, not the runtime.
+func TestLogActivity_Broadcast_ErrorDetailIsSanitized(t *testing.T) {
+	mock := setupTestDB(t)
+	defer mock.ExpectationsWereMet()
+
+	mock.ExpectExec("INSERT INTO activity_logs").
+		WillReturnResult(sqlmock.NewResult(1, 1))
+
+	cb := &recordingBroadcaster{}
+	srcID := "ws-source"
+	// Upstream leaked a token into the detail string. The DB still
+	// stores the unscrubbed copy (workspace logs are an internal
+	// audit surface), but the broadcast that reaches the canvas
+	// must already be sanitized.
+	detail := "anthropic 401 invalid_api_key: provided key sk-proj-leakedsecretvalueabcdefghij is wrong"
+
+	LogActivity(context.Background(), cb, ActivityParams{
+		WorkspaceID:  "ws-source",
+		ActivityType: "a2a_receive",
+		SourceID:     &srcID,
+		Status:       "error",
+		ErrorDetail:  &detail,
+	})
+
+	if len(cb.calls) != 1 {
+		t.Fatalf("expected 1 broadcast, got %d", len(cb.calls))
+	}
+	got, _ := cb.calls[0].payload["error_detail"].(string)
+	if strings.Contains(got, "sk-proj-leakedsecretvalueabcdefghij") {
+		t.Errorf("broadcast leaked secret-shaped token: %q", got)
+	}
+	if !strings.Contains(got, "invalid_api_key") {
+		t.Errorf("scrubber over-redacted: lost the actionable code from %q", got)
+	}
+}
+
 // TestLogActivityTx_DefersBroadcastUntilCommitHook pins the #149
 // contract: LogActivityTx returns a commitHook that the caller MUST
 // invoke after tx.Commit(); the broadcast MUST NOT fire from inside