fix(runtime+canvas): surface actionable provider error reason instead of opaque "Agent error (Exception)" #1420

Merged
hongming merged 7 commits from fix/issue212-actionable-agent-error-reason into staging 2026-05-18 19:28:03 +00:00
5 changed files with 188 additions and 3 deletions
@@ -67,9 +67,21 @@ export function useChatSocket(
const own = (targetId || msg.workspace_id) === workspaceId;
if (own) {
callbacksRef.current.onSendComplete?.();
callbacksRef.current.onSendError?.(
"Agent error (Exception) — see workspace logs for details.",
);
// internal#211/#212: surface the runtime's curated,
// user-actionable reason (provider HTTP status + error
// code + the provider's own guidance, e.g. a 403 "org
// disabled · use an API key / ask your admin"). The
// server now includes error_detail in the ACTIVITY_LOGGED
// broadcast; fall back to summary, and only as a last
// resort to a generic line. The old hardcoded
// "Agent error (Exception) — see workspace logs for
// details." string pointed at a logs UI that does not
// exist and discarded the actionable reason entirely.
const detail =
(p.error_detail as string) ||
(p.summary as string) ||
"The agent turn failed but the runtime reported no detail. Retry once; if it repeats the workspace runtime may need a restart.";
callbacksRef.current.onSendError?.(detail);
}
}
} else if (type === "a2a_send") {
+1
View File
@@ -62,6 +62,7 @@ TOP_LEVEL_MODULES = {
"a2a_tools_memory",
"a2a_tools_messaging",
"a2a_tools_rbac",
"a2a_tools_identity",
"adapter_base",
"agent",
"agents_md",
@@ -691,6 +691,19 @@ func logActivityExec(ctx context.Context, exec activityExecutor, broadcaster eve
if respStr != nil {
payload["response_body"] = json.RawMessage(respJSON)
}
// internal#211/#212: error_detail carries the runtime's curated,
// user-actionable, secret-safe failure reason (provider HTTP
// status + error code + the provider's own guidance, e.g. a 403
// "org disabled · use an API key / ask your admin"). It is
// already persisted to the DB column above and capped by the
// runtime's report_activity helper (4096 chars). Previously it
// was dropped from the LIVE broadcast, so the canvas had nothing
// to render and fell back to a hardcoded opaque
// "Agent error (Exception) — see workspace logs" string. Include
// it so the chat bubble shows the real reason in real time.
if params.ErrorDetail != nil && *params.ErrorDetail != "" {
payload["error_detail"] = *params.ErrorDetail
}
}
return func() {
+42
View File
@@ -599,6 +599,28 @@ def _sanitize_for_external(msg: str) -> str:
import re as _re
msg = _re.sub(r"(?i)(?:bearer|token|api[_-]?key|sk-)[ :=]+[A-Za-z0-9_/.-]{20,}", "[REDACTED]", msg)
# Bare provider key with NO separator after the prefix — a real
# `sk-ant-api03-…` / `sk-…` key uses `-` (not `[ :=]`) so the rule
# above misses it. Require ≥24 key-ish chars after the `sk-`/`sk-ant-`
# prefix so curated examples like `sk-ant-EXAMPLE-SHORT` (13 chars
# after `sk-ant-`) still pass through un-redacted.
msg = _re.sub(r"(?i)\bsk-(?:ant-)?[A-Za-z0-9_-]{24,}", "[REDACTED]", msg)
# JSON-quoted credential values: {"token": "…"} / {"apiKey": "…"} /
# {"secret": "…"} / {"password": "…"}. Redact only the value, and only
# when it is ≥24 chars so a short curated sample like
# `"api_key": "sk-ant-EXAMPLE-SHORT"` (20-char value) still passes.
msg = _re.sub(
r'(?i)("(?:token|api[_-]?key|secret|password)"\s*:\s*")[^"]{24,}(")',
r"\1[REDACTED]\2",
msg,
)
# AWS secret access key in `aws_secret_access_key=…` form (env dumps,
# boto tracebacks). The base64-ish value runs until whitespace/quote.
msg = _re.sub(
r"(?i)(aws_secret_access_key\s*[:=]\s*)\S+",
r"\1[REDACTED]",
msg,
)
# Absolute paths: /etc/shadow, /home/user/.aws/credentials, etc.
msg = _re.sub(r"(?:/[^/\s]+){2,}", lambda m: m.group(0) if len(m.group(0)) < 60 else "[REDACTED_PATH]", msg)
return msg
@@ -608,6 +630,7 @@ def sanitize_agent_error(
exc: BaseException | None = None,
category: str | None = None,
stderr: str | None = None,
reason: str | None = None,
) -> str:
"""Render an agent-side failure into a user-safe error message.
@@ -615,6 +638,18 @@ def sanitize_agent_error(
category string (e.g. from `classify_subprocess_error`). If both are
given, `category` wins. If neither, the tag defaults to "unknown".
When ``reason`` is provided (internal#211/#212), it is a *pre-curated,
user-actionable, secret-safe* explanation built by the caller from a
provider-side failure — e.g. a 403 "Your organization has disabled
Claude subscription access · Use an Anthropic API key instead, or ask
your admin to enable access" with error code ``oauth_org_not_allowed``.
This text is exactly what the user needs to self-serve, so it is
surfaced VERBATIM as the message instead of being collapsed to the
opaque exception class name. It still passes through the
key/token/bearer/path scrubber as a belt-and-braces second pass so a
buggy caller can't leak a credential that snuck into the reason.
``reason`` wins over ``stderr``; both lose to neither being set.
When ``stderr`` is provided (e.g. the first ~1 KB of a subprocess stderr
or HTTP error body), it is sanitized and appended to the output so the
A2A caller gets actionable context without needing to dig through workspace
@@ -629,6 +664,13 @@ def sanitize_agent_error(
else:
tag = "unknown"
if reason:
# Curated, user-actionable reason — surface it as the message.
# Still scrub: a 403/auth/quota message is safe, but the scrubber
# is cheap insurance against a caller that didn't curate cleanly.
clean = _sanitize_for_external(reason[:_MAX_STDERR_PREVIEW])
return f"Agent error ({tag}): {clean}"
if stderr:
# Truncate and sanitize before including — prevents DoS via
# a malicious or buggy peer injecting a huge error body, and
+117
View File
@@ -788,6 +788,123 @@ def test_sanitize_agent_error_stderr_combined_with_existing_tests():
assert "workspace logs" in out
# ─── reason passthrough (internal#211/#212: surface actionable provider error) ───
def test_sanitize_agent_error_reason_surfaced_verbatim():
"""A curated provider reason is shown to the user, not collapsed to the
exception class name. This is the internal#211 regression: a 403
org-disabled message must reach the canvas."""
reason = (
"provider HTTP 403 — oauth_org_not_allowed — Your organization has "
"disabled Claude subscription access for Claude Code · Use an "
"Anthropic API key instead, or ask your admin to enable access"
)
class _ResultErr(Exception):
pass
out = sanitize_agent_error(exc=_ResultErr("opaque"), reason=reason)
# The actionable provider guidance and status code must be visible.
assert "403" in out
assert "oauth_org_not_allowed" in out
assert "disabled Claude subscription access" in out
assert "ask your admin to enable access" in out
# NOT the old opaque form.
assert "see workspace logs" not in out
def test_sanitize_agent_error_reason_still_scrubs_secrets():
"""Even on the reason path the key/token scrubber runs — a buggy caller
that lets a bearer token into the reason still gets it redacted."""
leaky = (
"provider HTTP 401 — auth failed — Authorization: Bearer "
"PLACEHOLDER_LONG_TOKEN_0123456789abcdefghijklm please re-auth"
)
out = sanitize_agent_error(reason=leaky)
assert "[REDACTED]" in out
assert "PLACEHOLDER_LONG_TOKEN_0123456789abcdefghijklm" not in out
# The non-secret guidance still survives the scrub.
assert "401" in out
assert "please re-auth" in out
def test_sanitize_agent_error_reason_scrubs_all_secret_formats():
"""The scrubber must redact every realistic credential shape — not just
the `Bearer <tok>` form the original test happened to exercise
(internal#212 review finding: bare `sk-ant-api03-…` keys, JSON-quoted
"token"/"apiKey" values, and `aws_secret_access_key=` all leaked).
All curated/actionable guidance must still survive the scrub.
"""
# 1. Bare sk-ant-api03 key — no `[ :=]` separator after the prefix
# (a real Anthropic key uses `-`), so the legacy regex missed it.
bare = (
"provider HTTP 401 — auth failed — invalid key "
"sk-FAKEPLACEHOLDERabcdefghijklmnopqrstuvwxy0123456789 "
"please re-auth"
)
out = sanitize_agent_error(reason=bare)
assert "sk-FAKEPLACEHOLDERabcdefghijklmnopqrstuvwxy0123456789" not in out
assert "[REDACTED]" in out
assert "401" in out # actionable status survives
assert "please re-auth" in out # actionable guidance survives
# 2. JSON-quoted "token" / "apiKey" values.
jblob = (
'provider error — config dump {"token": '
'"abcDEF0123456789ghIJKL0123456789mnopQRST", "apiKey": '
'"anon_fakefakefakefakefakefakefakefakefakefake"} — '
"use an API key instead"
)
out = sanitize_agent_error(reason=jblob)
assert "abcDEF0123456789ghIJKL0123456789mnopQRST" not in out
assert "anon_fakefakefakefakefakefakefakefakefakefake" not in out
assert "[REDACTED]" in out
assert "use an API key instead" in out # actionable guidance survives
# 3. aws_secret_access_key=… form.
awsblob = (
"provider HTTP 403 — boto credential error "
"aws_secret_access_key=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY — "
"ask your admin to enable access"
)
out = sanitize_agent_error(reason=awsblob)
assert "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" not in out
assert "[REDACTED]" in out
assert "403" in out # actionable status survives
assert "ask your admin to enable access" in out # guidance survives
# 4. Regression: the original Bearer form still redacts.
# Uses PLACEHOLDER_LONG_TOKEN (>=40 chars, no sk-ant- prefix) to avoid
# triggering the secret-scan workflow pattern
# `sk-ant-[A-Za-z0-9_-]{40,}`.
bearer = (
"provider HTTP 401 — Authorization: Bearer "
"PLACEHOLDER_LONG_TOKEN_9876543210abcdefghij re-auth"
)
out = sanitize_agent_error(reason=bearer)
assert "PLACEHOLDER_LONG_TOKEN_9876543210abcdefghij" not in out
assert "[REDACTED]" in out
assert "re-auth" in out
def test_sanitize_agent_error_reason_wins_over_stderr():
"""When both reason and stderr are passed, the curated reason wins."""
out = sanitize_agent_error(
reason="provider HTTP 403 — use an API key",
stderr="raw subprocess noise that should not be shown",
)
assert "use an API key" in out
assert "raw subprocess noise" not in out
def test_sanitize_agent_error_no_reason_unchanged():
"""Omitting reason preserves the original generic behavior."""
out = sanitize_agent_error(exc=ValueError("boom"))
assert "ValueError" in out
assert "workspace logs" in out
# ======================================================================
# classify_subprocess_error