test(e2e): diagnose _ResultError in staging canary A2A path (#2712) #2719
@@ -57,6 +57,82 @@ a2a_completion_error_marker() {
|
||||
return 1
|
||||
}
|
||||
|
||||
# redact_secrets
|
||||
# Reads stdin, writes stdout with credential-looking values replaced by
|
||||
# <REDACTED>. Used by diagnostic emitters so run logs stay secret-safe.
|
||||
# Covers Authorization/Bearer headers, common key names, generic *_API_KEY /
|
||||
# *_TOKEN / *_SECRET values, URL query credential params, and claude-code
|
||||
# SDK-style credential keys. Preserves HTTP status codes and non-secret
|
||||
# error context.
|
||||
redact_secrets() {
  # Filter: stdin -> redact_secrets.py -> stdout.
  # The helper script is looked up in the directory of the file that defines
  # this function (BASH_SOURCE[0]), so the caller's working directory is
  # irrelevant.
  local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  python3 "${script_dir}/redact_secrets.py"
}
|
||||
|
||||
# diagnose_staging_result_error <workspace_id> <a2a_response> <context_label>
|
||||
# Diagnostic-only helper for molecule-core#2712. When the canary agent
|
||||
# returns a _ResultError / error-as-text payload, the RUN OUTPUT must show
|
||||
# WHY the LLM/backend/runtime call failed, not just the wrapped error string.
|
||||
# Emits (via redact_secrets):
|
||||
# - the full A2A response JSON (so upstream HTTP status/body can be read)
|
||||
# - the workspace's status, runtime_state, and last_sample_error
|
||||
# - recent activity_logs rows (error_detail, status, summary)
|
||||
# This does NOT change pass/fail semantics — the caller still fail()s.
|
||||
diagnose_staging_result_error() {
  # Args:
  #   $1  workspace id (may be empty; the workspace/activity sections are
  #       then skipped)
  #   $2  raw A2A response body
  #   $3  context label for the banner (defaults to "A2A")
  # Every section is piped through redact_secrets and is best-effort: each
  # pipeline ends in `|| true` so a failing redactor or parser cannot end the
  # run (e.g. under errexit) before the caller reaches its own fail().
  local ws_id="$1"
  local a2a_resp="$2"
  local ctx="${3:-A2A}"

  log "── DIAGNOSTIC BURST ($ctx — staging LLM/backend/runtime failure) ──"

  log "Full A2A response (redacted JSON):"
  {
    # Pretty-print when the body is valid JSON; otherwise emit it verbatim.
    printf '%s\n' "$a2a_resp" | python3 -m json.tool 2>/dev/null || printf '%s\n' "$a2a_resp"
  } | redact_secrets 2>/dev/null || true

  if [ -n "$ws_id" ]; then
    log "Workspace $ws_id snapshot:"
    local ws_json
    ws_json=$(tenant_call GET "/workspaces/$ws_id" 2>/dev/null || echo '{}')
    {
      # str() keeps a non-string last_sample_error (object/list) printable
      # instead of raising on the slice and being reported as a parse error.
      printf '%s\n' "$ws_json" | python3 -c "
import json, sys
try:
    d = json.load(sys.stdin)
    print(' status :', d.get('status', '?'))
    print(' runtime_state :', d.get('runtime_state', '?'))
    print(' url :', d.get('url', '?'))
    print(' last_sample_error:', str(d.get('last_sample_error') or '')[:500])
except Exception as e:
    print(' (workspace JSON parse error:', e, ')')
"
    } | redact_secrets 2>/dev/null || true

    log "Recent activity logs for $ws_id:"
    local activity_json
    activity_json=$(tenant_call GET "/activity?workspace_id=$ws_id&limit=20" 2>/dev/null || echo '[]')
    {
      # Only the first 10 returned rows are printed; summary and error_detail
      # are stringified and truncated to keep the burst readable.
      printf '%s\n' "$activity_json" | python3 -c "
import json, sys
try:
    rows = json.load(sys.stdin)
    for r in rows[:10]:
        ts = r.get('created_at', '?')
        typ = r.get('activity_type', '?')
        st = r.get('status', '?')
        summ = str(r.get('summary') or '')[:120]
        print(f' - {ts} {typ} status={st} {summ}')
        ed = r.get('error_detail')
        if ed:
            print(' error_detail:', str(ed)[:300])
except Exception as e:
    print(' (activity JSON parse error:', e, ')')
"
    } | redact_secrets 2>/dev/null || true
  fi

  log "── END DIAGNOSTIC ──"
}
|
||||
|
||||
# a2a_assert_real_completion <agent_text> <expected_token> <context_label>
|
||||
# The CORE gate. Asserts the agent text:
|
||||
# (a) does NOT contain any error-as-text marker (broken-agent trap), AND
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Redact credential-looking values from stdin to stdout.
|
||||
|
||||
Used by staging E2E diagnostic emitters so run logs stay secret-safe.
|
||||
Preserves HTTP status codes and non-secret error context.
|
||||
"""
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
||||
def redact(text: str) -> str:
    """Return *text* with credential-looking values replaced by ``<REDACTED>``.

    Four passes run in order:

    1. ``Authorization:`` / ``Authorization=`` header lines: the rest of the
       line is dropped.
    2. Bare ``Bearer <token>`` occurrences.
    3. ``key: value`` / ``key=value`` pairs whose key is ``Authorization``, a
       known credential name, or ends in ``API_KEY`` / ``API-KEY`` /
       ``AUTH_TOKEN`` / ``TOKEN`` / ``SECRET`` / ``PASSWORD``. Keys and values
       may be bare, double-quoted, single-quoted, or quoted with
       backslash-escaped quotes (JSON embedded inside a JSON string). A quoted
       value is masked up to its closing quote, so spaces and commas inside
       it do not leak. Empty quoted values are left untouched.
    4. URL query parameters that commonly carry credentials.

    Text that matches none of the patterns is returned unchanged.
    """
    marker = "<REDACTED>"

    # A quote character, optionally preceded by backslashes, so that
    # \"KEY\":\"value\" is recognised the same way as "KEY":"value".
    quote = r"\\*[\"']"

    key_names = "|".join(
        (
            r"AUTHORIZATION",
            r"ANTHROPIC_AUTH_TOKEN",
            r"ANTHROPIC_API_KEY",
            r"MINIMAX_API_KEY",
            r"OPENAI_API_KEY",
            r"CLIENT_SECRET",
            r"ACCESS_TOKEN",
            r"GITHUB_TOKEN",
            r"GITEA_TOKEN",
            r"MOLECULE_[A-Z_]*_(?:TOKEN|SECRET|KEY)",
            # Generic suffixes; API[_-]?KEY also covers the x-api-key header.
            r"[A-Z_]*(?:API[_-]?KEY|AUTH_TOKEN|TOKEN|SECRET|PASSWORD)",
        )
    )
    key = "(?:" + quote + ")?(?:" + key_names + ")(?:" + quote + ")?"
    # Quoted value: everything up to the next occurrence of the same opening
    # quote sequence (at least one character, so "" stays visible).
    quoted_value = "(?P<open>" + quote + ")(?:(?!(?P=open)).)+"
    # Bare value: runs to the next whitespace, comma, quote or closing bracket.
    bare_value = "(?!" + quote + r")[^\"\s,}\]]+"
    key_value = re.compile(
        "(?P<prefix>" + key + r"\s*[:=]\s*)(?:" + quoted_value + "|" + bare_value + ")",
        re.IGNORECASE,
    )

    def _mask(match: re.Match) -> str:
        # Keep the key and the opening quote; the closing quote was never
        # consumed, so the surrounding structure stays intact.
        return match.group("prefix") + (match.group("open") or "") + marker

    # 1. Authorization header lines (consume the whole value).
    text = re.sub(r"(?i)(Authorization\s*[:=]\s*)[^\n]*", r"\1" + marker, text)
    # 2. Bare Bearer tokens; '+' and '~' are valid token characters too.
    text = re.sub(r"(?i)(Bearer\s+)[A-Za-z0-9_\-./=+~]+", r"\1" + marker, text)
    # 3. Credential-named key/value pairs.
    text = key_value.sub(_mask, text)
    # 4. URL query params that commonly carry credentials.
    text = re.sub(
        r"(?i)([?&](?:token|api[_-]?key|auth|secret|client_secret|access_token|password)=)[^&\s\"\']+",
        r"\1" + marker,
        text,
    )
    return text
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Filter mode: read all of stdin, write the redacted text to stdout.
    raw_input_text = sys.stdin.read()
    sys.stdout.write(redact(raw_input_text))
|
||||
@@ -106,6 +106,56 @@ else
|
||||
echo "FAIL: error marker NOT detected in 'An Exception occurred'"; FAIL=$((FAIL + 1))
|
||||
fi
|
||||
|
||||
# ---- redact_secrets (diagnostic-output safety) ----
|
||||
redact_check() {
  # redact_check <description> <input> <must_not_contain>
  # Pipes <input> through redact_secrets and records PASS only when the
  # redactor succeeded, the secret substring is gone, and a <REDACTED> marker
  # is present. Updates the PASS / FAIL counters.
  local desc="$1"
  local input="$2"
  local must_not_contain="$3"
  local output

  # A redactor that crashes (python3 missing, helper script not found) emits
  # nothing; without this check the empty output would count as "redacted".
  if ! output=$(printf '%s' "$input" | redact_secrets); then
    echo "FAIL: $desc — redact_secrets exited non-zero"
    FAIL=$((FAIL + 1))
    return 0
  fi

  # `--` keeps a pattern that starts with '-' from being parsed as an option.
  if printf '%s' "$output" | grep -qF -- "$must_not_contain"; then
    echo "FAIL: $desc — secret/token leaked in redacted output"
    FAIL=$((FAIL + 1))
  elif ! printf '%s' "$output" | grep -qF -- '<REDACTED>'; then
    echo "FAIL: $desc — output carries no <REDACTED> marker"
    FAIL=$((FAIL + 1))
  else
    echo "PASS: $desc (secret redacted)"
    PASS=$((PASS + 1))
  fi
}
|
||||
|
||||
redact_check "Authorization header value redacted" \
  "Authorization: Bearer sk-ant-abc123XYZ" \
  "sk-ant-abc123XYZ"
redact_check "known API key redacted" \
  '{"ANTHROPIC_API_KEY":"sk-ant-abc123","status":"ok"}' \
  "sk-ant-abc123"
redact_check "env-style known API key redacted" \
  'MINIMAX_API_KEY=mini-max-secret-token' \
  "mini-max-secret-token"
# Key name deliberately absent from the known-name list so this case is
# matched by the generic *_TOKEN pattern.
redact_check "generic *_TOKEN redacted" \
  'ACME_SERVICE_TOKEN=acme-generic-token-value' \
  "acme-generic-token-value"
redact_check "URL query token redacted" \
  "https://api.example.com/v1?token=supersecrettoken&status=400" \
  "supersecrettoken"
# _ResultError diagnostic path: the runtime surfaces upstream errors as text,
# and that text can embed Authorization headers or API keys. Redaction must
# scrub them without removing the useful failure classification/status.
redact_check "_ResultError payload with embedded token redacted" \
  'Agent error (_ResultError): HTTP 401 {\"error\":\"invalid auth\", \"Authorization\":\"Bearer sk-ant-leaked\"}' \
  "sk-ant-leaked"
if printf '%s' 'Agent error (_ResultError): HTTP 401 {"error":"invalid auth"}' | redact_secrets | grep -qF 'HTTP 401'; then
  echo "PASS: _ResultError redaction preserves HTTP status context"
  PASS=$((PASS + 1))
else
  echo "FAIL: _ResultError redaction stripped useful HTTP status context"
  FAIL=$((FAIL + 1))
fi
# Positive: non-secret context (HTTP status, error message) must survive.
if printf '%s' '{"status":401,"error":"invalid key"}' | redact_secrets | grep -qF '"status":401'; then
  echo "PASS: redaction preserves non-secret HTTP status context"
  PASS=$((PASS + 1))
else
  echo "FAIL: redaction stripped useful non-secret context"
  FAIL=$((FAIL + 1))
fi

echo ""
echo "=== Results: $PASS passed, $FAIL failed ==="
# Exit status of the script: success only when no check failed.
[ "$FAIL" -eq 0 ]
|
||||
|
||||
@@ -1350,6 +1350,15 @@ if echo "$AGENT_TEXT" | grep -qiF "message contained no text content"; then
|
||||
fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (the claude-code MiniMax-BYOK default is the BARE registered id MiniMax-M2.7 — the colon minimax:MiniMax-M2.7 is UNREGISTERED on claude-code, internal#718) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT"
|
||||
fi
|
||||
# _ResultError is the claude-code runtime surfacing an LLM/backend/runtime
# failure AS text. Diagnose it explicitly (#2712) so the next canary run
# prints the upstream error instead of forcing operators to scrape workspace
# logs. The suite still fails; this is diagnostics-only.
if echo "$AGENT_TEXT" | grep -qiF "_ResultError"; then
  diagnose_staging_result_error "$PARENT_ID" "$A2A_RESP" "A2A parent _ResultError"
  # Redact before interpolating into the failure message; only the first
  # 500 characters of the redacted text are included.
  _redacted_agent_text=$(printf '%s' "$AGENT_TEXT" | redact_secrets)
  fail "A2A — STAGING LLM/BACKEND/RUNTIME FAILURE (_ResultError). The canary agent surfaced its LLM/backend/runtime error as a text payload. See the diagnostic burst above for the full A2A response, workspace state, and recent activity logs (including any upstream HTTP status/body the runtime reported). Raw (redacted): ${_redacted_agent_text:0:500}"
fi
# Generic catch-all — falls through if none of the known regressions hit.
# NOTE(review): this message interpolates $AGENT_TEXT without passing it
# through redact_secrets — confirm that is intended.
if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then
  fail "A2A returned an error-shaped response: $AGENT_TEXT"
fi
|
||||
|
||||
Reference in New Issue
Block a user