test(e2e): diagnose _ResultError in staging canary A2A path (#2712) #2719

Merged
devops-engineer merged 1 commit from fix/2712-diagnose-staging-result-error into main 2026-06-13 08:58:38 +00:00
4 changed files with 174 additions and 0 deletions
+76
View File
@@ -57,6 +57,82 @@ a2a_completion_error_marker() {
return 1
}
# redact_secrets
#   Stdin-to-stdout filter used by the diagnostic emitters: the stream is
#   handed to redact_secrets.py, which is looked up in the same directory as
#   this file. That script swaps credential-looking values for <REDACTED>
#   (Authorization/Bearer headers, well-known key names, generic *_API_KEY /
#   *_TOKEN / *_SECRET assignments, credential-carrying URL query params)
#   so run logs stay secret-safe while HTTP status codes and other
#   non-secret error context remain readable.
redact_secrets() {
  local script_dir
  # Resolve the directory of the file that defines this function, not the
  # caller's working directory. "|| true" keeps a failed lookup from
  # tripping errexit; python3 then reports the missing script itself.
  script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || true
  python3 "${script_dir}/redact_secrets.py"
}
# diagnose_staging_result_error <workspace_id> <a2a_response> [context_label]
#   Diagnostic-only helper for molecule-core#2712. When the canary agent
#   returns a _ResultError / error-as-text payload, the RUN OUTPUT must show
#   WHY the LLM/backend/runtime call failed, not just the wrapped error string.
#
#   Arguments:
#     workspace_id   workspace to snapshot; when empty, only the A2A response
#                    is printed and the tenant API is not called.
#     a2a_response   raw A2A response body (pretty-printed when it is JSON,
#                    echoed verbatim otherwise).
#     context_label  free-text label for the banner line (default: "A2A").
#
#   Emits (every section is piped through redact_secrets):
#     - the full A2A response (so upstream HTTP status/body can be read)
#     - the workspace's status, runtime_state, url and last_sample_error
#     - recent activity rows (created_at, activity_type, status, summary,
#       error_detail)
#
#   Every section is best-effort: a failing tenant call, JSON parse or
#   redactor never aborts the helper. This does NOT change pass/fail
#   semantics — the caller still fail()s.
diagnose_staging_result_error() {
  local ws_id="$1"
  local a2a_resp="$2"
  local ctx="${3:-A2A}"

  log "── DIAGNOSTIC BURST ($ctx — staging LLM/backend/runtime failure) ──"
  log "Full A2A response (redacted JSON):"
  # Pretty-print when the body is valid JSON, otherwise fall back to the raw
  # text. Best-effort like the sections below: if the redactor itself fails,
  # say so instead of letting errexit/pipefail kill the run before the
  # caller's fail() message is printed.
  {
    printf '%s\n' "$a2a_resp" | python3 -m json.tool 2>/dev/null || printf '%s\n' "$a2a_resp"
  } | redact_secrets || log "(A2A response withheld: redaction failed)"

  if [ -n "$ws_id" ]; then
    log "Workspace $ws_id snapshot:"
    local ws_json
    # A failed tenant call degrades to an empty object so the parser below
    # prints '?' placeholders rather than crashing.
    ws_json=$(tenant_call GET "/workspaces/$ws_id" 2>/dev/null || echo '{}')
    # last_sample_error is coerced with str() before slicing so a non-string
    # value (object/number) cannot raise halfway through the snapshot.
    {
      printf '%s\n' "$ws_json" | python3 -c "
import json, sys
try:
    d = json.load(sys.stdin)
    print(' status :', d.get('status', '?'))
    print(' runtime_state :', d.get('runtime_state', '?'))
    print(' url :', d.get('url', '?'))
    print(' last_sample_error:', str(d.get('last_sample_error') or '')[:500])
except Exception as e:
    print(' (workspace JSON parse error:', e, ')')
"
    } | redact_secrets 2>/dev/null || true

    log "Recent activity logs for $ws_id:"
    local activity_json
    # A failed tenant call degrades to an empty list (nothing printed).
    activity_json=$(tenant_call GET "/activity?workspace_id=$ws_id&limit=20" 2>/dev/null || echo '[]')
    # Only the first 10 returned rows are shown; summary and error_detail are
    # stringified and truncated to keep the burst readable.
    {
      printf '%s\n' "$activity_json" | python3 -c "
import json, sys
try:
    rows = json.load(sys.stdin)
    for r in rows[:10]:
        ts = r.get('created_at', '?')
        typ = r.get('activity_type', '?')
        st = r.get('status', '?')
        summ = str(r.get('summary') or '')[:120]
        print(f' - {ts} {typ} status={st} {summ}')
        ed = r.get('error_detail')
        if ed:
            print(' error_detail:', str(ed)[:300])
except Exception as e:
    print(' (activity JSON parse error:', e, ')')
"
    } | redact_secrets 2>/dev/null || true
  fi

  log "── END DIAGNOSTIC ──"
}
# a2a_assert_real_completion <agent_text> <expected_token> <context_label>
# The CORE gate. Asserts the agent text:
# (a) does NOT contain any error-as-text marker (broken-agent trap), AND
+39
View File
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
"""Redact credential-looking values from stdin to stdout.
Used by staging E2E diagnostic emitters so run logs stay secret-safe.
Preserves HTTP status codes and non-secret error context.
"""
import re
import sys
# An optional double quote, either plain (") or JSON-escaped (\").  Error
# text that embeds a JSON document inside a JSON string carries the escaped
# form, so key/value patterns must accept both.
_QUOTE = r'(?:\\?")?'

# A credential value: everything up to the next quote, whitespace, comma or
# closing bracket.  The lazy match plus lookahead leaves the backslash of a
# closing escaped quote (\") in place instead of swallowing it.
_VALUE = r'[^"\s,}\]]+?(?=\\?["\s,}\]]|$)'

# Explicitly named credential keys (JSON/YAML/env-style).
_KNOWN_KEYS = (
    r"ANTHROPIC_AUTH_TOKEN|ANTHROPIC_API_KEY|MINIMAX_API_KEY|OPENAI_API_KEY"
    r"|CLIENT_SECRET|ACCESS_TOKEN|GITHUB_TOKEN|GITEA_TOKEN"
    r"|MOLECULE_[A-Z_]*_(?:TOKEN|SECRET|KEY)"
)

# Generic key suffixes.  No leading wildcard is needed: a match may start in
# the middle of an identifier, so FOO_API_KEY, x-api-key and refresh_token
# are all covered by their suffix.
_GENERIC_KEYS = r"API[_-]?KEY|AUTH_TOKEN|TOKEN|SECRET|PASSWORD"


def _key_value_pattern(key_names: str) -> str:
    """Return a regex matching ``<key>[:=]<value>`` for *key_names*.

    Group 1 captures the key, separator and opening quote so the
    replacement can keep them and drop only the value.
    """
    return (
        "((?:" + key_names + ")" + _QUOTE + r"\s*[:=]\s*" + _QUOTE + ")" + _VALUE
    )


# (compiled pattern, replacement) pairs, applied in order.  Compiled once at
# import time; every pattern is case-insensitive.
_PATTERNS = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        # JSON-style "Authorization": "<scheme> <credentials>" (plain or
        # escaped quotes): redact the whole quoted value, whatever the scheme.
        (
            r'((?:\\?")Authorization(?:\\?")\s*:\s*(?:\\?"))'
            r'(?:[^"\\\n]|\\[^"\n])*',
            r"\1<REDACTED>",
        ),
        # Header-style Authorization value (consume the rest of the line).
        (r"(Authorization\s*[:=]\s*)[^\n]*", r"\1<REDACTED>"),
        # Bare Bearer token (e.g. in bodies or query values).  The class is
        # the RFC 6750 b64token alphabet, including '+' and '~'.
        (r"(Bearer\s+)[A-Za-z0-9_\-./=+~]+", r"\1<REDACTED>"),
        # Known credential key names.
        (_key_value_pattern(_KNOWN_KEYS), r"\1<REDACTED>"),
        # Generic *_API_KEY / *-api-key / *_TOKEN / *_SECRET / *_PASSWORD.
        (_key_value_pattern(_GENERIC_KEYS), r"\1<REDACTED>"),
        # URL query params that commonly carry credentials.
        (
            r"([?&](?:token|api[_-]?key|auth|secret|client_secret|access_token|password)=)[^&\s\"\']+",
            r"\1<REDACTED>",
        ),
    )
)


def redact(text: str) -> str:
    """Return *text* with credential-looking values replaced by ``<REDACTED>``.

    Covers Authorization headers (header-style and JSON-style, any scheme),
    bare Bearer tokens, well-known and generic credential key names in
    JSON/YAML/env form (with plain or JSON-escaped quotes), and URL query
    parameters that commonly carry credentials.  Text that matches none of
    the patterns is returned unchanged, so HTTP status codes and ordinary
    error messages stay readable.
    """
    for pattern, replacement in _PATTERNS:
        text = pattern.sub(replacement, text)
    return text


if __name__ == "__main__":
    # Filter mode: read all of stdin, write the redacted text to stdout.
    sys.stdout.write(redact(sys.stdin.read()))
+50
View File
@@ -106,6 +106,56 @@ else
echo "FAIL: error marker NOT detected in 'An Exception occurred'"; FAIL=$((FAIL + 1))
fi
# ---- redact_secrets (diagnostic-output safety) ----
# redact_check <description> <input> <must_not_contain>
#   Pipes <input> through redact_secrets and records PASS (in $PASS) when the
#   literal string <must_not_contain> is absent from the output, FAIL (in
#   $FAIL) otherwise. A redactor that exits non-zero, or that prints nothing
#   for a non-empty input, is also a FAIL: otherwise a broken redactor would
#   produce empty output and every "secret is absent" check would pass
#   vacuously.
redact_check() {
  local desc="$1"
  local input="$2"
  local must_not_contain="$3"
  local output
  if ! output=$(printf '%s' "$input" | redact_secrets); then
    echo "FAIL: $desc — redact_secrets exited with an error"
    FAIL=$((FAIL + 1))
  elif [ -z "$output" ] && [ -n "$input" ]; then
    echo "FAIL: $desc — redact_secrets produced no output"
    FAIL=$((FAIL + 1))
  # "--" keeps a needle that starts with "-" from being parsed as an option.
  elif printf '%s' "$output" | grep -qF -- "$must_not_contain"; then
    echo "FAIL: $desc — secret/token leaked in redacted output"
    FAIL=$((FAIL + 1))
  else
    echo "PASS: $desc (secret redacted)"
    PASS=$((PASS + 1))
  fi
}
# Negative cases: the secret given as the third argument must not survive
# redaction of the second argument.
redact_check "Authorization header value redacted" \
  "Authorization: Bearer sk-ant-abc123XYZ" \
  "sk-ant-abc123XYZ"
redact_check "known API key redacted" \
  '{"ANTHROPIC_API_KEY":"sk-ant-abc123","status":"ok"}' \
  "sk-ant-abc123"
# MINIMAX_API_KEY is on the explicit known-key list, so this case exercises
# the known-key pattern in env-style (KEY=value) form.
redact_check "known env-style API key redacted" \
  'MINIMAX_API_KEY=mini-max-secret-token' \
  "mini-max-secret-token"
# A key name that is NOT on the known-key list, so only the generic
# *_TOKEN pattern can catch it.
redact_check "generic *_TOKEN redacted" \
  'CUSTOM_SERVICE_TOKEN=custom-svc-credential' \
  "custom-svc-credential"
redact_check "URL query token redacted" \
  "https://api.example.com/v1?token=supersecrettoken&status=400" \
  "supersecrettoken"
# _ResultError diagnostic path: the runtime surfaces upstream errors as text,
# and that text can embed Authorization headers or API keys. Redaction must
# scrub them without removing the useful failure classification/status.
# The input is single-quoted, so the backslashes are literal: this is JSON
# as it appears when embedded inside another JSON string.
redact_check "_ResultError payload with embedded token redacted" \
  'Agent error (_ResultError): HTTP 401 {\"error\":\"invalid auth\", \"Authorization\":\"Bearer sk-ant-leaked\"}' \
  "sk-ant-leaked"
if printf '%s' 'Agent error (_ResultError): HTTP 401 {"error":"invalid auth"}' | redact_secrets | grep -qF 'HTTP 401'; then
  echo "PASS: _ResultError redaction preserves HTTP status context"
  PASS=$((PASS + 1))
else
  echo "FAIL: _ResultError redaction stripped useful HTTP status context"
  FAIL=$((FAIL + 1))
fi
# Positive: non-secret context (HTTP status, error message) must survive.
if printf '%s' '{"status":401,"error":"invalid key"}' | redact_secrets | grep -qF '"status":401'; then
  echo "PASS: redaction preserves non-secret HTTP status context"
  PASS=$((PASS + 1))
else
  echo "FAIL: redaction stripped useful non-secret context"
  FAIL=$((FAIL + 1))
fi
echo ""
echo "=== Results: $PASS passed, $FAIL failed ==="
# The script's exit status: success only when no check failed.
[ "$FAIL" -eq 0 ]
+9
View File
@@ -1350,6 +1350,15 @@ if echo "$AGENT_TEXT" | grep -qiF "message contained no text content"; then
fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (the claude-code MiniMax-BYOK default is the BARE registered id MiniMax-M2.7 — the colon minimax:MiniMax-M2.7 is UNREGISTERED on claude-code, internal#718) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT"
fi
# _ResultError is the claude-code runtime surfacing an LLM/backend/runtime
# failure AS text. Diagnose it explicitly (#2712) so the next canary run
# prints the upstream error instead of forcing operators to scrape workspace
# logs. The suite still fails; this is diagnostics-only.
if echo "$AGENT_TEXT" | grep -qiF "_ResultError"; then
  diagnose_staging_result_error "$PARENT_ID" "$A2A_RESP" "A2A parent _ResultError"
  # Fail closed: if the redactor cannot run, withhold the text rather than
  # print it raw, and still reach fail() below.
  _redacted_agent_text=$(printf '%s' "$AGENT_TEXT" | redact_secrets) \
    || _redacted_agent_text="(redaction failed; raw agent text withheld)"
  fail "A2A — STAGING LLM/BACKEND/RUNTIME FAILURE (_ResultError). The canary agent surfaced its LLM/backend/runtime error as a text payload. See the diagnostic burst above for the full A2A response, workspace state, and recent activity logs (including any upstream HTTP status/body the runtime reported). Raw (redacted): ${_redacted_agent_text:0:500}"
fi
# Generic catch-all — falls through if none of the known regressions hit.
# Error text can embed upstream headers or keys, so it is redacted here too
# before it reaches the run log.
if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then
  _redacted_agent_text=$(printf '%s' "$AGENT_TEXT" | redact_secrets) \
    || _redacted_agent_text="(redaction failed; raw agent text withheld)"
  fail "A2A returned an error-shaped response: ${_redacted_agent_text}"
fi