# redact_secrets
#   Filter: reads stdin and writes it to stdout with credential-looking
#   values redacted. Used by diagnostic emitters so run logs stay secret-safe.
#   Covers Authorization/Bearer headers, common key names, generic *_API_KEY /
#   *_TOKEN / *_SECRET values, URL query credential params, and claude-code
#   SDK-style credential keys. Preserves HTTP status codes and non-secret
#   error context.
#   The actual matching lives in redact_secrets.py, located next to this file.
#   Returns non-zero (emitting nothing) when the library directory cannot be
#   resolved, so a broken setup never passes text through unredacted.
redact_secrets() {
  local lib_dir
  # CDPATH is cleared for this one command: with CDPATH set, `cd` may print the
  # resolved directory to stdout, which would corrupt the captured path.
  lib_dir="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" || return 1
  python3 "$lib_dir/redact_secrets.py"
}

# diagnose_staging_result_error
#   Diagnostic-only helper for molecule-core#2712. When the canary agent
#   returns a _ResultError / error-as-text payload, the RUN OUTPUT must show
#   WHY the LLM/backend/runtime call failed, not just the wrapped error string.
#   Emits (via redact_secrets):
#     - the full A2A response JSON (so upstream HTTP status/body can be read)
#     - the workspace's status, runtime_state, and last_sample_error
#     - recent activity_logs rows (error_detail, status, summary)
#   This does NOT change pass/fail semantics — the caller still fail()s.
# Print a redacted diagnostic burst for a failed staging A2A call.
#   $1  workspace id (may be empty: the workspace/activity lookups are skipped)
#   $2  raw A2A response body
#   $3  optional context label for the banner (default: A2A)
# Every section is best-effort: a failure while fetching, parsing, or redacting
# must never abort the caller before it reaches its own fail().
diagnose_staging_result_error() {
  local ws_id="${1:-}"
  local a2a_resp="${2:-}"
  local ctx="${3:-A2A}"

  log "── DIAGNOSTIC BURST ($ctx — staging LLM/backend/runtime failure) ──"

  log "Full A2A response (redacted JSON):"
  # Pretty-print when the body is valid JSON; otherwise fall back to the raw
  # text. Guarded with `|| true` like the sections below so a redaction failure
  # cannot change the caller's pass/fail flow.
  {
    printf '%s\n' "$a2a_resp" | python3 -m json.tool 2>/dev/null || printf '%s\n' "$a2a_resp"
  } | redact_secrets || true

  if [ -n "$ws_id" ]; then
    log "Workspace $ws_id snapshot:"
    local ws_json
    ws_json=$(tenant_call GET "/workspaces/$ws_id" 2>/dev/null || echo '{}')
    {
      # str() keeps the slice valid even when last_sample_error is not a string.
      printf '%s\n' "$ws_json" | python3 -c "
import json, sys
try:
    d = json.load(sys.stdin)
    print(' status :', d.get('status', '?'))
    print(' runtime_state :', d.get('runtime_state', '?'))
    print(' url :', d.get('url', '?'))
    print(' last_sample_error:', str(d.get('last_sample_error') or '')[:500])
except Exception as e:
    print(' (workspace JSON parse error:', e, ')')
"
    } | redact_secrets 2>/dev/null || true

    log "Recent activity logs for $ws_id:"
    local activity_json
    activity_json=$(tenant_call GET "/activity?workspace_id=$ws_id&limit=20" 2>/dev/null || echo '[]')
    {
      # Only the first 10 rows are printed; summary is coerced with str() the
      # same way error_detail is, so a non-string value cannot abort the loop.
      printf '%s\n' "$activity_json" | python3 -c "
import json, sys
try:
    rows = json.load(sys.stdin)
    for r in rows[:10]:
        ts = r.get('created_at', '?')
        typ = r.get('activity_type', '?')
        st = r.get('status', '?')
        summ = str(r.get('summary') or '')[:120]
        print(f' - {ts} {typ} status={st} {summ}')
        ed = r.get('error_detail')
        if ed:
            print(' error_detail:', str(ed)[:300])
except Exception as e:
    print(' (activity JSON parse error:', e, ')')
"
    } | redact_secrets 2>/dev/null || true
  fi

  log "── END DIAGNOSTIC ──"
}

# a2a_assert_real_completion
#   The CORE gate.
#!/usr/bin/env python3
"""Redact credential-looking values from stdin to stdout.

Used by staging E2E diagnostic emitters so run logs stay secret-safe.
Preserves HTTP status codes and non-secret error context.

Every redacted value is replaced by the ``REDACTED`` marker so a reader can
tell "a secret was here" apart from "the value was empty". The marker only
contains characters that the value patterns below consume whole, which keeps
``redact`` idempotent (re-running it on its own output changes nothing).
"""
import re
import sys

REDACTED = "***REDACTED***"

# A double quote, optionally backslash-escaped. JSON embedded inside a JSON
# string (e.g. an upstream error body inside an A2A text part) shows up as
# \"key\":\"value\", so quotes must be recognised at any escape depth.
_QUOTE = r'\\*"'
_OPT_QUOTE = r'(?:\\*")?'
_SEP = r"\s*[:=]\s*"

# Known credential key names first (explicit for readers), then the generic
# *_API_KEY / *_AUTH_TOKEN / *_TOKEN / *_SECRET / *_PASSWORD suffix rule.
_KEY_NAME = (
    r"(?:ANTHROPIC_AUTH_TOKEN|ANTHROPIC_API_KEY|MINIMAX_API_KEY|OPENAI_API_KEY"
    r"|CLIENT_SECRET|ACCESS_TOKEN|GITHUB_TOKEN|GITEA_TOKEN"
    r"|MOLECULE_[A-Z_]*_(?:TOKEN|SECRET|KEY)"
    r"|[A-Z_]*(?:API_KEY|AUTH_TOKEN|TOKEN|SECRET|PASSWORD))"
)

# Unquoted value: runs to whitespace, a quote, or a JSON delimiter. An "&" is
# part of the value unless it starts the next query parameter ("&name="), so
# "?token=abc&status=400" keeps its non-secret "&status=400" tail.
_BARE_VALUE = r'(?:[^"\s,}\]&]|&(?![A-Za-z0-9_.\-]+=))+'

# Authorization header / JSON key. A quoted value is consumed up to its
# closing quote (keeping the rest of a single-line JSON document); a raw
# header value is consumed to end of line.
_AUTH_HEADER = re.compile(
    "(Authorization" + _OPT_QUOTE + _SEP + ")"
    + "(?:(" + _QUOTE + ")" + r'[^"\n]*?(?=' + _QUOTE + ")"
    + r"|[^\n]*)",
    re.IGNORECASE,
)

# Bare Bearer token (e.g. in bodies or query values); "+" and "~" are valid
# token68 characters and must not terminate the match.
_BEARER = re.compile(r"(Bearer\s+)[A-Za-z0-9_\-.~+/=]+", re.IGNORECASE)

# Credential key/value pairs (JSON / YAML / env style). First alternative: a
# quoted, non-empty value is redacted whole, spaces and commas included.
# Second alternative: the unquoted form. The negative lookahead stops it from
# treating the closing quote of an empty escaped value (\"\") as a secret.
_KEY_VALUE = re.compile(
    "(" + _OPT_QUOTE + _KEY_NAME + _OPT_QUOTE + _SEP + ")"
    + "(?:(" + _QUOTE + ")" + r'[^"\\\n][^"\n]*?(?=' + _QUOTE + ")"
    + "|(" + _OPT_QUOTE + ")(?!" + _QUOTE + ")" + _BARE_VALUE
    + ")",
    re.IGNORECASE,
)

# URL query params that commonly carry credentials.
_QUERY_PARAM = re.compile(
    r"""([?&](?:token|api[_-]?key|auth|secret|client_secret|access_token|password)=)[^&\s"']+""",
    re.IGNORECASE,
)

# Applied in order; each pattern's capture groups are the text to keep.
_PATTERNS = (_AUTH_HEADER, _BEARER, _KEY_VALUE, _QUERY_PARAM)


def _mask(match: "re.Match[str]") -> str:
    """Return the kept prefix (and any opening quote) followed by the marker."""
    kept = "".join(group for group in match.groups() if group)
    return kept + REDACTED


def redact(text: str) -> str:
    """Return *text* with credential-looking values replaced by ``REDACTED``.

    Key names, separators, quotes, and surrounding non-secret context (HTTP
    status codes, error messages, other query parameters) are left intact.
    Empty credential values are left untouched.
    """
    for pattern in _PATTERNS:
        text = pattern.sub(_mask, text)
    return text


if __name__ == "__main__":
    sys.stdout.write(redact(sys.stdin.read()))
# redact_check <description> <input> <must_not_contain>
#   Pipes <input> through redact_secrets and records PASS when the literal
#   string <must_not_contain> is absent from the result, FAIL otherwise.
#   A redactor that exits non-zero or emits nothing is also a FAIL: an empty
#   output trivially "contains no secret" and would otherwise pass vacuously.
redact_check() {
  local desc="$1"
  local input="$2"
  local must_not_contain="$3"
  local output
  if ! output=$(printf '%s' "$input" | redact_secrets) || [ -z "$output" ]; then
    echo "FAIL: $desc — redact_secrets failed or produced no output"
    FAIL=$((FAIL + 1))
    return 0
  fi
  # `--` keeps a needle that starts with "-" from being parsed as an option.
  if printf '%s' "$output" | grep -qF -- "$must_not_contain"; then
    echo "FAIL: $desc — secret/token leaked in redacted output"
    FAIL=$((FAIL + 1))
  else
    echo "PASS: $desc (secret redacted)"
    PASS=$((PASS + 1))
  fi
}

redact_check "Authorization header value redacted" \
  "Authorization: Bearer sk-ant-abc123XYZ" \
  "sk-ant-abc123XYZ"
redact_check "known API key redacted" \
  '{"ANTHROPIC_API_KEY":"sk-ant-abc123","status":"ok"}' \
  "sk-ant-abc123"
redact_check "known env-style API key redacted" \
  'MINIMAX_API_KEY=mini-max-secret-token' \
  "mini-max-secret-token"
# Uses a key name that is NOT on the known-name list so the generic suffix
# rule itself is exercised.
redact_check "generic *_TOKEN redacted" \
  'WIDGET_SERVICE_TOKEN=widget-secret-token' \
  "widget-secret-token"
redact_check "URL query token redacted" \
  "https://api.example.com/v1?token=supersecrettoken&status=400" \
  "supersecrettoken"
# _ResultError diagnostic path: the runtime surfaces upstream errors as text,
# and that text can embed Authorization headers or API keys. Redaction must
# scrub them without removing the useful failure classification/status.
redact_check "_ResultError payload with embedded token redacted" \
  'Agent error (_ResultError): HTTP 401 {\"error\":\"invalid auth\", \"Authorization\":\"Bearer sk-ant-leaked\"}' \
  "sk-ant-leaked"
if printf '%s' 'Agent error (_ResultError): HTTP 401 {"error":"invalid auth"}' | redact_secrets | grep -qF 'HTTP 401'; then
  echo "PASS: _ResultError redaction preserves HTTP status context"
  PASS=$((PASS + 1))
else
  echo "FAIL: _ResultError redaction stripped useful HTTP status context"
  FAIL=$((FAIL + 1))
fi
# Positive: non-secret context (HTTP status, error message) must survive.
if printf '%s' '{"status":401,"error":"invalid key"}' | redact_secrets | grep -qF '"status":401'; then
  echo "PASS: redaction preserves non-secret HTTP status context"
  PASS=$((PASS + 1))
else
  echo "FAIL: redaction stripped useful non-secret context"
  FAIL=$((FAIL + 1))
fi

echo ""
echo "=== Results: $PASS passed, $FAIL failed ==="
[ "$FAIL" -eq 0 ]

# ---- tests/e2e/test_staging_full_saas.sh (excerpt) ----
if echo "$AGENT_TEXT" | grep -qiF "message contained no text content"; then
  fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (the claude-code MiniMax-BYOK default is the BARE registered id MiniMax-M2.7 — the colon minimax:MiniMax-M2.7 is UNREGISTERED on claude-code, internal#718) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT"
fi
# _ResultError is the claude-code runtime surfacing an LLM/backend/runtime
# failure AS text. Diagnose it explicitly (#2712) so the next canary run
# prints the upstream error instead of forcing operators to scrape workspace
# logs. The suite still fails; this is diagnostics-only.
if echo "$AGENT_TEXT" | grep -qiF "_ResultError"; then
  diagnose_staging_result_error "$PARENT_ID" "$A2A_RESP" "A2A parent _ResultError"
  # If redaction itself fails, withhold the text rather than print it raw or
  # lose the failure message below.
  _redacted_agent_text=$(printf '%s' "$AGENT_TEXT" | redact_secrets) \
    || _redacted_agent_text="(redaction failed; agent text withheld)"
  fail "A2A — STAGING LLM/BACKEND/RUNTIME FAILURE (_ResultError). The canary agent surfaced its LLM/backend/runtime error as a text payload. See the diagnostic burst above for the full A2A response, workspace state, and recent activity logs (including any upstream HTTP status/body the runtime reported). Raw (redacted): ${_redacted_agent_text:0:500}"
fi
# Generic catch-all — falls through if none of the known regressions hit.
# The agent text is redacted here too: error-shaped payloads are where
# upstream auth headers and API keys are most likely to be echoed back.
if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then
  fail "A2A returned an error-shaped response: $(printf '%s' "$AGENT_TEXT" | redact_secrets)"
fi