2026-06-13 08:58:38 +00:00
4 changed files with 174 additions and 0 deletions
@@ -57,6 +57,82 @@ a2a_completion_error_marker() {
  return 1
 }

+# redact_secrets
+#   Filter: copies stdin to stdout, swapping credential-looking values for
+#   <REDACTED> so diagnostic emitters cannot leak secrets into run logs.
+#   The matching rules live in redact_secrets.py, resolved next to this
+#   file; HTTP status codes and non-secret error context pass through.
+redact_secrets() {
+  local here
+  here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || true
+  python3 "${here}/redact_secrets.py"
+}
+
+# diagnose_staging_result_error <workspace_id> <a2a_response> <context_label>
+#   Diagnostic-only helper for molecule-core#2712. When the canary agent
+#   returns a _ResultError / error-as-text payload, the RUN OUTPUT must show
+#   WHY the LLM/backend/runtime call failed, not just the wrapped error string.
+#   Emits (via redact_secrets):
+#     - the full A2A response JSON (so upstream HTTP status/body can be read)
+#     - the workspace's status, runtime_state, url, and last_sample_error
+#     - recent activity_logs rows (error_detail, status, summary)
+#   Every emitter is best-effort; pass/fail is unchanged — the caller still fail()s.
+diagnose_staging_result_error() {
+  local ws_id="$1"
+  local a2a_resp="$2"
+  local ctx="${3:-A2A}"
+
+  log "── DIAGNOSTIC BURST ($ctx — staging LLM/backend/runtime failure) ──"
+
+  log "Full A2A response (redacted JSON):"
+  {
+    printf '%s\n' "$a2a_resp" | python3 -m json.tool 2>/dev/null || printf '%s\n' "$a2a_resp"
+  } | redact_secrets 2>/dev/null || true  # best-effort, same guard as the emitters below
+
+  if [ -n "$ws_id" ]; then
+    log "Workspace $ws_id snapshot:"
+    local ws_json
+    ws_json=$(tenant_call GET "/workspaces/$ws_id" 2>/dev/null || echo '{}')
+    {
+      printf '%s\n' "$ws_json" | python3 -c "
+import json, sys
+try:
+    d = json.load(sys.stdin)
+    print('  status          :', d.get('status', '?'))
+    print('  runtime_state   :', d.get('runtime_state', '?'))
+    print('  url             :', d.get('url', '?'))
+    print('  last_sample_error:', (d.get('last_sample_error') or '')[:500])
+except Exception as e:
+    print('  (workspace JSON parse error:', e, ')')
+"
+    } | redact_secrets 2>/dev/null || true
+
+    log "Recent activity logs for $ws_id:"
+    local activity_json
+    activity_json=$(tenant_call GET "/activity?workspace_id=$ws_id&limit=20" 2>/dev/null || echo '[]')
+    {
+      printf '%s\n' "$activity_json" | python3 -c "
+import json, sys
+try:
+    rows = json.load(sys.stdin)
+    for r in rows[:10]:
+        ts = r.get('created_at', '?')
+        typ = r.get('activity_type', '?')
+        st = r.get('status', '?')
+        summ = (r.get('summary') or '')[:120]
+        print(f'  - {ts} {typ} status={st} {summ}')
+        ed = r.get('error_detail')
+        if ed:
+            print('    error_detail:', str(ed)[:300])
+except Exception as e:
+    print('  (activity JSON parse error:', e, ')')
+"
+    } | redact_secrets 2>/dev/null || true
+  fi
+
+  log "── END DIAGNOSTIC ──"
+}
+
 # a2a_assert_real_completion <agent_text> <expected_token> <context_label>
 #   The CORE gate. Asserts the agent text:
 #     (a) does NOT contain any error-as-text marker (broken-agent trap), AND
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+"""Redact credential-looking values from stdin to stdout.
+
+Used by staging E2E diagnostic emitters so run logs stay secret-safe.
+Preserves HTTP status codes and non-secret error context.
+"""
+import re
+import sys
+
+
+def redact(text: str) -> str:
+    """Return *text* with credential-looking values replaced by ``<REDACTED>``.
+
+    Keys may be bare, quoted, or backslash-quoted (JSON nested in a string).
+    Quoted values are scrubbed up to the closing quote, so spaces or commas
+    inside them cannot leak; unmatched text (HTTP status, errors) is kept.
+    """
+    quote = r'(?:\\*")?'  # optional double quote, possibly backslash-escaped
+    # Quoted value minus its closing quote: \"... (escaped) or "... (plain).
+    quoted = r'(\\+")(?:(?!\2)[^\n])*|(")(?:\\.|[^"\\\n])*'
+    known = r"ANTHROPIC_AUTH_TOKEN|ANTHROPIC_API_KEY|MINIMAX_API_KEY|OPENAI_API_KEY|CLIENT_SECRET|ACCESS_TOKEN|GITHUB_TOKEN|GITEA_TOKEN|MOLECULE_[A-Z_]*_(?:TOKEN|SECRET|KEY)"
+    generic = r"[A-Z_]*(?:API_KEY|AUTH_TOKEN|TOKEN|SECRET|PASSWORD)"
+    patterns = [
+        # Authorization header: a quoted value, else the rest of the line.
+        (rf"(?i)(Authorization{quote}\s*[:=]\s*)(?:{quoted}|[^\n]*)", r"\1\2\3<REDACTED>"),
+        # Bare Bearer token; charset covers RFC 6750 b64token (incl. + and ~).
+        (r"(?i)(Bearer\s+)[A-Za-z0-9_\-./+~=]+", r"\1<REDACTED>"),
+        # Known and generic *_API_KEY / *_TOKEN / *_SECRET / *_PASSWORD keys.
+        (rf"(?i)({quote}(?:{known}|{generic}){quote}\s*[:=]\s*)(?:{quoted}|" + r'[^"\s,}\]]+)', r"\1\2\3<REDACTED>"),
+        # URL query params that commonly carry credentials
+        (r"(?i)([?&](?:token|api[_-]?key|auth|secret|client_secret|access_token|password)=)[^&\s\"\']+", r"\1<REDACTED>"),
+    ]
+    for pat, repl in patterns:
+        text = re.sub(pat, repl, text)
+    return text
+
+
+if __name__ == "__main__":  # stdin -> stdout filter; nothing is appended to the output
+    print(redact(sys.stdin.read()), end="")
@@ -106,6 +106,56 @@ else
  echo "FAIL: error marker NOT detected in 'An Exception occurred'"; FAIL=$((FAIL + 1))
 fi

+# ---- redact_secrets (diagnostic-output safety) ----
+# redact_check <desc> <input> <must_not_contain>
+#   Pipes <input> through redact_secrets. FAILs if the secret survives or if
+#   no <REDACTED> marker appears (empty/crashed redactor is not a pass).
+redact_check() {
+  local desc="$1" input="$2" must_not_contain="$3" output
+  output=$(printf '%s' "$input" | redact_secrets) || true
+  if printf '%s' "$output" | grep -qF "$must_not_contain"; then
+    echo "FAIL: $desc — secret/token leaked in redacted output"; FAIL=$((FAIL + 1))
+  elif ! printf '%s' "$output" | grep -qF '<REDACTED>'; then
+    echo "FAIL: $desc — no <REDACTED> marker in output (redactor failed?)"; FAIL=$((FAIL + 1))
+  else
+    echo "PASS: $desc (secret redacted)"; PASS=$((PASS + 1))
+  fi
+}
+
+redact_check "Authorization header value redacted" \
+  "Authorization: Bearer sk-ant-abc123XYZ" \
+  "sk-ant-abc123XYZ"
+redact_check "known API key redacted" \
+  '{"ANTHROPIC_API_KEY":"sk-ant-abc123","status":"ok"}' \
+  "sk-ant-abc123"
+redact_check "generic *_TOKEN redacted" \
+  'ACME_SERVICE_TOKEN=mini-max-secret-token' \
+  "mini-max-secret-token"
+redact_check "URL query token redacted" \
+  "https://api.example.com/v1?token=supersecrettoken&status=400" \
+  "supersecrettoken"
+# _ResultError diagnostic path: the runtime surfaces upstream errors as text,
+# and that text can embed Authorization headers or API keys. Redaction must
+# scrub them without removing the useful failure classification/status.
+redact_check "_ResultError payload with embedded token redacted" \
+  'Agent error (_ResultError): HTTP 401 {\"error\":\"invalid auth\", \"Authorization\":\"Bearer sk-ant-leaked\"}' \
+  "sk-ant-leaked"
+if printf '%s' 'Agent error (_ResultError): HTTP 401 {"error":"invalid auth"}' | redact_secrets | grep -qF 'HTTP 401'; then
+  echo "PASS: _ResultError redaction preserves HTTP status context"
+  PASS=$((PASS + 1))
+else
+  echo "FAIL: _ResultError redaction stripped useful HTTP status context"
+  FAIL=$((FAIL + 1))
+fi
+# Positive: non-secret context (HTTP status, error message) must survive.
+if printf '%s' '{"status":401,"error":"invalid key"}' | redact_secrets | grep -qF '"status":401'; then
+  echo "PASS: redaction preserves non-secret HTTP status context"
+  PASS=$((PASS + 1))
+else
+  echo "FAIL: redaction stripped useful non-secret context"
+  FAIL=$((FAIL + 1))
+fi
+
 echo ""
 echo "=== Results: $PASS passed, $FAIL failed ==="
 [ "$FAIL" -eq 0 ]
@@ -1350,6 +1350,15 @@ if echo "$AGENT_TEXT" | grep -qiF "message contained no text content"; then
  fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (the claude-code MiniMax-BYOK default is the BARE registered id MiniMax-M2.7 — the colon minimax:MiniMax-M2.7 is UNREGISTERED on claude-code, internal#718) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT"
 fi
 # Generic catch-all — falls through if none of the known regressions hit.
+# _ResultError is the claude-code runtime surfacing an LLM/backend/runtime
+# failure AS text. Diagnose it explicitly (#2712) so the next canary run
+# prints the upstream error instead of forcing operators to scrape logs.
+# Diagnostics-only: the suite still fails, and neither step may pre-empt fail().
+if echo "$AGENT_TEXT" | grep -qiF "_ResultError"; then
+  diagnose_staging_result_error "$PARENT_ID" "$A2A_RESP" "A2A parent _ResultError" || true
+  _redacted_agent_text=$(printf '%s' "$AGENT_TEXT" | redact_secrets) || _redacted_agent_text="<redaction failed; raw text withheld>"
+  fail "A2A — STAGING LLM/BACKEND/RUNTIME FAILURE (_ResultError). The canary agent surfaced its LLM/backend/runtime error as a text payload. See the diagnostic burst above for the full A2A response, workspace state, and recent activity logs (including any upstream HTTP status/body the runtime reported). Raw (redacted): ${_redacted_agent_text:0:500}"
+fi
 if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then
  fail "A2A returned an error-shaped response: $AGENT_TEXT"
 fi