From fc33cf11318aab26a19ca785cd9c6132384d82ea Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Sat, 2 May 2026 02:33:00 -0700
Subject: [PATCH 1/2] docs(a2a): correct misleading v1-tolerance comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to PR #2509/#2510. The defensive v1-detection branches in
extract_attached_files (Python) and extractFilesFromTask (TypeScript)
were merged with comments claiming they fix a "v0→v1 silent-drop" bug
that surfaced as the 2026-05-01 hongming "no text content" incident.
Live test disproved that hypothesis: a2a-sdk's JSON-RPC layer validates
inbound requests against the v0 Pydantic union, so v1 shapes are
rejected at the request boundary — the v1 detection branch is
unreachable on the JSON-RPC ingress path. The actual root cause of the
hongming incident was the missing /workspace chown fixed by CP PR #381
+ test #382.

Update the comments to honestly describe these branches as defensive
future-proofing (kept against an eventual SDK schema migration or
in-process callers that construct Parts directly from protobuf), not as
fixes for an observed bug. Also trim ChatTab.tsx's outbound-shape
comment block from ~21 lines to a 3-line pointer to the SDK union.

Comment-only change. No behavior change. 86 workspace tests + 91 canvas
tests still pass.
---
 canvas/src/components/tabs/ChatTab.tsx       | 27 +++---------
 .../components/tabs/chat/message-parser.ts   | 16 +++----
 workspace/executor_helpers.py                | 43 +++++++++++--------
 3 files changed, 38 insertions(+), 48 deletions(-)

diff --git a/canvas/src/components/tabs/ChatTab.tsx b/canvas/src/components/tabs/ChatTab.tsx
index 54f518b3..2fc0aedb 100644
--- a/canvas/src/components/tabs/ChatTab.tsx
+++ b/canvas/src/components/tabs/ChatTab.tsx
@@ -32,15 +32,11 @@ interface A2AFileRef {
   bytes?: string;
   size?: number;
 }
-// A2A Part — outbound matches the v0 Pydantic discriminated-union
-// shape that a2a-sdk's JSON-RPC layer validates against (TextPart |
-// FilePart | DataPart). The v1 flat-protobuf shape `{url, filename,
-// mediaType}` is internal SDK serialization only; sending it on the
-// wire fails Pydantic validation with `TextPart.text required,
-// FilePart.file required, DataPart.data required` and never reaches
-// the executor. Inbound also tolerates the v1 shape via
-// message-parser.ts since the agent itself may serialize as v1 in
-// some downstream tools.
+// Outbound shape matches a2a-sdk's JSON-RPC `SendMessageRequest`
+// Pydantic union (TextPart | FilePart | DataPart). The flat
+// protobuf shape `{url, filename, mediaType}` is rejected at the
+// request boundary with `Field required` errors — keep this
+// outbound shape unless a2a-sdk migrates the JSON-RPC schema.
 interface A2APart {
   kind: string;
   text?: string;
@@ -511,18 +507,7 @@ function MyChatPanel({ workspaceId, data }: Props) {
 
   // A2A parts: text part (if any) + file parts (per attachment). The
   // agent sees both in a single turn, matching the A2A spec shape.
-  //
-  // File parts use the v0 discriminated-union shape `{kind:"file",
-  // file:{...}}` because that's what a2a-sdk's JSON-RPC layer
-  // validates against (`SendMessageRequest.params.message.parts[]`
-  // → `TextPart | FilePart | DataPart` Pydantic union). Sending the
-  // v1 flat shape `{url, filename, mediaType}` returns
-  // `Invalid Request — TextPart.text required, FilePart.file
-  // required, DataPart.data required` and the message never
-  // reaches the executor. v1 protobuf is internal serialization
-  // only; the wire shape stays v0 until the SDK migrates the
-  // JSON-RPC schema. Text parts keep `{kind:"text", text}` for the
-  // same reason.
+  // Wire shape is v0 — see A2APart definition above.
   const parts: A2APart[] = [];
   if (text) parts.push({ kind: "text", text });
   for (const att of uploaded) {
diff --git a/canvas/src/components/tabs/chat/message-parser.ts b/canvas/src/components/tabs/chat/message-parser.ts
index 5c8cc6b6..d21842d0 100644
--- a/canvas/src/components/tabs/chat/message-parser.ts
+++ b/canvas/src/components/tabs/chat/message-parser.ts
@@ -41,15 +41,15 @@ export interface ParsedFilePart {
 
 /** Extract file parts from an A2A response. Walks parts[] + artifacts[].
  *
- * Tolerates both A2A protocol generations:
- * - v0 (Pydantic): `{ kind: "file", file: { name, mimeType, uri } }`
- * - v1 (protobuf): `{ url, filename, mediaType }` — flat, no `kind`
- *   and no nested `file` object (the v1 Part's content oneof is
- *   `{text, raw, url, data}`; file metadata sits at top level).
+ * Hot path: v0 Pydantic shape `{ kind: "file", file: { name, mimeType,
+ * uri } }` — what every current workspace runtime emits.
  *
- * Without v1 tolerance, agents that emit the v1 shape (every workspace
- * runtime since the SDK migration) silently drop file parts in chat —
- * the agent says "I sent the file" but the user never sees the chip.
+ * Defensive secondary path: v1 protobuf shape `{ url, filename,
+ * mediaType }` — flat, no `kind`, no nested `file`. Not currently
+ * observed on the wire (a2a-sdk's JSON-RPC layer still validates
+ * against v0), but kept so a future SDK release that flips the wire
+ * shape, or a third-party agent that round-trips through protobuf
+ * serialization, doesn't silently lose file chips.
 *
 * We only surface parts that carry a URL — inline bytes would require
 * a different renderer (data URL) and are out of scope for MVP. Names
diff --git a/workspace/executor_helpers.py b/workspace/executor_helpers.py
index d3f9d00a..95ac65fc 100644
--- a/workspace/executor_helpers.py
+++ b/workspace/executor_helpers.py
@@ -844,23 +844,27 @@ def resolve_attachment_uri(uri: str) -> str | None:
 def extract_attached_files(message: Any) -> list[dict[str, str]]:
     """Pull ``{name, mime_type, path}`` dicts out of an A2A message.
 
-    Tolerates three Part shapes seen in the wild:
+    Tolerates three Part shapes:
 
     1. a2a-sdk v0 Pydantic RootModel — ``part.root.kind == 'file'`` with
-       ``part.root.file.{uri,name,mimeType}``.
-    2. a2a-sdk v0 flatter shape — ``part.kind == 'file'`` with
-       ``part.file.{uri,name,mimeType}`` (some hand-built callers).
-    3. a2a-sdk v1 protobuf — ``part.url`` non-empty with
-       ``part.filename`` + ``part.media_type``. The v1 ``Part`` proto
-       has no ``kind`` field at all (the discriminator is now a oneof
-       ``content`` of {text, raw, url, data}). Without this branch a v1
-       file part — which is what a v1 server constructs from any caller
-       that JSON-encodes the v1 shape — silently parses to an empty
-       Part on the v0→v1 transition because protobuf json_format with
-       ``ignore_unknown_fields=True`` drops the legacy ``kind`` and
-       ``file`` keys, surfacing as the user-visible
-       "Error: message contained no text content" on image-only chats
-       (2026-05-01 hongming incident).
+       ``part.root.file.{uri,name,mimeType}``. The hot path; this is
+       what every current caller produces (canvas chat, A2A peer
+       delegations, agent self-attached files).
+    2. v0 flatter shape — ``part.kind == 'file'`` with
+       ``part.file.{uri,name,mimeType}``. Some hand-built callers
+       (older test fixtures, third-party clients) emit this.
+    3. v1 protobuf — ``part.url`` non-empty with ``part.filename`` +
+       ``part.media_type``. **Defensive future-proofing only.** The
+       v1 ``Part`` proto exists in a2a-sdk's ``a2a.types.a2a_pb2`` but
+       a2a-sdk's JSON-RPC layer still validates inbound requests
+       against the v0 Pydantic discriminated union (TextPart |
+       FilePart | DataPart), so a v1 wire shape is rejected at the
+       request boundary today — this branch is unreachable on the
+       JSON-RPC ingress path. Kept so a future SDK release that
+       flips the JSON-RPC schema doesn't silently regress this
+       helper, and so non-conformant in-process callers (e.g. a
+       template that constructs a Part directly from protobuf) get
+       handled correctly.
 
     Non-file parts and files with unresolvable URIs are skipped — the
     caller sees an empty list rather than a mix of valid and broken
@@ -884,10 +888,11 @@ def extract_attached_files(message: Any) -> list[dict[str, str]]:
             name = getattr(f, "name", "") or ""
             mime = getattr(f, "mimeType", None) or getattr(f, "mime_type", None) or ""
         else:
-            # v1 protobuf Part has no `kind`; detect by a non-empty
-            # `url` (the file/url-of-bytes oneof slot). Fall back to
-            # `media_type` then `mimeType` for the camelCase Pydantic
-            # variant some adapters still hand us.
+            # Defensive v1 path (see docstring): v1 Part has no `kind`,
+            # detect by a non-empty `url` (the file/url-of-bytes oneof
+            # slot). Fall back from snake_case `media_type` to
+            # camelCase `mediaType` for callers that hand us the
+            # Pydantic-style attribute name.
             v1_url = getattr(part, "url", "") or ""
             if not v1_url:
                 continue

From 8bf29b7d0e8a3fb5f3cd59e5b8adb02b46e55f93 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Sat, 2 May 2026 02:35:46 -0700
Subject: [PATCH 2/2] fix(sweep-cf-tunnels): parallelize deletes + raise
 workflow timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hourly Sweep stale Cloudflare Tunnels job got cancelled mid-cleanup
on 2026-05-02 (run 25248788312, killed at 5min after deleting 424/672
stale tunnels). A second manual dispatch finished the remaining 248
fine, so the immediate backlog cleared, but two underlying bugs would
re-trip on the next big cleanup.

Bug 1: serial delete loop. The execute branch was a `while read; do
curl -X DELETE; done` pipeline at ~0.7s/tunnel — fine for the
steady-state cleanup of a handful, but a 600+ backlog needs ~7-8min.
This commit fans out to $SWEEP_CONCURRENCY (default 8) workers via
`xargs -P 8 -L 1 -I {} bash -c '...' _ {} < "$DELETE_PLAN"`. With 8x
parallelism the same 600+ list drains in ~60s.

Notes:
- We use stdin (`<`) not GNU's `xargs -a FILE` so the script stays
  portable to BSD xargs (matters for local-runner testing on macOS).
- We pass ONLY the tunnel id on argv. xargs tokenizes on whitespace by
  default; tab-separating id+name on argv risks mangling. The name is
  kept in a side-channel id->name map ($NAME_MAP) and looked up by the
  worker only on failure, for FAIL_LOG readability.
- Workers print exactly `OK` or `FAIL` on stdout; tally with
  `grep -c '^OK$' / '^FAIL$'`.
- On non-zero FAILED, log the first 20 lines of $FAIL_LOG as
  "Failure detail (first 20):" — same diagnostic surface as before but
  consolidated so we don't spam logs on a flaky CF API.

Bug 2: the workflow's 5-min cap was set as a hangs-detector but turned
out to be a real-job-too-slow detector. Raised to 30 min — generous
headroom for the ~60s steady-state run while still surfacing genuine
hangs (and in line with the sweep-cf-orphans companion job).

Bug 3 (drive-by): the existing trap was `trap 'rm -rf "$PAGES_DIR"'
EXIT`, which would have been silently overwritten by any later trap
registration. Replaced with a single `cleanup()` function that wipes
PAGES_DIR + all four new tempfiles (DELETE_PLAN, NAME_MAP, FAIL_LOG,
RESULT_LOG), called once via `trap cleanup EXIT`.

Verification:
- bash -n scripts/ops/sweep-cf-tunnels.sh: clean
- shellcheck -S warning scripts/ops/sweep-cf-tunnels.sh: clean
- python3 yaml.safe_load on the workflow: clean
- Synthetic 30-line delete plan with every 7th id sentinel'd to return
  {"success":false}: TEST PASS, DELETED=26 FAILED=4, FAIL_LOG
  side-channel name lookup verified.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/sweep-cf-tunnels.yml | 20 ++++--
 scripts/ops/sweep-cf-tunnels.sh        | 97 +++++++++++++++++++++-----
 2 files changed, 96 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/sweep-cf-tunnels.yml b/.github/workflows/sweep-cf-tunnels.yml
index 3d29b44e..ae99eb07 100644
--- a/.github/workflows/sweep-cf-tunnels.yml
+++ b/.github/workflows/sweep-cf-tunnels.yml
@@ -47,10 +47,22 @@ jobs:
   sweep:
     name: Sweep CF tunnels
     runs-on: ubuntu-latest
-    # 5 min surfaces hangs (CF API stall, slow pagination on busy
-    # accounts). Realistic worst case is ~3 min: 2 CP curls + N CF
-    # list pages + N×CF-DELETE, each capped at 10-15s by curl -m.
-    timeout-minutes: 5
+    # 30 min cap. Was 5 min on the theory that the only thing that
+    # could take >5min is a CF-API hang — but on 2026-05-02 a backlog
+    # of 672 stale tunnels accumulated (large staging E2E run + delayed
+    # sweep) and the serial `curl -X DELETE` loop (~0.7s/tunnel) needed
+    # ~7-8min to drain. The 5-min cap killed the run mid-sweep
+    # (cancelled at 424/672, see run 25248788312); a manual rerun
+    # finished the remainder fine.
+    #
+    # The fix is two-part: parallelize the delete loop (8-way xargs in
+    # the script — see scripts/ops/sweep-cf-tunnels.sh), AND raise the
+    # cap so a one-off backlog doesn't trip a hangs-detector that
+    # turned out to be a real-job-too-slow detector. With 8-way
+    # parallelism a 600+ backlog drains in ~60s; 30 min is generous
+    # headroom for actual hangs to still surface (and is in line with
+    # the sweep-cf-orphans companion job).
+    timeout-minutes: 30
     env:
       CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
       CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
diff --git a/scripts/ops/sweep-cf-tunnels.sh b/scripts/ops/sweep-cf-tunnels.sh
index 826bd961..bf948940 100755
--- a/scripts/ops/sweep-cf-tunnels.sh
+++ b/scripts/ops/sweep-cf-tunnels.sh
@@ -102,7 +102,22 @@ log "Fetching Cloudflare tunnels..."
 # `python3: Argument list too long`. Disk-buffering also makes the
 # accumulator O(n) instead of O(n^2).
 PAGES_DIR=$(mktemp -d -t cf-tunnels-XXXXXX)
-trap 'rm -rf "$PAGES_DIR"' EXIT
+# Single cleanup() covering all tempfiles created downstream
+# ($DELETE_PLAN, $NAME_MAP, $FAIL_LOG, $RESULT_LOG). One trap call so a
+# later `trap '...' EXIT` doesn't silently overwrite an earlier one.
+DELETE_PLAN="" +NAME_MAP="" +FAIL_LOG="" +RESULT_LOG="" +cleanup() { + rm -rf "$PAGES_DIR" + [ -n "$DELETE_PLAN" ] && rm -f "$DELETE_PLAN" + [ -n "$NAME_MAP" ] && rm -f "$NAME_MAP" + [ -n "$FAIL_LOG" ] && rm -f "$FAIL_LOG" + [ -n "$RESULT_LOG" ] && rm -f "$RESULT_LOG" + return 0 +} +trap cleanup EXIT PAGE=1 while :; do page_file="$PAGES_DIR/page-$(printf '%05d' "$PAGE").json" @@ -241,27 +256,75 @@ for l in sys.stdin: fi # --- Execute deletes ------------------------------------------------------- +# +# Parallel delete loop. Was a serial `curl -X DELETE` while-loop; +# at ~0.7s/tunnel that meant 672 stale tunnels needed ~7-8 min, which +# tripped the workflow's 5-min timeout-minutes (run 25248788312, +# cancelled at 424/672). Fan out to $SWEEP_CONCURRENCY workers via +# xargs so a 600+ backlog drains in ~60s. +# +# Design notes: +# - Materialize the (id, name) plan to a tempfile for stdin'ing into +# xargs. xargs `-a FILE` is GNU-only; piping/`<` is portable to +# macOS/BSD xargs (matters for local testing). +# - Pass ONLY the id on argv. xargs tokenizes on whitespace by +# default; tab-separating id+name on argv risks mangling. We keep +# the name in a side-channel id→name map ($NAME_MAP) for failure +# log readability, and the worker also writes failure detail to +# $FAIL_LOG (`FAIL `) for grep-ability. +# - Workers print exactly `OK` or `FAIL` on stdout (one line per +# invocation); we tally with `grep -c '^OK$' / '^FAIL$'`. + +CONCURRENCY="${SWEEP_CONCURRENCY:-8}" +DELETE_PLAN=$(mktemp -t cf-tunnels-plan-XXXXXX) +NAME_MAP=$(mktemp -t cf-tunnels-names-XXXXXX) +FAIL_LOG=$(mktemp -t cf-tunnels-fail-XXXXXX) +RESULT_LOG=$(mktemp -t cf-tunnels-result-XXXXXX) + +# Build delete plan (just ids, one per line) and the side-channel +# id→name map (tab-separated). +echo "$DECISIONS" | python3 -c ' +import json, os, sys +plan_path = sys.argv[1] +map_path = sys.argv[2] +with open(plan_path, "w") as plan, open(map_path, "w") as nmap: + for line in sys.stdin: + d = json.loads(line) + if d.get("action") != "delete": + continue + tid = d["id"] + name = d.get("name", "") + plan.write(tid + "\n") + nmap.write(tid + "\t" + name + "\n") +' "$DELETE_PLAN" "$NAME_MAP" log "" -log "Executing $DELETE_COUNT deletions..." -DELETED=0 -FAILED=0 -while IFS= read -r line; do - action=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['action'])") - [ "$action" = "delete" ] || continue - tid=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['id'])") - name=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['name'])") - if curl -sS -m 10 -X DELETE \ - -H "Authorization: Bearer $CF_API_TOKEN" \ - "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel/$tid" \ - | grep -q '"success":true'; then - DELETED=$((DELETED+1)) +log "Executing $DELETE_COUNT deletions ($CONCURRENCY-way parallel)..." 
+ +export CF_API_TOKEN CF_ACCOUNT_ID NAME_MAP FAIL_LOG + +# shellcheck disable=SC2016 +xargs -P "$CONCURRENCY" -L 1 -I {} bash -c ' + tid="$1" + resp=$(curl -sS -m 10 -X DELETE \ + -H "Authorization: Bearer $CF_API_TOKEN" \ + "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel/$tid") + if printf "%s" "$resp" | grep -q "\"success\":true"; then + echo OK else - FAILED=$((FAILED+1)) - log " FAILED: $name ($tid)" + name=$(awk -F"\t" -v id="$tid" "\$1==id {print \$2; exit}" "$NAME_MAP") + echo FAIL + echo "FAIL $name $tid" >> "$FAIL_LOG" fi -done <<< "$DECISIONS" +' _ {} < "$DELETE_PLAN" > "$RESULT_LOG" + +DELETED=$(grep -c '^OK$' "$RESULT_LOG" || true) +FAILED=$(grep -c '^FAIL$' "$RESULT_LOG" || true) log "" log "Done. deleted=$DELETED failed=$FAILED" +if [ "$FAILED" -ne 0 ]; then + log "Failure detail (first 20):" + head -20 "$FAIL_LOG" | while IFS= read -r fl; do log " $fl"; done +fi [ "$FAILED" -eq 0 ]
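
---
A minimal local harness approximating the "synthetic 30-line delete
plan" verification above. This is a sketch, not the exact fixture: the
curl stub, the tunnel-N id scheme, and the every-7th-id-fails sentinel
rule are illustrative assumptions; the worker body and OK/FAIL tally
are taken from the diff (minus the `-L 1`, which `-I` already implies
on GNU xargs). Assumes bash, since an exported shell function is what
lets the stub shadow curl inside the `bash -c` workers.

#!/usr/bin/env bash
# Build a 30-id plan and id->name map, stub the CF API, then run the
# same xargs fan-out + tally as scripts/ops/sweep-cf-tunnels.sh.
set -u

PLAN=$(mktemp); NAMES=$(mktemp); FAIL=$(mktemp); RESULT=$(mktemp)
trap 'rm -f "$PLAN" "$NAMES" "$FAIL" "$RESULT"' EXIT

for i in $(seq 1 30); do
  echo "tunnel-$i" >> "$PLAN"
  printf 'tunnel-%d\ttest-name-%d\n' "$i" "$i" >> "$NAMES"
done

# Stub curl (assumption: ids ending in a multiple of 7 fail). The last
# argv element is the DELETE URL; everything else is ignored.
curl() {
  local url="${@: -1}" n
  n="${url##*-}"
  if [ $((n % 7)) -eq 0 ]; then
    echo '{"success":false}'
  else
    echo '{"success":true}'
  fi
}
export -f curl
export CF_API_TOKEN=dummy CF_ACCOUNT_ID=dummy NAME_MAP="$NAMES" FAIL_LOG="$FAIL"

xargs -P 8 -I {} bash -c '
  tid="$1"
  resp=$(curl -sS -m 10 -X DELETE \
    -H "Authorization: Bearer $CF_API_TOKEN" \
    "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel/$tid")
  if printf "%s" "$resp" | grep -q "\"success\":true"; then
    echo OK
  else
    name=$(awk -F"\t" -v id="$tid" "\$1==id {print \$2; exit}" "$NAME_MAP")
    echo FAIL
    echo "FAIL $name $tid" >> "$FAIL_LOG"
  fi
' _ {} < "$PLAN" > "$RESULT"

DELETED=$(grep -c '^OK$' "$RESULT" || true)
FAILED=$(grep -c '^FAIL$' "$RESULT" || true)
echo "DELETED=$DELETED FAILED=$FAILED"   # expect DELETED=26 FAILED=4
if [ "$DELETED" -eq 26 ] && [ "$FAILED" -eq 4 ]; then
  echo "TEST PASS"
else
  echo "TEST FAIL"; exit 1
fi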