Merge branch 'staging' into auto-sync/main-35cb6ba0

This commit is contained in:
Hongming Wang 2026-05-02 03:39:00 -07:00 committed by GitHub
commit 8083fd8b7d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 134 additions and 69 deletions

View File

@@ -47,10 +47,22 @@ jobs:
sweep:
name: Sweep CF tunnels
runs-on: ubuntu-latest
# 5 min surfaces hangs (CF API stall, slow pagination on busy
# accounts). Realistic worst case is ~3 min: 2 CP curls + N CF
# list pages + N×CF-DELETE, each capped at 10-15s by curl -m.
timeout-minutes: 5
# 30 min cap. Was 5 min on the theory that the only thing that
# could take >5min is a CF-API hang — but on 2026-05-02 a backlog
# of 672 stale tunnels accumulated (large staging E2E run + delayed
# sweep) and the serial `curl -X DELETE` loop (~0.7s/tunnel) needed
# ~7-8min to drain. The 5-min cap killed the run mid-sweep
# (cancelled at 424/672, see run 25248788312); a manual rerun
# finished the remainder fine.
#
# The fix is two-part: parallelize the delete loop (8-way xargs in
# the script — see scripts/ops/sweep-cf-tunnels.sh), AND raise the
# cap so a one-off backlog doesn't trip a hangs-detector that
# turned out to be a real-job-too-slow detector. With 8-way
# parallelism, 600+ tunnels drains in ~60s; 30 min is generous
# headroom for actual hangs to still surface (and is in line with
# the sweep-cf-orphans companion job).
timeout-minutes: 30
env:
CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}

View File

@@ -32,15 +32,11 @@ interface A2AFileRef {
bytes?: string;
size?: number;
}
// A2A Part — outbound matches the v0 Pydantic discriminated-union
// shape that a2a-sdk's JSON-RPC layer validates against (TextPart |
// FilePart | DataPart). The v1 flat-protobuf shape `{url, filename,
// mediaType}` is internal SDK serialization only; sending it on the
// wire fails Pydantic validation with `TextPart.text required,
// FilePart.file required, DataPart.data required` and never reaches
// the executor. Inbound also tolerates the v1 shape via
// message-parser.ts since the agent itself may serialize as v1 in
// some downstream tools.
// Outbound shape matches a2a-sdk's JSON-RPC `SendMessageRequest`
// Pydantic union (TextPart | FilePart | DataPart). The flat
// protobuf shape `{url, filename, mediaType}` is rejected at the
// request boundary with `Field required` errors — keep this
// outbound shape unless a2a-sdk migrates the JSON-RPC schema.
interface A2APart {
kind: string;
text?: string;
@@ -511,18 +507,7 @@ function MyChatPanel({ workspaceId, data }: Props) {
// A2A parts: text part (if any) + file parts (per attachment). The
// agent sees both in a single turn, matching the A2A spec shape.
//
// File parts use the v0 discriminated-union shape `{kind:"file",
// file:{...}}` because that's what a2a-sdk's JSON-RPC layer
// validates against (`SendMessageRequest.params.message.parts[]`
// → `TextPart | FilePart | DataPart` Pydantic union). Sending the
// v1 flat shape `{url, filename, mediaType}` returns
// `Invalid Request — TextPart.text required, FilePart.file
// required, DataPart.data required` and the message never
// reaches the executor. v1 protobuf is internal serialization
// only; the wire shape stays v0 until the SDK migrates the
// JSON-RPC schema. Text parts keep `{kind:"text", text}` for the
// same reason.
// Wire shape is v0 — see A2APart definition above.
const parts: A2APart[] = [];
if (text) parts.push({ kind: "text", text });
for (const att of uploaded) {

View File

@@ -41,15 +41,15 @@ export interface ParsedFilePart {
/** Extract file parts from an A2A response. Walks parts[] + artifacts[].
*
* Tolerates both A2A protocol generations:
* - v0 (Pydantic): `{ kind: "file", file: { name, mimeType, uri } }`
* - v1 (protobuf): `{ url, filename, mediaType }` flat, no `kind`
* and no nested `file` object (the v1 Part's content oneof is
* `{text, raw, url, data}`; file metadata sits at top level).
* Hot path: v0 Pydantic shape `{ kind: "file", file: { name, mimeType,
* uri } }` — what every current workspace runtime emits.
*
* Without v1 tolerance, agents that emit the v1 shape (every workspace
 * runtime since the SDK migration) silently drop file parts in chat —
 * the agent says "I sent the file" but the user never sees the chip.
* Defensive secondary path: v1 protobuf shape `{ url, filename,
* mediaType }` — flat, no `kind`, no nested `file`. Not currently
* observed on the wire (a2a-sdk's JSON-RPC layer still validates
* against v0), but kept so a future SDK release that flips the wire
* shape, or a third-party agent that round-trips through protobuf
* serialization, doesn't silently lose file chips.
*
 * We only surface parts that carry a URL — inline bytes would require
* a different renderer (data URL) and are out of scope for MVP. Names

View File

@@ -102,7 +102,22 @@ log "Fetching Cloudflare tunnels..."
# `python3: Argument list too long`. Disk-buffering also makes the
# accumulator O(n) instead of O(n^2).
PAGES_DIR=$(mktemp -d -t cf-tunnels-XXXXXX)
trap 'rm -rf "$PAGES_DIR"' EXIT
# Single cleanup() covering all tempfiles created downstream
# ($DELETE_PLAN, $NAME_MAP, $FAIL_LOG, $RESULT_LOG). One trap call so a
# later `trap '...' EXIT` doesn't silently overwrite an earlier one.
DELETE_PLAN=""
NAME_MAP=""
FAIL_LOG=""
RESULT_LOG=""
cleanup() {
rm -rf "$PAGES_DIR"
[ -n "$DELETE_PLAN" ] && rm -f "$DELETE_PLAN"
[ -n "$NAME_MAP" ] && rm -f "$NAME_MAP"
[ -n "$FAIL_LOG" ] && rm -f "$FAIL_LOG"
[ -n "$RESULT_LOG" ] && rm -f "$RESULT_LOG"
return 0
}
trap cleanup EXIT
PAGE=1
while :; do
page_file="$PAGES_DIR/page-$(printf '%05d' "$PAGE").json"
@@ -241,27 +256,75 @@ for l in sys.stdin:
fi
# --- Execute deletes -------------------------------------------------------
#
# Parallel delete loop. Was a serial `curl -X DELETE` while-loop;
# at ~0.7s/tunnel that meant 672 stale tunnels needed ~7-8 min, which
# tripped the workflow's 5-min timeout-minutes (run 25248788312,
# cancelled at 424/672). Fan out to $SWEEP_CONCURRENCY workers via
# xargs so a 600+ backlog drains in ~60s.
#
# Design notes:
# - Materialize the (id, name) plan to a tempfile for stdin'ing into
# xargs. xargs `-a FILE` is GNU-only; piping/`<` is portable to
# macOS/BSD xargs (matters for local testing).
# - Pass ONLY the id on argv. xargs tokenizes on whitespace by
# default; tab-separating id+name on argv risks mangling. We keep
# the name in a side-channel id→name map ($NAME_MAP) for failure
# log readability, and the worker also writes failure detail to
# $FAIL_LOG (`FAIL <name> <id>`) for grep-ability.
# - Workers print exactly `OK` or `FAIL` on stdout (one line per
# invocation); we tally with `grep -c '^OK$' / '^FAIL$'`.
CONCURRENCY="${SWEEP_CONCURRENCY:-8}"
DELETE_PLAN=$(mktemp -t cf-tunnels-plan-XXXXXX)
NAME_MAP=$(mktemp -t cf-tunnels-names-XXXXXX)
FAIL_LOG=$(mktemp -t cf-tunnels-fail-XXXXXX)
RESULT_LOG=$(mktemp -t cf-tunnels-result-XXXXXX)
# Build delete plan (just ids, one per line) and the side-channel
# id→name map (tab-separated).
echo "$DECISIONS" | python3 -c '
import json, os, sys
plan_path = sys.argv[1]
map_path = sys.argv[2]
with open(plan_path, "w") as plan, open(map_path, "w") as nmap:
for line in sys.stdin:
d = json.loads(line)
if d.get("action") != "delete":
continue
tid = d["id"]
name = d.get("name", "")
plan.write(tid + "\n")
nmap.write(tid + "\t" + name + "\n")
' "$DELETE_PLAN" "$NAME_MAP"
log ""
log "Executing $DELETE_COUNT deletions..."
DELETED=0
FAILED=0
while IFS= read -r line; do
action=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['action'])")
[ "$action" = "delete" ] || continue
tid=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['id'])")
name=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['name'])")
if curl -sS -m 10 -X DELETE \
-H "Authorization: Bearer $CF_API_TOKEN" \
"https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel/$tid" \
| grep -q '"success":true'; then
DELETED=$((DELETED+1))
log "Executing $DELETE_COUNT deletions ($CONCURRENCY-way parallel)..."
export CF_API_TOKEN CF_ACCOUNT_ID NAME_MAP FAIL_LOG
# shellcheck disable=SC2016
xargs -P "$CONCURRENCY" -L 1 -I {} bash -c '
tid="$1"
resp=$(curl -sS -m 10 -X DELETE \
-H "Authorization: Bearer $CF_API_TOKEN" \
"https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel/$tid")
if printf "%s" "$resp" | grep -q "\"success\":true"; then
echo OK
else
FAILED=$((FAILED+1))
log " FAILED: $name ($tid)"
name=$(awk -F"\t" -v id="$tid" "\$1==id {print \$2; exit}" "$NAME_MAP")
echo FAIL
echo "FAIL $name $tid" >> "$FAIL_LOG"
fi
done <<< "$DECISIONS"
' _ {} < "$DELETE_PLAN" > "$RESULT_LOG"
DELETED=$(grep -c '^OK$' "$RESULT_LOG" || true)
FAILED=$(grep -c '^FAIL$' "$RESULT_LOG" || true)
log ""
log "Done. deleted=$DELETED failed=$FAILED"
if [ "$FAILED" -ne 0 ]; then
log "Failure detail (first 20):"
head -20 "$FAIL_LOG" | while IFS= read -r fl; do log " $fl"; done
fi
[ "$FAILED" -eq 0 ]

View File

@@ -844,23 +844,27 @@ def resolve_attachment_uri(uri: str) -> str | None:
def extract_attached_files(message: Any) -> list[dict[str, str]]:
"""Pull ``{name, mime_type, path}`` dicts out of an A2A message.
Tolerates three Part shapes seen in the wild:
Tolerates three Part shapes:
1. a2a-sdk v0 Pydantic RootModel ``part.root.kind == 'file'`` with
``part.root.file.{uri,name,mimeType}``.
2. a2a-sdk v0 flatter shape ``part.kind == 'file'`` with
``part.file.{uri,name,mimeType}`` (some hand-built callers).
3. a2a-sdk v1 protobuf ``part.url`` non-empty with
``part.filename`` + ``part.media_type``. The v1 ``Part`` proto
has no ``kind`` field at all (the discriminator is now a oneof
``content`` of {text, raw, url, data}). Without this branch a v1
 file part — which is what a v1 server constructs from any caller
 that JSON-encodes the v1 shape — silently parses to an empty
 Part on the v0→v1 transition because protobuf json_format with
``ignore_unknown_fields=True`` drops the legacy ``kind`` and
``file`` keys, surfacing as the user-visible
"Error: message contained no text content" on image-only chats
(2026-05-01 hongming incident).
``part.root.file.{uri,name,mimeType}``. The hot path; this is
what every current caller produces (canvas chat, A2A peer
delegations, agent self-attached files).
2. v0 flatter shape ``part.kind == 'file'`` with
``part.file.{uri,name,mimeType}``. Some hand-built callers
(older test fixtures, third-party clients) emit this.
3. v1 protobuf ``part.url`` non-empty with ``part.filename`` +
``part.media_type``. **Defensive future-proofing only.** The
v1 ``Part`` proto exists in a2a-sdk's ``a2a.types.a2a_pb2`` but
a2a-sdk's JSON-RPC layer still validates inbound requests
against the v0 Pydantic discriminated union (TextPart |
FilePart | DataPart), so a v1 wire shape is rejected at the
 request boundary today — this branch is unreachable on the
JSON-RPC ingress path. Kept so a future SDK release that
flips the JSON-RPC schema doesn't silently regress this
helper, and so non-conformant in-process callers (e.g. a
template that constructs a Part directly from protobuf) get
handled correctly.
 Non-file parts and files with unresolvable URIs are skipped — the
caller sees an empty list rather than a mix of valid and broken
@@ -884,10 +888,11 @@ def extract_attached_files(message: Any) -> list[dict[str, str]]:
name = getattr(f, "name", "") or ""
mime = getattr(f, "mimeType", None) or getattr(f, "mime_type", None) or ""
else:
# v1 protobuf Part has no `kind`; detect by a non-empty
# `url` (the file/url-of-bytes oneof slot). Fall back to
# `media_type` then `mimeType` for the camelCase Pydantic
# variant some adapters still hand us.
# Defensive v1 path (see docstring): v1 Part has no `kind`,
# detect by a non-empty `url` (the file/url-of-bytes oneof
# slot). Fall back from snake_case `media_type` to
# camelCase `mediaType` for callers that hand us the
# Pydantic-style attribute name.
v1_url = getattr(part, "url", "") or ""
if not v1_url:
continue