Merge pull request 'fix(a2a): avoid false failure on busy queue fallback' (#1751) from fix/codex-scheduled-a2a-timeout into main
ci-arm64-advisory / fast-checks (push) Waiting to run
Lint shellcheck (arm64 pilot) / shellcheck-arm64 (pilot) (push) Successful in 10s
publish-workspace-server-image / build-and-push (push) Successful in 3m15s
CI / Detect changes (push) Successful in 9s
Block internal-flavored paths / Block forbidden paths (push) Successful in 9s
CI / Python Lint & Test (push) Successful in 5s
E2E API Smoke Test / detect-changes (push) Successful in 7s
E2E Staging Canvas (Playwright) / detect-changes (push) Successful in 8s
E2E Chat / detect-changes (push) Successful in 9s
Handlers Postgres Integration / detect-changes (push) Successful in 4s
Harness Replays / detect-changes (push) Successful in 9s
Lint forbidden tenant-env keys / Scan workspace_secrets writers for forbidden env keys (push) Successful in 4s
Lint no tenant GITEA or GITHUB token write / Scan for repo-host token write into tenant workspace surface (push) Successful in 3s
Secret scan / Scan diff for credential-shaped strings (push) Successful in 3s
Sweep stale e2e-* orgs (staging) / Sweep e2e orgs (push) Successful in 3s
E2E Staging SaaS (full lifecycle) / pr-validate (push) Successful in 38s
E2E Staging SaaS (full lifecycle) / E2E Staging SaaS (push) Successful in 5m45s
CI / Canvas (Next.js) (push) Successful in 8s
CI / Shellcheck (E2E scripts) (push) Successful in 3s
E2E API Smoke Test / E2E API Smoke Test (push) Successful in 1m38s
E2E Staging Canvas (Playwright) / Canvas tabs E2E (push) Successful in 4s
Handlers Postgres Integration / Handlers Postgres Integration (push) Successful in 1m56s
CI / Platform (Go) (push) Successful in 5m17s
E2E Chat / E2E Chat (push) Failing after 3m30s
CI / all-required (push) Successful in 22m26s
Harness Replays / Harness Replays (push) Successful in 2s
publish-workspace-server-image / Production auto-deploy (push) Successful in 24m55s
CI / Canvas Deploy Reminder (push) Successful in 3s
gate-check-v3 / gate-check (push) Successful in 29s
main-red-watchdog / watchdog (push) Successful in 2m20s
Sweep stale Cloudflare DNS records / Sweep CF orphans (push) Successful in 9s
ci-required-drift / drift (push) Successful in 1m11s
Sweep stale AWS Secrets Manager secrets / Sweep AWS Secrets Manager (push) Successful in 10s
Sweep stale Cloudflare Tunnels / Sweep CF tunnels (push) Successful in 6s
Staging SaaS smoke (every 30 min) / Staging SaaS smoke (push) Successful in 5m7s
Continuous synthetic E2E (staging) / Synthetic E2E against staging (push) Successful in 6m17s

Merge PR #1751: fix(a2a): avoid false failure on busy queue fallback
This commit was merged in pull request #1751.
This commit is contained in:
2026-05-23 23:29:00 +00:00
3 changed files with 100 additions and 11 deletions
@@ -111,12 +111,13 @@ const maxProxyResponseBody = 10 << 20
// a generic 502 page to canvas. 10s is well above realistic intra-region
// latencies and well below CF's edge timeout.
//
// 3. Transport.ResponseHeaderTimeout — 180s default. From request-body-end
// 3. Transport.ResponseHeaderTimeout — 5min default. From request-body-end
// to response-headers-start. Configurable via
// A2A_PROXY_RESPONSE_HEADER_TIMEOUT (envx.Duration). Covers cold-start
// first-byte (30-60s OAuth flow above) with enough room for Opus agent
// turns (big context + internal delegate_task round-trips routinely exceed
// the old 60s ceiling). Body streaming after headers is governed by the
// turns and Codex scheduled tasks (big context + internal delegate_task
// round-trips routinely exceed the old 60s/180s ceilings). Body streaming
// after headers is governed by the
// per-request context deadline, NOT this timeout — so multi-minute agent
// responses still work fine.
//
@@ -131,7 +132,7 @@ var a2aClient = &http.Client{
Timeout: 10 * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,
ResponseHeaderTimeout: envx.Duration("A2A_PROXY_RESPONSE_HEADER_TIMEOUT", 180*time.Second),
ResponseHeaderTimeout: envx.Duration("A2A_PROXY_RESPONSE_HEADER_TIMEOUT", 5*time.Minute),
TLSHandshakeTimeout: 10 * time.Second,
// MaxIdleConns / IdleConnTimeout: stdlib defaults are fine; agent
// fan-in is bounded by the platform's broadcaster fan-out, not by
@@ -28,8 +28,8 @@ type proxyDispatchBuildError struct{ err error }
func (e *proxyDispatchBuildError) Error() string { return e.err.Error() }
// handleA2ADispatchError translates a forward-call failure into a proxyA2AError,
// runs the reactive container-health check, and (when `logActivity` is true)
// schedules a detached LogActivity goroutine for the failed attempt.
// runs the reactive container-health check, and records the outcome. Busy
// targets that are successfully queued are logged as queued, not failed.
func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string, err error, durationMs int, logActivity bool) (int, []byte, *proxyA2AError) {
// Build-time failure (couldn't even create the http.Request) — return
// a 500 without the reactive-health / busy-retry paths.
@@ -45,10 +45,10 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
containerDead := h.maybeMarkContainerDead(ctx, workspaceID)
if logActivity {
h.logA2AFailure(ctx, workspaceID, callerID, body, a2aMethod, err, durationMs)
}
if containerDead {
if logActivity {
h.logA2AFailure(ctx, workspaceID, callerID, body, a2aMethod, err, durationMs)
}
return 0, nil, &proxyA2AError{
Status: http.StatusServiceUnavailable,
Response: gin.H{"error": "workspace agent unreachable — container restart triggered", "restarting": true},
@@ -108,6 +108,9 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
ctx, workspaceID, callerID, PriorityTask, body, a2aMethod, idempotencyKey, expiresAt,
); qerr == nil {
log.Printf("ProxyA2A: target %s busy — enqueued as %s (depth=%d)", workspaceID, qid, depth)
if logActivity {
h.logA2ABusyQueued(ctx, workspaceID, callerID, body, a2aMethod, durationMs)
}
respBody, _ := json.Marshal(gin.H{
"queued": true,
"queue_id": qid,
@@ -121,6 +124,9 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
// make delegation silently disappear.
log.Printf("ProxyA2A: enqueue for %s failed (%v) — falling back to 503", workspaceID, qerr)
}
if logActivity {
h.logA2AFailure(ctx, workspaceID, callerID, body, a2aMethod, err, durationMs)
}
return 0, nil, &proxyA2AError{
Status: http.StatusServiceUnavailable,
Headers: map[string]string{"Retry-After": strconv.Itoa(busyRetryAfterSeconds)},
@@ -131,6 +137,9 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
},
}
}
if logActivity {
h.logA2AFailure(ctx, workspaceID, callerID, body, a2aMethod, err, durationMs)
}
return 0, nil, &proxyA2AError{
Status: http.StatusBadGateway,
Response: gin.H{"error": "failed to reach workspace agent"},
@@ -311,6 +320,33 @@ func (h *WorkspaceHandler) logA2AFailure(ctx context.Context, workspaceID, calle
})
}
// logA2ABusyQueued records that a push attempt reached a live but busy
// workspace and was durably queued for heartbeat drain.
func (h *WorkspaceHandler) logA2ABusyQueued(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string, durationMs int) {
var wsName string
db.DB.QueryRowContext(ctx, `SELECT name FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsName)
if wsName == "" {
wsName = workspaceID
}
summary := a2aMethod + " → " + wsName + " (queued: target busy)"
parent := ctx
h.goAsync(func() {
logCtx, cancel := context.WithTimeout(context.WithoutCancel(parent), 30*time.Second)
defer cancel()
LogActivity(logCtx, h.broadcaster, ActivityParams{
WorkspaceID: workspaceID,
ActivityType: "a2a_receive",
SourceID: nilIfEmpty(callerID),
TargetID: &workspaceID,
Method: &a2aMethod,
Summary: &summary,
RequestBody: json.RawMessage(body),
DurationMs: &durationMs,
Status: "ok",
})
})
}
// logA2ASuccess records a successful A2A round-trip and (for canvas-initiated
// 2xx/3xx responses) broadcasts an A2A_RESPONSE event so the frontend can
// receive the reply without polling.
@@ -1779,6 +1779,58 @@ func TestHandleA2ADispatchError_ContextDeadline(t *testing.T) {
}
}
func TestHandleA2ADispatchError_BusyEnqueueLogsQueuedNotFailure(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
waitForHandlerAsyncBeforeDBCleanup(t, handler)
mock.ExpectQuery(`INSERT INTO a2a_queue`).
WithArgs("ws-busy", nil, PriorityTask, "{}", "message/send", nil, nil).
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("11111111-1111-1111-1111-111111111111"))
mock.ExpectQuery(`SELECT COUNT\(\*\) FROM a2a_queue`).
WithArgs("ws-busy").
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
mock.ExpectQuery(`SELECT name FROM workspaces WHERE id =`).
WithArgs("ws-busy").
WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("Busy Target"))
mock.ExpectExec("INSERT INTO activity_logs").
WithArgs(
"ws-busy",
"a2a_receive",
nil,
sqlmock.AnyArg(),
sqlmock.AnyArg(),
sqlmock.AnyArg(),
sqlmock.AnyArg(),
nil,
nil,
sqlmock.AnyArg(),
"ok",
nil,
).
WillReturnResult(sqlmock.NewResult(0, 1))
status, body, perr := handler.handleA2ADispatchError(
context.Background(), "ws-busy", "", []byte("{}"), "message/send",
context.DeadlineExceeded, 180002, true,
)
if perr != nil {
t.Fatalf("expected busy enqueue success, got proxy error: %+v", perr)
}
if status != http.StatusAccepted {
t.Fatalf("got status %d, want 202", status)
}
if !bytes.Contains(body, []byte(`"queued":true`)) {
t.Fatalf("expected queued response body, got %s", string(body))
}
time.Sleep(80 * time.Millisecond)
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations; busy enqueue must log status=ok, not error: %v", err)
}
}
func TestHandleA2ADispatchError_BuildError(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
@@ -2354,7 +2406,7 @@ func TestLookupDeliveryMode_ContextCanceled_FailsClosed(t *testing.T) {
// ==================== a2aClient ResponseHeaderTimeout config ====================
func TestA2AClientResponseHeaderTimeout(t *testing.T) {
const defaultTimeout = 180 * time.Second
const defaultTimeout = 5 * time.Minute
// Default (unset env) — a2aClient was initialised at package load time.
if a2aClient.Transport.(*http.Transport).ResponseHeaderTimeout != defaultTimeout {
@@ -2378,7 +2430,7 @@ func TestA2AClientResponseHeaderTimeout(t *testing.T) {
t.Run("invalid A2A_PROXY_RESPONSE_HEADER_TIMEOUT falls back to default", func(t *testing.T) {
t.Setenv("A2A_PROXY_RESPONSE_HEADER_TIMEOUT", "not-a-duration")
// Simulate what envx.Duration does with an invalid value.
var fallback = 180 * time.Second
var fallback = 5 * time.Minute
override := fallback
if v := os.Getenv("A2A_PROXY_RESPONSE_HEADER_TIMEOUT"); v != "" {
if d, err := time.ParseDuration(v); err == nil && d > 0 {